557 files changed, 123976 insertions, 29762 deletions
diff --git a/client/mysql_upgrade.c b/client/mysql_upgrade.c
index d7f311fcb69..5fe5445bc05 100644
--- a/client/mysql_upgrade.c
+++ b/client/mysql_upgrade.c
@@ -800,14 +800,32 @@ static void print_line(char* line)
 static int run_sql_fix_privilege_tables(void)
 {
   int found_real_errors= 0;
+  const char **query_ptr;
+  DYNAMIC_STRING ds_script;
   DYNAMIC_STRING ds_result;
   DBUG_ENTER("run_sql_fix_privilege_tables");
 
+  if (init_dynamic_string(&ds_script, "", 65536, 1024))
+    die("Out of memory");
+
   if (init_dynamic_string(&ds_result, "", 512, 512))
     die("Out of memory");
 
   verbose("Phase 3/3: Running 'mysql_fix_privilege_tables'...");
-  run_query(mysql_fix_privilege_tables,
+  /*
+    Individual queries can not be executed independently by invoking
+    a forked mysql client, because the script uses session variables
+    and prepared statements.
+  */
+  for ( query_ptr= &mysql_fix_privilege_tables[0];
+        *query_ptr != NULL;
+        query_ptr++
+      )
+  {
+    dynstr_append(&ds_script, *query_ptr);
+  }
+
+  run_query(ds_script.str,
             &ds_result, /* Collect result */
             TRUE);
 
@@ -835,6 +853,7 @@ static int run_sql_fix_privilege_tables(void)
   }
 
   dynstr_free(&ds_result);
+  dynstr_free(&ds_script);
   DBUG_RETURN(found_real_errors);
 }
 
diff --git a/configure.cmake b/configure.cmake
index ff87d632055..96918798807 100644
--- a/configure.cmake
+++ b/configure.cmake
@@ -67,14 +67,7 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
   # MySQL "canonical" GCC flags. At least -fno-rtti flag affects
   # ABI and cannot be simply removed. 
   SET(CMAKE_CXX_FLAGS 
-    "${CMAKE_CXX_FLAGS} -fno-implicit-templates -fno-exceptions -fno-rtti")
-  IF(CMAKE_CXX_FLAGS)
-    STRING(REGEX MATCH "fno-implicit-templates" NO_IMPLICIT_TEMPLATES
-      ${CMAKE_CXX_FLAGS})
-    IF (NO_IMPLICIT_TEMPLATES)
-      SET(HAVE_EXPLICIT_TEMPLATE_INSTANTIATION TRUE)
-    ENDIF()
-  ENDIF()
+    "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-rtti")
 
   IF (CMAKE_EXE_LINKER_FLAGS MATCHES " -static " 
      OR CMAKE_EXE_LINKER_FLAGS MATCHES " -static$")
diff --git a/include/my_global.h b/include/my_global.h
index a5fa57dbfe3..6fcaa258f21 100644
--- a/include/my_global.h
+++ b/include/my_global.h
@@ -18,6 +18,7 @@
 
 #ifndef _global_h
 #define _global_h
+#define MY_GLOBAL_INCLUDED
 
 /* Client library users on Windows need this macro defined here. */
 #if !defined(__WIN__) && defined(_WIN32)
@@ -1438,6 +1439,8 @@ static inline char *dlerror(void)
 #define max(a, b)	((a) > (b) ? (a) : (b))
 #define min(a, b)	((a) < (b) ? (a) : (b))
 #endif  
+#define MY_MAX(a, b)	((a) > (b) ? (a) : (b))
+#define MY_MIN(a, b)	((a) < (b) ? (a) : (b))
 
 #define CMP_NUM(a,b)    (((a) < (b)) ? -1 : ((a) == (b)) ? 0 : 1)
 
diff --git a/include/my_md5.h b/include/my_md5.h
index 782bef8a27a..4f90541067b 100644
--- a/include/my_md5.h
+++ b/include/my_md5.h
@@ -22,6 +22,36 @@
  * $FreeBSD: src/contrib/cvs/lib/md5.h,v 1.2 1999/12/11 15:10:02 peter Exp $
  */
 
+#if defined(HAVE_YASSL) || defined(HAVE_OPENSSL)
+/*
+  Use MD5 implementation provided by the SSL libraries.
+*/
+
+#if defined(HAVE_YASSL)
+
+C_MODE_START
+
+void my_md5_hash(char *digest, const char *buf, int len);
+
+C_MODE_END
+
+#else /* HAVE_YASSL */
+
+#include <openssl/md5.h>
+
+#define MY_MD5_HASH(digest, buf, len) \
+do { \
+  MD5_CTX ctx; \
+  MD5_Init (&ctx); \
+  MD5_Update (&ctx, buf, len); \
+  MD5_Final (digest, &ctx); \
+} while (0)
+
+#endif /* HAVE_YASSL */
+
+#else /* HAVE_YASSL || HAVE_OPENSSL */
+/* Fallback to the MySQL's implementation. */
+
 /* Unlike previous versions of this code, uint32 need not be exactly
    32 bits, merely 32 bits or more.  Choosing a data type which is 32
    bits instead of 64 is not important; speed is considerably more
@@ -35,18 +65,15 @@ typedef struct {
   unsigned char in[64];
 } my_MD5Context;
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+C_MODE_START
+
 void my_MD5Init (my_MD5Context *context);
 void my_MD5Update (my_MD5Context *context,
                    unsigned char const *buf, unsigned len);
 void my_MD5Final (unsigned char digest[16],
                   my_MD5Context *context);
 
-#ifdef __cplusplus
-}
-#endif
+C_MODE_END
 
 #define MY_MD5_HASH(digest,buf,len) \
 do { \
@@ -56,4 +83,12 @@ do { \
   my_MD5Final (digest, &ctx); \
 } while (0)
 
-#endif /* MY_MD__INCLUDED */
+#endif /* defined(HAVE_YASSL) || defined(HAVE_OPENSSL) */
+
+C_MODE_START
+
+void compute_md5_hash(char *digest, const char *buf, int len);
+
+C_MODE_END
+
+#endif /* MY_MD5_INCLUDED */
diff --git a/include/my_pthread.h b/include/my_pthread.h
index 21c85c633d6..4f484265583 100644
--- a/include/my_pthread.h
+++ b/include/my_pthread.h
@@ -498,13 +498,22 @@ void safe_mutex_free_deadlock_data(safe_mutex_t *mp);
           DBUG_ASSERT(! (mp)->count || \
                       ! pthread_equal(pthread_self(), (mp)->thread))
 #define safe_mutex_setflags(mp, F)      do { (mp)->create_flags|= (F); } while (0)
+#define my_cond_timedwait(A,B,C) safe_cond_timedwait((A),(B),(C),__FILE__,__LINE__)
+#define my_cond_wait(A,B) safe_cond_wait((A), (B), __FILE__, __LINE__)
 #else
-#define my_pthread_mutex_init(A,B,C,D) pthread_mutex_init((A),(B))
-#define safe_mutex_assert_owner(mp)    do {} while(0)
-#define safe_mutex_assert_not_owner(mp) do {} while(0)
-#define safe_mutex_free_deadlock_data(mp) do {} while(0)
+
+#define safe_mutex_assert_owner(mp) do {} while (0)
+#define safe_mutex_assert_not_owner(mp) do {} while (0)
 #define safe_mutex_setflags(mp, F) do {} while (0)
-#endif /* SAFE_MUTEX */
+
+#if defined(MY_PTHREAD_FASTMUTEX)
+#define my_cond_timedwait(A,B,C) pthread_cond_timedwait((A), &(B)->mutex, (C))
+#define my_cond_wait(A,B) pthread_cond_wait((A), &(B)->mutex)
+#else
+#define my_cond_timedwait(A,B,C) pthread_cond_timedwait((A),(B),(C))
+#define my_cond_wait(A,B) pthread_cond_wait((A), (B))
+#endif /* MY_PTHREAD_FASTMUTEX */
+#endif /* !SAFE_MUTEX */
 
 #if defined(MY_PTHREAD_FASTMUTEX) && !defined(SAFE_MUTEX)
 typedef struct st_my_pthread_fastmutex_t
diff --git a/include/my_sys.h b/include/my_sys.h
index e10cedfb654..94c57a93e73 100644
--- a/include/my_sys.h
+++ b/include/my_sys.h
@@ -252,7 +252,9 @@ extern const char *my_defaults_extra_file;
 extern const char *my_defaults_group_suffix;
 extern const char *my_defaults_file;
 
+#ifndef timed_mutexes
 extern my_bool timed_mutexes;
+#endif
 
 enum loglevel {
    ERROR_LEVEL,
@@ -772,16 +774,17 @@ extern my_bool init_dynamic_array2(DYNAMIC_ARRAY *array, uint element_size,
 /* init_dynamic_array() function is deprecated */
 extern my_bool init_dynamic_array(DYNAMIC_ARRAY *array, uint element_size,
                                   uint init_alloc, uint alloc_increment);
-extern my_bool insert_dynamic(DYNAMIC_ARRAY *array, const uchar * element);
-extern uchar *alloc_dynamic(DYNAMIC_ARRAY *array);
-extern uchar *pop_dynamic(DYNAMIC_ARRAY*);
-extern my_bool set_dynamic(DYNAMIC_ARRAY *array,uchar * element,uint array_index);
+extern my_bool insert_dynamic(DYNAMIC_ARRAY *array, const void* element);
+extern void *alloc_dynamic(DYNAMIC_ARRAY *array);
+extern void *pop_dynamic(DYNAMIC_ARRAY*);
+extern my_bool set_dynamic(DYNAMIC_ARRAY *array, const void *element,
+                           uint array_index);
 extern my_bool allocate_dynamic(DYNAMIC_ARRAY *array, uint max_elements);
-extern void get_dynamic(DYNAMIC_ARRAY *array,uchar * element,uint array_index);
+extern void get_dynamic(DYNAMIC_ARRAY *array, void *element, uint array_index);
 extern void delete_dynamic(DYNAMIC_ARRAY *array);
 extern void delete_dynamic_element(DYNAMIC_ARRAY *array, uint array_index);
 extern void freeze_size(DYNAMIC_ARRAY *array);
-extern int  get_index_dynamic(DYNAMIC_ARRAY *array, uchar * element);
+extern int  get_index_dynamic(DYNAMIC_ARRAY *array, void *element);
 #define dynamic_array_ptr(array,array_index) ((array)->buffer+(array_index)*(array)->size_of_element)
 #define dynamic_element(array,array_index,type) ((type)((array)->buffer) +(array_index))
 #define push_dynamic(A,B) insert_dynamic((A),(B))
diff --git a/include/my_valgrind.h b/include/my_valgrind.h
index 31651f6c3ed..0aeaef68b50 100644
--- a/include/my_valgrind.h
+++ b/include/my_valgrind.h
@@ -37,9 +37,9 @@
 #endif /* HAVE_VALGRIND */
 
 #ifndef DBUG_OFF
-#define TRASH_FILL(A,B,C) do { bfill(A, B, C); MEM_UNDEFINED(A, B); } while (0)
+#define TRASH_FILL(A,B,C) do { const size_t trash_tmp= (B) ; bfill(A, trash_tmp, C); MEM_UNDEFINED(A, trash_tmp); } while (0)
 #else
-#define TRASH_FILL(A,B,C) do{ MEM_CHECK_ADDRESSABLE(A,B);MEM_UNDEFINED(A,B);} while (0)
+#define TRASH_FILL(A,B,C) do{ const size_t trash_tmp= (B) ; MEM_CHECK_ADDRESSABLE(A,trash_tmp);MEM_UNDEFINED(A,trash_tmp);} while (0)
 #endif
 #define TRASH_ALLOC(A,B) TRASH_FILL(A,B,0xA5)
 #define TRASH_FREE(A,B) TRASH_FILL(A,B,0x8F)
diff --git a/include/mysql/psi/mysql_file.h b/include/mysql/psi/mysql_file.h
index 6fc6689c47d..816ac713631 100644
--- a/include/mysql/psi/mysql_file.h
+++ b/include/mysql/psi/mysql_file.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -50,11 +50,18 @@
 */
 
 /**
+  @def mysql_file_register(P1, P2, P3)
+  File registration.
+*/
+#define mysql_file_register(P1, P2, P3) \
+  inline_mysql_file_register(P1, P2, P3)
+
+/**
   @def mysql_file_fgets(P1, P2, F)
   Instrumented fgets.
   @c mysql_file_fgets is a replacement for @c fgets.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_fgets(P1, P2, F) \
     inline_mysql_file_fgets(__FILE__, __LINE__, P1, P2, F)
 #else
@@ -67,7 +74,7 @@
   Instrumented fgetc.
   @c mysql_file_fgetc is a replacement for @c fgetc.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_fgetc(F) inline_mysql_file_fgetc(__FILE__, __LINE__, F)
 #else
   #define mysql_file_fgetc(F) inline_mysql_file_fgetc(F)
@@ -78,7 +85,7 @@
   Instrumented fputs.
   @c mysql_file_fputs is a replacement for @c fputs.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_fputs(P1, F) \
     inline_mysql_file_fputs(__FILE__, __LINE__, P1, F)
 #else
@@ -91,7 +98,7 @@
   Instrumented fputc.
   @c mysql_file_fputc is a replacement for @c fputc.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_fputc(P1, F) \
     inline_mysql_file_fputc(__FILE__, __LINE__, P1, F)
 #else
@@ -111,7 +118,7 @@
   Instrumented vfprintf.
   @c mysql_file_vfprintf is a replacement for @c vfprintf.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_vfprintf(F, P1, P2) \
     inline_mysql_file_vfprintf(__FILE__, __LINE__, F, P1, P2)
 #else
@@ -124,7 +131,7 @@
   Instrumented fflush.
   @c mysql_file_fflush is a replacement for @c fflush.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_fflush(F) \
     inline_mysql_file_fflush(__FILE__, __LINE__, F)
 #else
@@ -144,7 +151,7 @@
   Instrumented fstat.
   @c mysql_file_fstat is a replacement for @c my_fstat.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_fstat(FN, S, FL) \
     inline_mysql_file_fstat(__FILE__, __LINE__, FN, S, FL)
 #else
@@ -157,7 +164,7 @@
   Instrumented stat.
   @c mysql_file_stat is a replacement for @c my_stat.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_stat(K, FN, S, FL) \
     inline_mysql_file_stat(K, __FILE__, __LINE__, FN, S, FL)
 #else
@@ -170,7 +177,7 @@
   Instrumented chsize.
   @c mysql_file_chsize is a replacement for @c my_chsize.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_chsize(F, P1, P2, P3) \
     inline_mysql_file_chsize(__FILE__, __LINE__, F, P1, P2, P3)
 #else
@@ -183,7 +190,7 @@
   Instrumented fopen.
   @c mysql_file_fopen is a replacement for @c my_fopen.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_fopen(K, N, F1, F2) \
     inline_mysql_file_fopen(K, __FILE__, __LINE__, N, F1, F2)
 #else
@@ -203,7 +210,7 @@
   @code DBUG_ASSERT(file != NULL) @endcode,
   since doing so could introduce regressions.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_fclose(FD, FL) \
     inline_mysql_file_fclose(__FILE__, __LINE__, FD, FL)
 #else
@@ -216,7 +223,7 @@
   Instrumented fread.
   @c mysql_file_fread is a replacement for @c my_fread.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_fread(FD, P1, P2, P3) \
     inline_mysql_file_fread(__FILE__, __LINE__, FD, P1, P2, P3)
 #else
@@ -229,7 +236,7 @@
   Instrumented fwrite.
   @c mysql_file_fwrite is a replacement for @c my_fwrite.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_fwrite(FD, P1, P2, P3) \
     inline_mysql_file_fwrite(__FILE__, __LINE__, FD, P1, P2, P3)
 #else
@@ -242,7 +249,7 @@
   Instrumented fseek.
   @c mysql_file_fseek is a replacement for @c my_fseek.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_fseek(FD, P, W, F) \
     inline_mysql_file_fseek(__FILE__, __LINE__, FD, P, W, F)
 #else
@@ -255,7 +262,7 @@
   Instrumented ftell.
   @c mysql_file_ftell is a replacement for @c my_ftell.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_ftell(FD, F) \
     inline_mysql_file_ftell(__FILE__, __LINE__, FD, F)
 #else
@@ -268,7 +275,7 @@
   Instrumented create.
   @c mysql_file_create is a replacement for @c my_create.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_create(K, N, F1, F2, F3) \
   inline_mysql_file_create(K, __FILE__, __LINE__, N, F1, F2, F3)
 #else
@@ -281,7 +288,7 @@
   Instrumented create_temp_file.
   @c mysql_file_create_temp is a replacement for @c create_temp_file.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_create_temp(K, T, D, P, M, F) \
     inline_mysql_file_create_temp(K, T, D, P, M, F)
 #else
@@ -294,7 +301,7 @@
   Instrumented open.
   @c mysql_file_open is a replacement for @c my_open.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_open(K, N, F1, F2) \
     inline_mysql_file_open(K, __FILE__, __LINE__, N, F1, F2)
 #else
@@ -307,7 +314,7 @@
   Instrumented close.
   @c mysql_file_close is a replacement for @c my_close.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_close(FD, F) \
     inline_mysql_file_close(__FILE__, __LINE__, FD, F)
 #else
@@ -320,7 +327,7 @@
   Instrumented read.
   @c mysql_read is a replacement for @c my_read.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_read(FD, B, S, F) \
     inline_mysql_file_read(__FILE__, __LINE__, FD, B, S, F)
 #else
@@ -333,7 +340,7 @@
   Instrumented write.
   @c mysql_file_write is a replacement for @c my_write.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_write(FD, B, S, F) \
     inline_mysql_file_write(__FILE__, __LINE__, FD, B, S, F)
 #else
@@ -346,7 +353,7 @@
   Instrumented pread.
   @c mysql_pread is a replacement for @c my_pread.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_pread(FD, B, S, O, F) \
     inline_mysql_file_pread(__FILE__, __LINE__, FD, B, S, O, F)
 #else
@@ -359,7 +366,7 @@
   Instrumented pwrite.
   @c mysql_file_pwrite is a replacement for @c my_pwrite.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_pwrite(FD, B, S, O, F) \
     inline_mysql_file_pwrite(__FILE__, __LINE__, FD, B, S, O, F)
 #else
@@ -372,7 +379,7 @@
   Instrumented seek.
   @c mysql_file_seek is a replacement for @c my_seek.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_seek(FD, P, W, F) \
     inline_mysql_file_seek(__FILE__, __LINE__, FD, P, W, F)
 #else
@@ -385,7 +392,7 @@
   Instrumented tell.
   @c mysql_file_tell is a replacement for @c my_tell.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_tell(FD, F) \
     inline_mysql_file_tell(__FILE__, __LINE__, FD, F)
 #else
@@ -398,7 +405,7 @@
   Instrumented delete.
   @c mysql_file_delete is a replacement for @c my_delete.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_delete(K, P1, P2) \
     inline_mysql_file_delete(K, __FILE__, __LINE__, P1, P2)
 #else
@@ -411,7 +418,7 @@
   Instrumented rename.
   @c mysql_file_rename is a replacement for @c my_rename.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_rename(K, P1, P2, P3) \
     inline_mysql_file_rename(K, __FILE__, __LINE__, P1, P2, P3)
 #else
@@ -425,7 +432,7 @@
   @c mysql_file_create_with_symlink is a replacement
   for @c my_create_with_symlink.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_create_with_symlink(K, P1, P2, P3, P4, P5) \
   inline_mysql_file_create_with_symlink(K, __FILE__, __LINE__, \
                                         P1, P2, P3, P4, P5)
@@ -440,7 +447,7 @@
   @c mysql_file_delete_with_symlink is a replacement
   for @c my_delete_with_symlink.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_delete_with_symlink(K, P1, P2) \
   inline_mysql_file_delete_with_symlink(K, __FILE__, __LINE__, P1, P2)
 #else
@@ -454,7 +461,7 @@
   @c mysql_file_rename_with_symlink is a replacement
   for @c my_rename_with_symlink.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_rename_with_symlink(K, P1, P2, P3) \
   inline_mysql_file_rename_with_symlink(K, __FILE__, __LINE__, P1, P2, P3)
 #else
@@ -467,7 +474,7 @@
   Instrumented file sync.
   @c mysql_file_sync is a replacement for @c my_sync.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   #define mysql_file_sync(P1, P2) \
     inline_mysql_file_sync(__FILE__, __LINE__, P1, P2)
 #else
@@ -498,115 +505,126 @@ struct st_mysql_file
 */
 typedef struct st_mysql_file MYSQL_FILE;
 
+static inline void inline_mysql_file_register(
+#ifdef HAVE_PSI_FILE_INTERFACE
+  const char *category,
+  PSI_file_info *info,
+  int count
+#else
+  const char *category __attribute__ ((unused)),
+  void *info __attribute__ ((unused)),
+  int count __attribute__ ((unused))
+#endif
+)
+{
+#ifdef HAVE_PSI_FILE_INTERFACE
+  PSI_CALL(register_file)(category, info, count);
+#endif
+}
+
 static inline char *
 inline_mysql_file_fgets(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   char *str, int size, MYSQL_FILE *file)
 {
   char *result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server && file->m_psi))
+  locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                  PSI_FILE_READ);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                      PSI_FILE_READ);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) size, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) size, src_file, src_line);
+    result= fgets(str, size, file->m_file);
+    PSI_CALL(end_file_wait)(locker, result ? strlen(result) : 0);
+    return result;
   }
 #endif
+
   result= fgets(str, size, file->m_file);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, result ? strlen(result) : 0);
-#endif
   return result;
 }
 
 static inline int
 inline_mysql_file_fgetc(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   MYSQL_FILE *file)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server && file->m_psi))
+  locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                  PSI_FILE_READ);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                      PSI_FILE_READ);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 1, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 1, src_file, src_line);
+    result= fgetc(file->m_file);
+    PSI_CALL(end_file_wait)(locker, (size_t) 1);
+    return result;
   }
 #endif
+
   result= fgetc(file->m_file);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 1);
-#endif
   return result;
 }
 
 static inline int
 inline_mysql_file_fputs(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   const char *str, MYSQL_FILE *file)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  size_t bytes= 0;
-  if (likely(PSI_server && file->m_psi))
+  size_t bytes;
+  locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                  PSI_FILE_WRITE);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                      PSI_FILE_WRITE);
-    if (likely(locker != NULL))
-    {
-      bytes= str ? strlen(str) : 0;
-      PSI_server->start_file_wait(locker, bytes, src_file, src_line);
-    }
+    bytes= str ? strlen(str) : 0;
+    PSI_CALL(start_file_wait)(locker, bytes, src_file, src_line);
+    result= fputs(str, file->m_file);
+    PSI_CALL(end_file_wait)(locker, bytes);
+    return result;
   }
 #endif
+
   result= fputs(str, file->m_file);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, bytes);
-#endif
   return result;
 }
 
 static inline int
 inline_mysql_file_fputc(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   char c, MYSQL_FILE *file)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server && file->m_psi))
+  locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                  PSI_FILE_WRITE);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                      PSI_FILE_WRITE);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 1, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 1, src_file, src_line);
+    result= fputc(c, file->m_file);
+    PSI_CALL(end_file_wait)(locker, (size_t) 1);
+    return result;
   }
 #endif
+
   result= fputc(c, file->m_file);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 1);
-#endif
   return result;
 }
 
@@ -618,78 +636,77 @@ inline_mysql_file_fprintf(MYSQL_FILE *file, const char *format, ...)
   */
   int result;
   va_list args;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server && file->m_psi))
+  locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                  PSI_FILE_WRITE);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                      PSI_FILE_WRITE);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, __FILE__, __LINE__);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, __FILE__, __LINE__);
+    va_start(args, format);
+    result= vfprintf(file->m_file, format, args);
+    va_end(args);
+    PSI_CALL(end_file_wait)(locker, (size_t) result);
+    return result;
   }
 #endif
+
   va_start(args, format);
   result= vfprintf(file->m_file, format, args);
   va_end(args);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) result);
-#endif
   return result;
 }
 
 static inline int
 inline_mysql_file_vfprintf(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   MYSQL_FILE *file, const char *format, va_list args)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server && file->m_psi))
+  locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                  PSI_FILE_WRITE);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                      PSI_FILE_WRITE);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= vfprintf(file->m_file, format, args);
+    PSI_CALL(end_file_wait)(locker, (size_t) result);
+    return result;
   }
 #endif
+
   result= vfprintf(file->m_file, format, args);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) result);
-#endif
   return result;
 }
 
 static inline int
 inline_mysql_file_fflush(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   MYSQL_FILE *file)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server && file->m_psi))
+  locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                  PSI_FILE_FLUSH);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                      PSI_FILE_FLUSH);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= fflush(file->m_file);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= fflush(file->m_file);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
@@ -701,90 +718,87 @@ static inline int inline_mysql_file_feof(MYSQL_FILE *file)
 
 static inline int
 inline_mysql_file_fstat(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   int filenr, MY_STAT *stat_area, myf flags)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_descriptor_locker)(&state, filenr,
+                                                      PSI_FILE_FSTAT);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_descriptor_locker(&state, filenr,
-                                                          PSI_FILE_FSTAT);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= my_fstat(filenr, stat_area, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_fstat(filenr, stat_area, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
 static inline MY_STAT *
 inline_mysql_file_stat(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   PSI_file_key key, const char *src_file, uint src_line,
 #endif
   const char *path, MY_STAT *stat_area, myf flags)
 {
   MY_STAT *result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_name_locker)(&state,
+                                                key, PSI_FILE_STAT,
+                                                path, &locker);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_name_locker(&state,
-                                                    key, PSI_FILE_STAT,
-                                                    path, &locker);
-    if (likely(locker != NULL))
-      PSI_server->start_file_open_wait(locker, src_file, src_line);
+    PSI_CALL(start_file_open_wait)(locker, src_file, src_line);
+    result= my_stat(path, stat_area, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_stat(path, stat_area, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
 static inline int
 inline_mysql_file_chsize(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   File file, my_off_t newlength, int filler, myf flags)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_descriptor_locker)(&state, file,
+                                                      PSI_FILE_CHSIZE);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_descriptor_locker(&state, file,
-                                                          PSI_FILE_CHSIZE);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) newlength, src_file,
-                                  src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) newlength, src_file,
+                              src_line);
+    result= my_chsize(file, newlength, filler, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) newlength);
+    return result;
   }
 #endif
+
   result= my_chsize(file, newlength, filler, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) newlength);
-#endif
   return result;
 }
 
 static inline MYSQL_FILE*
 inline_mysql_file_fopen(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   PSI_file_key key, const char *src_file, uint src_line,
 #endif
   const char *filename, int flags, myf myFlags)
@@ -793,30 +807,32 @@ inline_mysql_file_fopen(
   that= (MYSQL_FILE*) my_malloc(sizeof(MYSQL_FILE), MYF(MY_WME));
   if (likely(that != NULL))
   {
-    that->m_psi= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+    struct PSI_file_locker *locker;
+    PSI_file_locker_state state;
+    locker= PSI_CALL(get_thread_file_name_locker)
+        (&state, key, PSI_FILE_STREAM_OPEN, filename, that);
+    if (likely(locker != NULL))
     {
-#ifdef HAVE_PSI_INTERFACE
-      struct PSI_file_locker *locker= NULL;
-      PSI_file_locker_state state;
-      if (likely(PSI_server != NULL))
-      {
-        locker= PSI_server->get_thread_file_name_locker
-          (&state, key, PSI_FILE_STREAM_OPEN, filename, that);
-        if (likely(locker != NULL))
-          that->m_psi= PSI_server->start_file_open_wait(locker, src_file,
-                                                        src_line);
-      }
-#endif
+      that->m_psi= PSI_CALL(start_file_open_wait)(locker, src_file,
+                                                  src_line);
       that->m_file= my_fopen(filename, flags, myFlags);
-#ifdef HAVE_PSI_INTERFACE
-      if (likely(locker != NULL))
-        PSI_server->end_file_open_wait(locker);
-#endif
+      PSI_CALL(end_file_open_wait)(locker);
       if (unlikely(that->m_file == NULL))
       {
         my_free(that);
         return NULL;
       }
+      return that;
+    }
+#endif
+
+    that->m_psi= NULL;
+    that->m_file= my_fopen(filename, flags, myFlags);
+    if (unlikely(that->m_file == NULL))
+    {
+      my_free(that);
+      return NULL;
     }
   }
   return that;
@@ -824,7 +840,7 @@ inline_mysql_file_fopen(
 
 static inline int
 inline_mysql_file_fclose(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   MYSQL_FILE *file, myf flags)
@@ -832,23 +848,22 @@ inline_mysql_file_fclose(
   int result= 0;
   if (likely(file != NULL))
   {
-#ifdef HAVE_PSI_INTERFACE
-    struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+    struct PSI_file_locker *locker;
     PSI_file_locker_state state;
-    DBUG_ASSERT(file != NULL);
-    if (likely(PSI_server && file->m_psi))
+    locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                    PSI_FILE_STREAM_CLOSE);
+    if (likely(locker != NULL))
     {
-      locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                        PSI_FILE_STREAM_CLOSE);
-      if (likely(locker != NULL))
-        PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+      PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+      result= my_fclose(file->m_file, flags);
+      PSI_CALL(end_file_wait)(locker, (size_t) 0);
+      my_free(file);
+      return result;
     }
 #endif
+
     result= my_fclose(file->m_file, flags);
-#ifdef HAVE_PSI_INTERFACE
-    if (likely(locker != NULL))
-      PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
     my_free(file);
   }
   return result;
@@ -856,156 +871,147 @@ inline_mysql_file_fclose(
 
 static inline size_t
 inline_mysql_file_fread(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   MYSQL_FILE *file, uchar *buffer, size_t count, myf flags)
 {
-  size_t result= 0;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+  size_t result;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server && file->m_psi))
-  {
-    locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                      PSI_FILE_READ);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, count, src_file, src_line);
-  }
-#endif
-  result= my_fread(file->m_file, buffer, count, flags);
-#ifdef HAVE_PSI_INTERFACE
+  size_t bytes_read;
+  locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                  PSI_FILE_READ);
   if (likely(locker != NULL))
   {
-    size_t bytes_read;
+    PSI_CALL(start_file_wait)(locker, count, src_file, src_line);
+    result= my_fread(file->m_file, buffer, count, flags);
     if (flags & (MY_NABP | MY_FNABP))
       bytes_read= (result == 0) ? count : 0;
     else
       bytes_read= (result != MY_FILE_ERROR) ? result : 0;
-    PSI_server->end_file_wait(locker, bytes_read);
+    PSI_CALL(end_file_wait)(locker, bytes_read);
+    return result;
   }
 #endif
+
+  result= my_fread(file->m_file, buffer, count, flags);
   return result;
 }
 
 static inline size_t
 inline_mysql_file_fwrite(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   MYSQL_FILE *file, const uchar *buffer, size_t count, myf flags)
 {
-  size_t result= 0;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+  size_t result;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server && file->m_psi))
-  {
-    locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                      PSI_FILE_WRITE);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, count, src_file, src_line);
-  }
-#endif
-  result= my_fwrite(file->m_file, buffer, count, flags);
-#ifdef HAVE_PSI_INTERFACE
+  size_t bytes_written;
+  locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                  PSI_FILE_WRITE);
   if (likely(locker != NULL))
   {
-    size_t bytes_written;
+    PSI_CALL(start_file_wait)(locker, count, src_file, src_line);
+    result= my_fwrite(file->m_file, buffer, count, flags);
     if (flags & (MY_NABP | MY_FNABP))
       bytes_written= (result == 0) ? count : 0;
     else
       bytes_written= (result != MY_FILE_ERROR) ? result : 0;
-    PSI_server->end_file_wait(locker, bytes_written);
+    PSI_CALL(end_file_wait)(locker, bytes_written);
+    return result;
   }
 #endif
+
+  result= my_fwrite(file->m_file, buffer, count, flags);
   return result;
 }
 
 static inline my_off_t
 inline_mysql_file_fseek(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   MYSQL_FILE *file, my_off_t pos, int whence, myf flags)
 {
   my_off_t result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server && file->m_psi))
+  locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                  PSI_FILE_SEEK);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                      PSI_FILE_SEEK);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= my_fseek(file->m_file, pos, whence, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_fseek(file->m_file, pos, whence, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
 static inline my_off_t
 inline_mysql_file_ftell(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   MYSQL_FILE *file, myf flags)
 {
   my_off_t result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server && file->m_psi))
+  locker= PSI_CALL(get_thread_file_stream_locker)(&state, file->m_psi,
+                                                  PSI_FILE_TELL);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_stream_locker(&state, file->m_psi,
-                                                      PSI_FILE_TELL);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= my_ftell(file->m_file, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_ftell(file->m_file, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
 static inline File
 inline_mysql_file_create(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   PSI_file_key key, const char *src_file, uint src_line,
 #endif
   const char *filename, int create_flags, int access_flags, myf myFlags)
 {
   File file;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_name_locker)(&state, key, PSI_FILE_CREATE,
+                                                filename, &locker);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_name_locker(&state, key, PSI_FILE_CREATE,
-                                                    filename, &locker);
-    if (likely(locker != NULL))
-      PSI_server->start_file_open_wait(locker, src_file, src_line);
+    PSI_CALL(start_file_open_wait)(locker, src_file, src_line);
+    file= my_create(filename, create_flags, access_flags, myFlags);
+    PSI_CALL(end_file_open_wait_and_bind_to_descriptor)(locker, file);
+    return file;
   }
 #endif
+
   file= my_create(filename, create_flags, access_flags, myFlags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_open_wait_and_bind_to_descriptor(locker, file);
-#endif
   return file;
 }
 
 static inline File
 inline_mysql_file_create_temp(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   PSI_file_key key,
 #endif
   char *to, const char *dir, const char *pfx, int mode, myf myFlags)
@@ -1017,414 +1023,392 @@ inline_mysql_file_create_temp(
     before the create_temp_file call.
   */
   file= create_temp_file(to, dir, pfx, mode, myFlags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(PSI_server != NULL))
-    PSI_server->create_file(key, to, file);
+#ifdef HAVE_PSI_FILE_INTERFACE
+  PSI_CALL(create_file)(key, to, file);
 #endif
   return file;
 }
 
 static inline File
 inline_mysql_file_open(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   PSI_file_key key, const char *src_file, uint src_line,
 #endif
   const char *filename, int flags, myf myFlags)
 {
   File file;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_name_locker)(&state, key, PSI_FILE_OPEN,
+                                                filename, &locker);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_name_locker(&state, key, PSI_FILE_OPEN,
-                                                    filename, &locker);
-    if (likely(locker != NULL))
-      PSI_server->start_file_open_wait(locker, src_file, src_line);
+    PSI_CALL(start_file_open_wait)(locker, src_file, src_line);
+    file= my_open(filename, flags, myFlags);
+    PSI_CALL(end_file_open_wait_and_bind_to_descriptor)(locker, file);
+    return file;
   }
 #endif
+
   file= my_open(filename, flags, myFlags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_open_wait_and_bind_to_descriptor(locker, file);
-#endif
   return file;
 }
 
 static inline int
 inline_mysql_file_close(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   File file, myf flags)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_descriptor_locker)(&state, file,
+                                                      PSI_FILE_CLOSE);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_descriptor_locker(&state, file,
-                                                          PSI_FILE_CLOSE);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= my_close(file, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_close(file, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
 static inline size_t
 inline_mysql_file_read(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   File file, uchar *buffer, size_t count, myf flags)
 {
-  size_t result= 0;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+  size_t result;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
-  {
-    locker= PSI_server->get_thread_file_descriptor_locker(&state, file,
-                                                          PSI_FILE_READ);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, count, src_file, src_line);
-  }
-#endif
-  result= my_read(file, buffer, count, flags);
-#ifdef HAVE_PSI_INTERFACE
+  size_t bytes_read;
+  locker= PSI_CALL(get_thread_file_descriptor_locker)(&state, file,
+                                                      PSI_FILE_READ);
   if (likely(locker != NULL))
   {
-    size_t bytes_read;
+    PSI_CALL(start_file_wait)(locker, count, src_file, src_line);
+    result= my_read(file, buffer, count, flags);
     if (flags & (MY_NABP | MY_FNABP))
       bytes_read= (result == 0) ? count : 0;
     else
       bytes_read= (result != MY_FILE_ERROR) ? result : 0;
-    PSI_server->end_file_wait(locker, bytes_read);
+    PSI_CALL(end_file_wait)(locker, bytes_read);
+    return result;
   }
 #endif
+
+  result= my_read(file, buffer, count, flags);
   return result;
 }
 
 static inline size_t
 inline_mysql_file_write(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   File file, const uchar *buffer, size_t count, myf flags)
 {
   size_t result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
-  {
-    locker= PSI_server->get_thread_file_descriptor_locker(&state, file,
-                                                          PSI_FILE_WRITE);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, count, src_file, src_line);
-  }
-#endif
-  result= my_write(file, buffer, count, flags);
-#ifdef HAVE_PSI_INTERFACE
+  size_t bytes_written;
+  locker= PSI_CALL(get_thread_file_descriptor_locker)(&state, file,
+                                                      PSI_FILE_WRITE);
   if (likely(locker != NULL))
   {
-    size_t bytes_written;
+    PSI_CALL(start_file_wait)(locker, count, src_file, src_line);
+    result= my_write(file, buffer, count, flags);
     if (flags & (MY_NABP | MY_FNABP))
       bytes_written= (result == 0) ? count : 0;
     else
       bytes_written= (result != MY_FILE_ERROR) ? result : 0;
-    PSI_server->end_file_wait(locker, bytes_written);
+    PSI_CALL(end_file_wait)(locker, bytes_written);
+    return result;
   }
 #endif
+
+  result= my_write(file, buffer, count, flags);
   return result;
 }
 
 static inline size_t
 inline_mysql_file_pread(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   File file, uchar *buffer, size_t count, my_off_t offset, myf flags)
 {
   size_t result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
-  {
-    locker= PSI_server->get_thread_file_descriptor_locker(&state, file, PSI_FILE_READ);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, count, src_file, src_line);
-  }
-#endif
-  result= my_pread(file, buffer, count, offset, flags);
-#ifdef HAVE_PSI_INTERFACE
+  size_t bytes_read;
+  locker= PSI_CALL(get_thread_file_descriptor_locker)(&state, file, PSI_FILE_READ);
   if (likely(locker != NULL))
   {
-    size_t bytes_read;
+    PSI_CALL(start_file_wait)(locker, count, src_file, src_line);
+    result= my_pread(file, buffer, count, offset, flags);
     if (flags & (MY_NABP | MY_FNABP))
       bytes_read= (result == 0) ? count : 0;
     else
       bytes_read= (result != MY_FILE_ERROR) ? result : 0;
-    PSI_server->end_file_wait(locker, bytes_read);
+    PSI_CALL(end_file_wait)(locker, bytes_read);
+    return result;
   }
 #endif
+
+  result= my_pread(file, buffer, count, offset, flags);
   return result;
 }
 
 static inline size_t
 inline_mysql_file_pwrite(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   File file, const uchar *buffer, size_t count, my_off_t offset, myf flags)
 {
   size_t result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
-  {
-    locker= PSI_server->get_thread_file_descriptor_locker(&state, file,
-                                                          PSI_FILE_WRITE);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, count, src_file, src_line);
-  }
-#endif
-  result= my_pwrite(file, buffer, count, offset, flags);
-#ifdef HAVE_PSI_INTERFACE
+  size_t bytes_written;
+  locker= PSI_CALL(get_thread_file_descriptor_locker)(&state, file,
+                                                      PSI_FILE_WRITE);
   if (likely(locker != NULL))
   {
-    size_t bytes_written;
+    PSI_CALL(start_file_wait)(locker, count, src_file, src_line);
+    result= my_pwrite(file, buffer, count, offset, flags);
     if (flags & (MY_NABP | MY_FNABP))
       bytes_written= (result == 0) ? count : 0;
     else
       bytes_written= (result != MY_FILE_ERROR) ? result : 0;
-    PSI_server->end_file_wait(locker, bytes_written);
+    PSI_CALL(end_file_wait)(locker, bytes_written);
+    return result;
   }
 #endif
+
+  result= my_pwrite(file, buffer, count, offset, flags);
   return result;
 }
 
 static inline my_off_t
 inline_mysql_file_seek(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   File file, my_off_t pos, int whence, myf flags)
 {
   my_off_t result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_descriptor_locker)(&state, file, PSI_FILE_SEEK);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_descriptor_locker(&state, file, PSI_FILE_SEEK);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= my_seek(file, pos, whence, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_seek(file, pos, whence, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
 static inline my_off_t
 inline_mysql_file_tell(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   File file, myf flags)
 {
   my_off_t result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_descriptor_locker)(&state, file, PSI_FILE_TELL);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_descriptor_locker(&state, file, PSI_FILE_TELL);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= my_tell(file, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_tell(file, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
 static inline int
 inline_mysql_file_delete(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   PSI_file_key key, const char *src_file, uint src_line,
 #endif
   const char *name, myf flags)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_name_locker)(&state, key, PSI_FILE_DELETE,
+                                                name, &locker);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_name_locker(&state, key, PSI_FILE_DELETE,
-                                                    name, &locker);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= my_delete(name, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_delete(name, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
 static inline int
 inline_mysql_file_rename(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   PSI_file_key key, const char *src_file, uint src_line,
 #endif
   const char *from, const char *to, myf flags)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_name_locker)(&state, key, PSI_FILE_RENAME,
+                                                to, &locker);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_name_locker(&state, key, PSI_FILE_RENAME,
-                                                    to, &locker);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= my_rename(from, to, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_rename(from, to, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
 static inline File
 inline_mysql_file_create_with_symlink(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   PSI_file_key key, const char *src_file, uint src_line,
 #endif
   const char *linkname, const char *filename, int create_flags,
   int access_flags, myf flags)
 {
   File file;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_name_locker)(&state, key, PSI_FILE_CREATE,
+                                                filename, &locker);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_name_locker(&state, key, PSI_FILE_CREATE,
-                                                    filename, &locker);
-    if (likely(locker != NULL))
-      PSI_server->start_file_open_wait(locker, src_file, src_line);
+    PSI_CALL(start_file_open_wait)(locker, src_file, src_line);
+    file= my_create_with_symlink(linkname, filename, create_flags, access_flags,
+                                 flags);
+    PSI_CALL(end_file_open_wait_and_bind_to_descriptor)(locker, file);
+    return file;
   }
 #endif
+
   file= my_create_with_symlink(linkname, filename, create_flags, access_flags,
                                flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_open_wait_and_bind_to_descriptor(locker, file);
-#endif
   return file;
 }
 
 static inline int
 inline_mysql_file_delete_with_symlink(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   PSI_file_key key, const char *src_file, uint src_line,
 #endif
   const char *name, myf flags)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_name_locker)(&state, key, PSI_FILE_DELETE,
+                                                name, &locker);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_name_locker(&state, key, PSI_FILE_DELETE,
-                                                    name, &locker);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= my_delete_with_symlink(name, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_delete_with_symlink(name, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
 static inline int
 inline_mysql_file_rename_with_symlink(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   PSI_file_key key, const char *src_file, uint src_line,
 #endif
   const char *from, const char *to, myf flags)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_name_locker)(&state, key, PSI_FILE_RENAME,
+                                                to, &locker);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_name_locker(&state, key, PSI_FILE_RENAME,
-                                                    to, &locker);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= my_rename_with_symlink(from, to, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_rename_with_symlink(from, to, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
 static inline int
 inline_mysql_file_sync(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_FILE_INTERFACE
   const char *src_file, uint src_line,
 #endif
   File fd, myf flags)
 {
   int result= 0;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_file_locker *locker= NULL;
+#ifdef HAVE_PSI_FILE_INTERFACE
+  struct PSI_file_locker *locker;
   PSI_file_locker_state state;
-  if (likely(PSI_server != NULL))
+  locker= PSI_CALL(get_thread_file_descriptor_locker)(&state, fd, PSI_FILE_SYNC);
+  if (likely(locker != NULL))
   {
-    locker= PSI_server->get_thread_file_descriptor_locker(&state, fd, PSI_FILE_SYNC);
-    if (likely(locker != NULL))
-      PSI_server->start_file_wait(locker, (size_t) 0, src_file, src_line);
+    PSI_CALL(start_file_wait)(locker, (size_t) 0, src_file, src_line);
+    result= my_sync(fd, flags);
+    PSI_CALL(end_file_wait)(locker, (size_t) 0);
+    return result;
   }
 #endif
+
   result= my_sync(fd, flags);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_file_wait(locker, (size_t) 0);
-#endif
   return result;
 }
 
diff --git a/include/mysql/psi/mysql_idle.h b/include/mysql/psi/mysql_idle.h
new file mode 100644
index 00000000000..7a3fccfdb8c
--- /dev/null
+++ b/include/mysql/psi/mysql_idle.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.   
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef MYSQL_IDLE_H
+#define MYSQL_IDLE_H
+
+/**
+  @file mysql/psi/mysql_idle.h
+  Instrumentation helpers for idle waits.
+*/
+
+#include "mysql/psi/psi.h"
+
+/**
+  @defgroup Idle_instrumentation Idle Instrumentation
+  @ingroup Instrumentation_interface
+  @{
+*/
+
+/**
+  @def MYSQL_START_IDLE_WAIT
+  Instrumentation helper for table io_waits.
+  This instrumentation marks the start of a wait event.
+  @param LOCKER the locker
+  @param STATE the locker state
+  @sa MYSQL_END_IDLE_WAIT.
+*/
+#ifdef HAVE_PSI_IDLE_INTERFACE
+  #define MYSQL_START_IDLE_WAIT(LOCKER, STATE) \
+    LOCKER= inline_mysql_start_idle_wait(STATE, __FILE__, __LINE__)
+#else
+  #define MYSQL_START_IDLE_WAIT(LOCKER, STATE) \
+    do {} while (0)
+#endif
+
+/**
+  @def MYSQL_END_IDLE_WAIT
+  Instrumentation helper for idle waits.
+  This instrumentation marks the end of a wait event.
+  @param LOCKER the locker
+  @sa MYSQL_START_IDLE_WAIT.
+*/
+#ifdef HAVE_PSI_IDLE_INTERFACE
+  #define MYSQL_END_IDLE_WAIT(LOCKER) \
+    inline_mysql_end_idle_wait(LOCKER)
+#else
+  #define MYSQL_END_IDLE_WAIT(LOCKER) \
+    do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_IDLE_INTERFACE
+/**
+  Instrumentation calls for MYSQL_START_IDLE_WAIT.
+  @sa MYSQL_END_IDLE_WAIT.
+*/
+static inline struct PSI_idle_locker *
+inline_mysql_start_idle_wait(PSI_idle_locker_state *state,
+                             const char *src_file, int src_line)
+{
+  struct PSI_idle_locker *locker;
+  locker= PSI_CALL(start_idle_wait)(state, src_file, src_line);
+  return locker;
+}
+
+/**
+  Instrumentation calls for MYSQL_END_IDLE_WAIT.
+  @sa MYSQL_START_IDLE_WAIT.
+*/
+static inline void
+inline_mysql_end_idle_wait(struct PSI_idle_locker *locker)
+{
+  if (likely(locker != NULL))
+    PSI_CALL(end_idle_wait)(locker);
+}
+#endif
+
+/** @} (end of group Idle_instrumentation) */
+
+#endif
+
diff --git a/include/mysql/psi/mysql_socket.h b/include/mysql/psi/mysql_socket.h
new file mode 100644
index 00000000000..c908032883a
--- /dev/null
+++ b/include/mysql/psi/mysql_socket.h
@@ -0,0 +1,1137 @@
+/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License as
+published by the Free Software Foundation; version 2 of the
+License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+02110-1301  USA
+*/
+
+#ifndef MYSQL_SOCKET_H
+#define MYSQL_SOCKET_H
+
+/* For strlen() */
+#include <string.h>
+/* For MY_STAT */
+#include <my_dir.h>
+/* For my_chsize */
+#include <my_sys.h>
+/* For socket api */
+#ifdef __WIN__
+  #include <ws2def.h>
+  #include <winsock2.h>
+  #define SOCKBUF_T char
+#else
+  #include <netinet/in.h>
+  #define SOCKBUF_T void
+#endif
+/**
+  @file mysql/psi/mysql_socket.h
+[...]
+*/
+
+#include "mysql/psi/psi.h"
+
+/**
+  @defgroup Socket_instrumentation Socket Instrumentation
+  @ingroup Instrumentation_interface
+  @{
+*/
+
+/**
+  @def mysql_socket_register(P1, P2, P3)
+  Socket registration.
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_register(P1, P2, P3) \
+    inline_mysql_socket_register(P1, P2, P3)
+#else
+  #define mysql_socket_register(P1, P2, P3) \
+    do {} while (0)
+#endif
+
+struct st_mysql_socket
+{
+  /** The real socket descriptor. */
+  my_socket fd;
+
+  /**
+    The instrumentation hook.
+    Note that this hook is not conditionally defined,
+    for binary compatibility of the @c MYSQL_SOCKET interface.
+  */
+  struct PSI_socket *m_psi;
+};
+
+/**
+  An instrumented socket.
+  @c MYSQL_SOCKET is a replacement for @c my_socket.
+*/
+typedef struct st_mysql_socket MYSQL_SOCKET;
+
+
+/**
+  @def MYSQL_INVALID_SOCKET
+  MYSQL_SOCKET initial value.
+*/
+//MYSQL_SOCKET MYSQL_INVALID_SOCKET= {INVALID_SOCKET, NULL};
+#define MYSQL_INVALID_SOCKET mysql_socket_invalid()
+
+/**
+  MYSQL_SOCKET helper. Initialize instrumented socket.
+  @sa mysql_socket_getfd
+  @sa mysql_socket_setfd
+*/
+static inline MYSQL_SOCKET
+mysql_socket_invalid()
+{
+  MYSQL_SOCKET mysql_socket= {INVALID_SOCKET, NULL};
+  return mysql_socket;
+}
+
+/**
+  Set socket descriptor and address.
+  @param socket nstrumented socket
+  @param fd socket descriptor
+  @param addr unformatted socket address
+  @param adr_len length of socket addres
+*/
+
+static inline void
+mysql_socket_set_address(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  MYSQL_SOCKET socket,
+  const struct sockaddr *addr,
+  socklen_t addr_len
+#else
+  MYSQL_SOCKET socket __attribute__ ((unused)),
+  const struct sockaddr *addr __attribute__ ((unused)),
+  socklen_t addr_len __attribute__ ((unused))
+#endif
+)
+{
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (socket.m_psi != NULL)
+    PSI_CALL(set_socket_info)(socket.m_psi, NULL, addr, addr_len);
+#endif
+}
+
+/**
+  Set socket descriptor and address.
+  @param socket instrumented socket
+  @param thread instrumented owning thread
+*/
+static inline void
+mysql_socket_set_thread_owner(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+MYSQL_SOCKET socket
+#else
+MYSQL_SOCKET socket __attribute__ ((unused))
+#endif
+)
+{
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (socket.m_psi != NULL)
+    PSI_CALL(set_socket_thread_owner)(socket.m_psi);
+#endif
+}
+
+/**
+  MYSQL_SOCKET helper. Get socket descriptor.
+  @param mysql_socket Instrumented socket
+  @sa mysql_socket_setfd
+*/
+static inline my_socket
+mysql_socket_getfd(MYSQL_SOCKET mysql_socket)
+{
+  return mysql_socket.fd;
+}
+
+/**
+  MYSQL_SOCKET helper. Set socket descriptor.
+  @param mysql_socket Instrumented socket
+  @param fd Socket descriptor
+  @sa mysql_socket_getfd
+*/
+static inline void
+mysql_socket_setfd(MYSQL_SOCKET *mysql_socket, my_socket fd)
+{
+  if (likely(mysql_socket != NULL))
+    mysql_socket->fd= fd;
+}
+
+/**
+  @def MYSQL_SOCKET_WAIT_VARIABLES
+  Instrumentation helper for socket waits.
+  This instrumentation declares local variables.
+  Do not use a ';' after this macro
+  @param LOCKER locker
+  @param STATE locker state
+  @sa MYSQL_START_SOCKET_WAIT.
+  @sa MYSQL_END_SOCKET_WAIT.
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define MYSQL_SOCKET_WAIT_VARIABLES(LOCKER, STATE) \
+    struct PSI_socket_locker* LOCKER; \
+    PSI_socket_locker_state STATE;
+#else
+  #define MYSQL_SOCKET_WAIT_VARIABLES(LOCKER, STATE)
+#endif
+
+/**
+  @def MYSQL_START_SOCKET_WAIT
+  Instrumentation helper for socket waits.
+  This instrumentation marks the start of a wait event.
+  @param LOCKER locker
+  @param STATE locker state
+  @param SOCKET instrumented socket
+  @param OP The socket operation to be performed
+  @param FLAGS per-socket operation flags.
+  @param COUNT bytes to be written/read
+  @sa MYSQL_END_SOCKET_WAIT.
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define MYSQL_START_SOCKET_WAIT(LOCKER, STATE, SOCKET, OP, COUNT) \
+    LOCKER= inline_mysql_start_socket_wait(STATE, SOCKET, OP, COUNT,\
+                                           __FILE__, __LINE__)
+#else
+  #define MYSQL_START_SOCKET_WAIT(LOCKER, STATE, SOCKET, OP, COUNT) \
+    do {} while (0)
+#endif
+
+/**
+  @def MYSQL_END_SOCKET_WAIT
+  Instrumentation helper for socket waits.
+  This instrumentation marks the end of a wait event.
+  @param LOCKER locker
+  @param COUNT actual bytes written/read, or -1
+  @sa MYSQL_START_SOCKET_WAIT.
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define MYSQL_END_SOCKET_WAIT(LOCKER, COUNT) \
+    inline_mysql_end_socket_wait(LOCKER, COUNT)
+#else
+  #define MYSQL_END_SOCKET_WAIT(LOCKER, COUNT) \
+    do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define MYSQL_SOCKET_SET_STATE(SOCKET, STATE) \
+    inline_mysql_socket_set_state(SOCKET, STATE)
+#else
+  #define MYSQL_SOCKET_SET_STATE(SOCKET, STATE) \
+    do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+/**
+  Instrumentation calls for MYSQL_START_SOCKET_WAIT.
+  @sa MYSQL_START_SOCKET_WAIT.
+*/
+static inline struct PSI_socket_locker*
+inline_mysql_start_socket_wait(PSI_socket_locker_state *state,
+                               MYSQL_SOCKET mysql_socket,
+                               enum PSI_socket_operation op,
+                               size_t byte_count,
+                               const char *src_file, int src_line)
+{
+  struct PSI_socket_locker *locker;
+  if (mysql_socket.m_psi != NULL)
+  {
+    locker= PSI_CALL(start_socket_wait)(state, mysql_socket.m_psi, op,
+                                        byte_count, src_file, src_line);
+  }
+  else
+    locker= NULL;
+  return locker;
+}
+
+/**
+  Instrumentation calls for MYSQL_END_SOCKET_WAIT.
+  @sa MYSQL_END_SOCKET_WAIT.
+*/
+static inline void
+inline_mysql_end_socket_wait(struct PSI_socket_locker *locker, size_t byte_count)
+{
+  if (locker != NULL)
+    PSI_CALL(end_socket_wait)(locker, byte_count);
+}
+
+/**
+  Set the state (IDLE, ACTIVE) of an instrumented socket.
+  @param socket the instrumented socket
+  @param state the new state
+  @sa PSI_socket_state
+*/
+static inline void
+inline_mysql_socket_set_state(MYSQL_SOCKET socket, enum PSI_socket_state state)
+{
+  if (socket.m_psi != NULL)
+    PSI_CALL(set_socket_state)(socket.m_psi, state);
+}
+#endif /* HAVE_PSI_SOCKET_INTERFACE */
+
+/**
+  @def mysql_socket_socket(K, D, T, P)
+  Create a socket.
+  @c mysql_socket_socket is a replacement for @c socket.
+  @param K PSI_socket_key for this instrumented socket
+  @param D Socket domain
+  @param T Protocol type
+  @param P Transport protocol
+*/
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_socket(K, D, T, P) \
+    inline_mysql_socket_socket(K, D, T, P)
+#else
+  #define mysql_socket_socket(K, D, T, P) \
+    inline_mysql_socket_socket(D, T, P)
+#endif
+
+/**
+  @def mysql_socket_bind(FD, AP, L)
+  Bind a socket to a local port number and IP address
+  @c mysql_socket_bind is a replacement for @c bind.
+  @param FD Instrumented socket descriptor returned by socket()
+  @param AP Pointer to local port number and IP address in sockaddr structure
+  @param L  Length of sockaddr structure
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_bind(FD, AP, L) \
+    inline_mysql_socket_bind(__FILE__, __LINE__, FD, AP, L)
+#else
+  #define mysql_socket_bind(FD, AP, L) \
+    inline_mysql_socket_bind(FD, AP, L)
+#endif
+
+/**
+  @def mysql_socket_getsockname(FD, AP, LP)
+  Return port number and IP address of the local host
+  @c mysql_socket_getsockname is a replacement for @c getsockname.
+  @param FD Instrumented socket descriptor returned by socket()
+  @param A  Pointer to returned address of local host in sockaddr structure
+  @param L  Pointer to length of sockaddr structure
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_getsockname(FD, AP, LP) \
+    inline_mysql_socket_getsockname(__FILE__, __LINE__, FD, AP, LP)
+#else
+  #define mysql_socket_getsockname(FD, AP, LP) \
+    inline_mysql_socket_getsockname(FD, AP, LP)
+#endif
+
+/**
+  @def mysql_socket_connect(FD, AP, L)
+  Establish a connection to a remote host.
+  @c mysql_socket_connect is a replacement for @c connect.
+  @param FD Instrumented socket descriptor returned by socket()
+  @param AP Pointer to target address in sockaddr structure
+  @param L  Length of sockaddr structure
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_connect(FD, AP, L) \
+    inline_mysql_socket_connect(__FILE__, __LINE__, FD, AP, L)
+#else
+  #define mysql_socket_connect(FD, AP, L) \
+    inline_mysql_socket_connect(FD, AP, L)
+#endif
+
+/**
+  @def mysql_socket_getpeername(FD, AP, LP)
+  Get port number and IP address of remote host that a socket is connected to.
+  @c mysql_socket_getpeername is a replacement for @c getpeername.
+  @param FD Instrumented socket descriptor returned by socket() or accept()
+  @param AP Pointer to returned address of remote host in sockaddr structure
+  @param LP Pointer to length of sockaddr structure
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_getpeername(FD, AP, LP) \
+    inline_mysql_socket_getpeername(__FILE__, __LINE__, FD, AP, LP)
+#else
+  #define mysql_socket_getpeername(FD, AP, LP) \
+    inline_mysql_socket_getpeername(FD, AP, LP)
+#endif
+
+/**
+  @def mysql_socket_send(FD, B, N, FL)
+  Send data from the buffer, B, to a connected socket.
+  @c mysql_socket_send is a replacement for @c send.
+  @param FD Instrumented socket descriptor returned by socket() or accept()
+  @param B  Buffer to send
+  @param N  Number of bytes to send
+  @param FL Control flags
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_send(FD, B, N, FL) \
+    inline_mysql_socket_send(__FILE__, __LINE__, FD, B, N, FL)
+#else
+  #define mysql_socket_send(FD, B, N, FL) \
+    inline_mysql_socket_send(FD, B, N, FL)
+#endif
+
+/**
+  @def mysql_socket_recv(FD, B, N, FL)
+  Receive data from a connected socket.
+  @c mysql_socket_recv is a replacement for @c recv.
+  @param FD Instrumented socket descriptor returned by socket() or accept()
+  @param B  Buffer to receive to
+  @param N  Maximum bytes to receive
+  @param FL Control flags
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_recv(FD, B, N, FL) \
+    inline_mysql_socket_recv(__FILE__, __LINE__, FD, B, N, FL)
+#else
+  #define mysql_socket_recv(FD, B, N, FL) \
+    inline_mysql_socket_recv(FD, B, N, FL)
+#endif
+
+/**
+  @def mysql_socket_sendto(FD, B, N, FL, AP, L)
+  Send data to a socket at the specified address.
+  @c mysql_socket_sendto is a replacement for @c sendto.
+  @param FD Instrumented socket descriptor returned by socket()
+  @param B  Buffer to send
+  @param N  Number of bytes to send
+  @param FL Control flags
+  @param AP Pointer to destination sockaddr structure
+  @param L  Size of sockaddr structure
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_sendto(FD, B, N, FL, AP, L) \
+    inline_mysql_socket_sendto(__FILE__, __LINE__, FD, B, N, FL, AP, L)
+#else
+  #define mysql_socket_sendto(FD, B, N, FL, AP, L) \
+    inline_mysql_socket_sendto(FD, B, N, FL, AP, L)
+#endif
+
+/**
+  @def mysql_socket_recvfrom(FD, B, N, FL, AP, L)
+  Receive data from a socket and return source address information
+  @c mysql_socket_recvfrom is a replacement for @c recvfrom.
+  @param FD Instrumented socket descriptor returned by socket()
+  @param B  Buffer to receive to
+  @param N  Maximum bytes to receive
+  @param FL Control flags
+  @param AP Pointer to source address in sockaddr_storage structure
+  @param L  Size of sockaddr_storage structure
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_recvfrom(FD, B, N, FL, AP, LP) \
+    inline_mysql_socket_recvfrom(__FILE__, __LINE__, FD, B, N, FL, AP, LP)
+#else
+  #define mysql_socket_recvfrom(FD, B, N, FL, AP, LP) \
+    inline_mysql_socket_recvfrom(FD, B, N, FL, AP, LP)
+#endif
+
+/**
+  @def mysql_socket_getsockopt(FD, LV, ON, OP, OL)
+  Get a socket option for the specified socket.
+  @c mysql_socket_getsockopt is a replacement for @c getsockopt.
+  @param FD Instrumented socket descriptor returned by socket()
+  @param LV Protocol level
+  @param ON Option to query
+  @param OP Buffer which will contain the value for the requested option
+  @param OL Pointer to length of OP
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_getsockopt(FD, LV, ON, OP, OL) \
+    inline_mysql_socket_getsockopt(__FILE__, __LINE__, FD, LV, ON, OP, OL)
+#else
+  #define mysql_socket_getsockopt(FD, LV, ON, OP, OL) \
+    inline_mysql_socket_getsockopt(FD, LV, ON, OP, OL)
+#endif
+
+/**
+  @def mysql_socket_setsockopt(FD, LV, ON, OP, OL)
+  Set a socket option for the specified socket.
+  @c mysql_socket_setsockopt is a replacement for @c setsockopt.
+  @param FD Instrumented socket descriptor returned by socket()
+  @param LV Protocol level
+  @param ON Option to modify
+  @param OP Buffer containing the value for the specified option
+  @param OL Pointer to length of OP
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_setsockopt(FD, LV, ON, OP, OL) \
+    inline_mysql_socket_setsockopt(__FILE__, __LINE__, FD, LV, ON, OP, OL)
+#else
+  #define mysql_socket_setsockopt(FD, LV, ON, OP, OL) \
+    inline_mysql_socket_setsockopt(FD, LV, ON, OP, OL)
+#endif
+
+/**
+  @def mysql_socket_listen(FD, N)
+  Set socket state to listen for an incoming connection.
+  @c mysql_socket_listen is a replacement for @c listen.
+  @param FD Instrumented socket descriptor, bound and connected
+  @param N  Maximum number of pending connections allowed.
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_listen(FD, N) \
+    inline_mysql_socket_listen(__FILE__, __LINE__, FD, N)
+#else
+  #define mysql_socket_listen(FD, N) \
+    inline_mysql_socket_listen(FD, N)
+#endif
+
+/**
+  @def mysql_socket_accept(K, FD, AP, LP)
+  Accept a connection from any remote host; TCP only.
+  @c mysql_socket_accept is a replacement for @c accept.
+  @param K PSI_socket_key for this instrumented socket
+  @param FD Instrumented socket descriptor, bound and placed in a listen state
+  @param AP Pointer to sockaddr structure with returned IP address and port of connected host
+  @param LP Pointer to length of valid information in AP
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_accept(K, FD, AP, LP) \
+    inline_mysql_socket_accept(__FILE__, __LINE__, K, FD, AP, LP)
+#else
+  #define mysql_socket_accept(K, FD, AP, LP) \
+    inline_mysql_socket_accept(FD, AP, LP)
+#endif
+
+/**
+  @def mysql_socket_close(FD)
+  Close a socket and sever any connections.
+  @c mysql_socket_close is a replacement for @c close.
+  @param FD Instrumented socket descriptor returned by socket() or accept()
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_close(FD) \
+    inline_mysql_socket_close(__FILE__, __LINE__, FD)
+#else
+  #define mysql_socket_close(FD) \
+    inline_mysql_socket_close(FD)
+#endif
+
+/**
+  @def mysql_socket_shutdown(FD, H)
+  Disable receives and/or sends on a socket.
+  @c mysql_socket_shutdown is a replacement for @c shutdown.
+  @param FD Instrumented socket descriptor returned by socket() or accept()
+  @param H  Specifies which operations to shutdown
+*/
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  #define mysql_socket_shutdown(FD, H) \
+    inline_mysql_socket_shutdown(__FILE__, __LINE__, FD, H)
+#else
+  #define mysql_socket_shutdown(FD, H) \
+    inline_mysql_socket_shutdown(FD, H)
+#endif
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+static inline void inline_mysql_socket_register(
+  const char *category,
+  PSI_socket_info *info,
+  int count)
+{
+  PSI_CALL(register_socket)(category, info, count);
+}
+#endif
+
+/** mysql_socket_socket */
+
+static inline MYSQL_SOCKET
+inline_mysql_socket_socket
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  PSI_socket_key key,
+#endif
+  int domain, int type, int protocol)
+{
+  MYSQL_SOCKET mysql_socket;
+  mysql_socket.fd= socket(domain, type, protocol);
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  mysql_socket.m_psi= PSI_CALL(init_socket)(key, (const my_socket*)&mysql_socket.fd);
+
+  if (likely(mysql_socket.fd != INVALID_SOCKET && mysql_socket.m_psi != NULL))
+    PSI_CALL(set_socket_info)(mysql_socket.m_psi, &mysql_socket.fd, NULL, 0);
+#else
+  mysql_socket.m_psi= NULL;
+#endif
+  return mysql_socket;
+}
+
+/** mysql_socket_bind */
+
+static inline int
+inline_mysql_socket_bind
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+  MYSQL_SOCKET mysql_socket, const struct sockaddr *addr, socklen_t len)
+{
+  int result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker_state state;
+    PSI_socket_locker *locker;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_BIND, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    result= bind(mysql_socket.fd, addr, len);
+
+    /* Instrumentation end */
+    PSI_CALL(set_socket_info)(mysql_socket.m_psi, NULL, addr, len);
+
+    if (locker != NULL)
+      PSI_CALL(end_socket_wait)(locker, (size_t)0);
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= bind(mysql_socket.fd, addr, len);
+  return result;
+}
+
+/** mysql_socket_getsockname */
+
+static inline int
+inline_mysql_socket_getsockname
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+ MYSQL_SOCKET mysql_socket, struct sockaddr *addr, socklen_t *len)
+{
+  int result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_BIND, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    result= getsockname(mysql_socket.fd, addr, len);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_socket_wait)(locker, (size_t)0);
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= getsockname(mysql_socket.fd, addr, len);
+
+  return result;
+}
+
+/** mysql_socket_connect */
+
+static inline int
+inline_mysql_socket_connect
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+ MYSQL_SOCKET mysql_socket, const struct sockaddr *addr, socklen_t len)
+{
+  int result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_CONNECT, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    result= connect(mysql_socket.fd, addr, len);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_socket_wait)(locker, (size_t)0);
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= connect(mysql_socket.fd, addr, len);
+
+  return result;
+}
+
+/** mysql_socket_getpeername */
+
+static inline int
+inline_mysql_socket_getpeername
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+ MYSQL_SOCKET mysql_socket, struct sockaddr *addr, socklen_t *len)
+{
+  int result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_BIND, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    result= getpeername(mysql_socket.fd, addr, len);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_socket_wait)(locker, (size_t)0);
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= getpeername(mysql_socket.fd, addr, len);
+
+  return result;
+}
+
+/** mysql_socket_send */
+
+static inline ssize_t
+inline_mysql_socket_send
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+ MYSQL_SOCKET mysql_socket, const SOCKBUF_T *buf, size_t n, int flags)
+{
+  ssize_t result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_SEND, n, src_file, src_line);
+
+    /* Instrumented code */
+    result= send(mysql_socket.fd, buf, n, flags);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+    {
+      size_t bytes_written;
+      bytes_written= (result > -1) ? result : 0;
+      PSI_CALL(end_socket_wait)(locker, bytes_written);
+    }
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= send(mysql_socket.fd, buf, n, flags);
+
+  return result;
+}
+
+/** mysql_socket_recv */
+
+static inline ssize_t
+inline_mysql_socket_recv
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+ MYSQL_SOCKET mysql_socket,  SOCKBUF_T *buf, size_t n, int flags)
+{
+  ssize_t result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_RECV, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    result= recv(mysql_socket.fd, buf, n, flags);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+    {
+      size_t bytes_read;
+      bytes_read= (result > -1) ? result : 0;
+      PSI_CALL(end_socket_wait)(locker, bytes_read);
+    }
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= recv(mysql_socket.fd, buf, n, flags);
+
+  return result;
+}
+
+/** mysql_socket_sendto */
+
+static inline ssize_t
+inline_mysql_socket_sendto
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+ MYSQL_SOCKET mysql_socket, const SOCKBUF_T *buf, size_t n, int flags, const struct sockaddr *addr, socklen_t addr_len)
+{
+  ssize_t result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_SEND, n, src_file, src_line);
+
+    /* Instrumented code */
+    result= sendto(mysql_socket.fd, buf, n, flags, addr, addr_len);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+    {
+      size_t bytes_written;
+      bytes_written = (result > -1) ? result : 0;
+      PSI_CALL(end_socket_wait)(locker, bytes_written);
+    }
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= sendto(mysql_socket.fd, buf, n, flags, addr, addr_len);
+
+  return result;
+}
+
+/** mysql_socket_recvfrom */
+
+static inline ssize_t
+inline_mysql_socket_recvfrom
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+ MYSQL_SOCKET mysql_socket, SOCKBUF_T *buf, size_t n, int flags,
+ struct sockaddr *addr, socklen_t *addr_len)
+{
+  ssize_t result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_RECV, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    result= recvfrom(mysql_socket.fd, buf, n, flags, addr, addr_len);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+    {
+      size_t bytes_read;
+      bytes_read = (result > -1) ? result : 0;
+      PSI_CALL(end_socket_wait)(locker, bytes_read);
+    }
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= recvfrom(mysql_socket.fd, buf, n, flags, addr, addr_len);
+
+  return result;
+}
+
+/** mysql_socket_getsockopt */
+
+static inline int
+inline_mysql_socket_getsockopt
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+ MYSQL_SOCKET mysql_socket, int level, int optname, SOCKBUF_T *optval, socklen_t *optlen)
+{
+  int result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_OPT, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    result= getsockopt(mysql_socket.fd, level, optname, optval, optlen);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_socket_wait)(locker, (size_t)0);
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= getsockopt(mysql_socket.fd, level, optname, optval, optlen);
+
+  return result;
+}
+
+/** mysql_socket_setsockopt */
+
+static inline int
+inline_mysql_socket_setsockopt
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+ MYSQL_SOCKET mysql_socket, int level, int optname, const SOCKBUF_T *optval,
+ socklen_t optlen)
+{
+  int result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_OPT, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    result= setsockopt(mysql_socket.fd, level, optname, optval, optlen);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_socket_wait)(locker, (size_t)0);
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= setsockopt(mysql_socket.fd, level, optname, optval, optlen);
+
+  return result;
+}
+
+/** mysql_socket_listen */
+
+static inline int
+inline_mysql_socket_listen
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+ MYSQL_SOCKET mysql_socket, int backlog)
+{
+  int result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_CONNECT, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    result= listen(mysql_socket.fd, backlog);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_socket_wait)(locker, (size_t)0);
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= listen(mysql_socket.fd, backlog);
+
+  return result;
+}
+
+/** mysql_socket_accept */
+
+static inline MYSQL_SOCKET
+inline_mysql_socket_accept
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line, PSI_socket_key key,
+#endif
+  MYSQL_SOCKET socket_listen, struct sockaddr *addr, socklen_t *addr_len)
+{
+  MYSQL_SOCKET socket_accept= MYSQL_INVALID_SOCKET;
+  socklen_t addr_length= (addr_len != NULL) ? *addr_len : 0;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (socket_listen.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, socket_listen.m_psi,
+                                        PSI_SOCKET_CONNECT, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    socket_accept.fd= accept(socket_listen.fd, addr, &addr_length);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_socket_wait)(locker, (size_t)0);
+  }
+  else
+#endif
+  {
+    /* Non instrumented code */
+    socket_accept.fd= accept(socket_listen.fd, addr, &addr_length);
+  }
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  /* Initialize the instrument with the new socket descriptor and address */
+  socket_accept.m_psi=
+            PSI_CALL(init_socket)(key, (const my_socket*)&socket_accept.fd);
+
+  /* FIXME: simplify this with just 1 call to init_socket(). */
+  if (socket_accept.m_psi != NULL)
+    PSI_CALL(set_socket_info)(socket_accept.m_psi, &socket_accept.fd, addr,
+                              addr_length);
+#endif
+
+  return socket_accept;
+}
+
+/** mysql_socket_close */
+
+static inline int
+inline_mysql_socket_close
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+  MYSQL_SOCKET mysql_socket)
+{
+  int result;
+
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    /* Instrumentation start */
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_CLOSE, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    result= closesocket(mysql_socket.fd);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_socket_wait)(locker, (size_t)0);
+    /* Remove the instrumentation for this socket. */
+    if (mysql_socket.m_psi != NULL)
+      PSI_CALL(destroy_socket)(mysql_socket.m_psi);
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= closesocket(mysql_socket.fd);
+
+  return result;
+}
+
+/** mysql_socket_shutdown */
+
+static inline int
+inline_mysql_socket_shutdown
+(
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  const char *src_file, uint src_line,
+#endif
+  MYSQL_SOCKET mysql_socket, int how)
+{
+  int result;
+
+  /* Instrumentation start */
+#ifdef HAVE_PSI_SOCKET_INTERFACE
+  if (mysql_socket.m_psi != NULL)
+  {
+    PSI_socket_locker *locker;
+    PSI_socket_locker_state state;
+    locker= PSI_CALL(start_socket_wait)(&state, mysql_socket.m_psi,
+                                        PSI_SOCKET_SHUTDOWN, (size_t)0, src_file, src_line);
+
+    /* Instrumented code */
+    result= shutdown(mysql_socket.fd, how);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_socket_wait)(locker, (size_t)0);
+
+    return result;
+  }
+#endif
+
+  /* Non instrumented code */
+  result= shutdown(mysql_socket.fd, how);
+
+  return result;
+}
+
+/** @} (end of group Socket_instrumentation) */
+
+#endif
+
diff --git a/include/mysql/psi/mysql_stage.h b/include/mysql/psi/mysql_stage.h
new file mode 100644
index 00000000000..dc44e9b0bed
--- /dev/null
+++ b/include/mysql/psi/mysql_stage.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.   
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef MYSQL_STAGE_H
+#define MYSQL_STAGE_H
+
+/**
+  @file mysql/psi/mysql_stage.h
+  Instrumentation helpers for stages.
+*/
+
+#include "mysql/psi/psi.h"
+
+/**
+  @defgroup Stage_instrumentation Stage Instrumentation
+  @ingroup Instrumentation_interface
+  @{
+*/
+
+/**
+  @def mysql_stage_register(P1, P2, P3)
+  Stage registration.
+*/
+#ifdef HAVE_PSI_STAGE_INTERFACE
+#define mysql_stage_register(P1, P2, P3) \
+  inline_mysql_stage_register(P1, P2, P3)
+#else
+#define mysql_stage_register(P1, P2, P3) \
+  do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+  #define MYSQL_SET_STAGE(K, F, L) \
+    inline_mysql_set_stage(K, F, L)
+#else
+  #define MYSQL_SET_STAGE(K, F, L) \
+    do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+static inline void inline_mysql_stage_register(
+  const char *category, PSI_stage_info **info, int count)
+{
+  PSI_CALL(register_stage)(category, info, count);
+}
+#endif
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+static inline void
+inline_mysql_set_stage(PSI_stage_key key,
+                       const char *src_file, int src_line)
+{
+  PSI_CALL(start_stage)(key, src_file, src_line);
+}
+#endif
+
+/** @} (end of group Stage_instrumentation) */
+
+#endif
+
diff --git a/include/mysql/psi/mysql_statement.h b/include/mysql/psi/mysql_statement.h
new file mode 100644
index 00000000000..1b065065e57
--- /dev/null
+++ b/include/mysql/psi/mysql_statement.h
@@ -0,0 +1,229 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.   
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef MYSQL_STATEMENT_H
+#define MYSQL_STATEMENT_H
+
+/**
+  @file mysql/psi/mysql_statement.h
+  Instrumentation helpers for statements.
+*/
+
+#include "mysql/psi/psi.h"
+
+/**
+  @defgroup Statement_instrumentation Statement Instrumentation
+  @ingroup Instrumentation_interface
+  @{
+*/
+
+/**
+  @def mysql_statement_register(P1, P2, P3)
+  Statement registration.
+*/
+#ifdef HAVE_PSI_STATEMENT_INTERFACE
+#define mysql_statement_register(P1, P2, P3) \
+  inline_mysql_statement_register(P1, P2, P3)
+#else
+#define mysql_statement_register(P1, P2, P3) \
+  do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_INTERFACE
+#ifdef HAVE_PSI_STATEMENT_DIGEST_INTERFACE
+  #define MYSQL_DIGEST_START(LOCKER) \
+    inline_mysql_digest_start(LOCKER)
+#else
+  #define MYSQL_DIGEST_START(LOCKER) \
+    NULL
+#endif
+#else
+  #define MYSQL_DIGEST_START(LOCKER) \
+    NULL
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_DIGEST_INTERFACE
+  #define MYSQL_ADD_TOKEN(LOCKER, T, Y) \
+    inline_mysql_add_token(LOCKER, T, Y)
+#else
+  #define MYSQL_ADD_TOKEN(LOCKER, T, Y) \
+    NULL
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_INTERFACE
+  #define MYSQL_START_STATEMENT(STATE, K, DB, DB_LEN) \
+    inline_mysql_start_statement(STATE, K, DB, DB_LEN, __FILE__, __LINE__)
+#else
+  #define MYSQL_START_STATEMENT(STATE, K, DB, DB_LEN) \
+    NULL
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_INTERFACE
+  #define MYSQL_REFINE_STATEMENT(LOCKER, K) \
+    inline_mysql_refine_statement(LOCKER, K)
+#else
+  #define MYSQL_REFINE_STATEMENT(LOCKER, K) \
+    NULL
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_INTERFACE
+  #define MYSQL_SET_STATEMENT_TEXT(LOCKER, P1, P2) \
+    inline_mysql_set_statement_text(LOCKER, P1, P2)
+#else
+  #define MYSQL_SET_STATEMENT_TEXT(LOCKER, P1, P2) \
+    do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_INTERFACE
+  #define MYSQL_SET_STATEMENT_LOCK_TIME(LOCKER, P1) \
+    inline_mysql_set_statement_lock_time(LOCKER, P1)
+#else
+  #define MYSQL_SET_STATEMENT_LOCK_TIME(LOCKER, P1) \
+    do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_INTERFACE
+  #define MYSQL_SET_STATEMENT_ROWS_SENT(LOCKER, P1) \
+    inline_mysql_set_statement_rows_sent(LOCKER, P1)
+#else
+  #define MYSQL_SET_STATEMENT_ROWS_SENT(LOCKER, P1) \
+    do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_INTERFACE
+  #define MYSQL_SET_STATEMENT_ROWS_EXAMINED(LOCKER, P1) \
+    inline_mysql_set_statement_rows_examined(LOCKER, P1)
+#else
+  #define MYSQL_SET_STATEMENT_ROWS_EXAMINED(LOCKER, P1) \
+    do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_INTERFACE
+  #define MYSQL_END_STATEMENT(LOCKER, DA) \
+    inline_mysql_end_statement(LOCKER, DA)
+#else
+  #define MYSQL_END_STATEMENT(LOCKER, DA) \
+    do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_INTERFACE
+static inline void inline_mysql_statement_register(
+  const char *category, PSI_statement_info *info, int count)
+{
+  PSI_CALL(register_statement)(category, info, count);
+}
+
+#ifdef HAVE_PSI_STATEMENT_DIGEST_INTERFACE
+static inline struct PSI_digest_locker *
+inline_mysql_digest_start(PSI_statement_locker *locker)
+{
+  PSI_digest_locker* digest_locker= NULL;
+
+  if (likely(locker != NULL))
+    digest_locker= PSI_CALL(digest_start)(locker);
+  return digest_locker;
+}
+#endif
+
+#ifdef HAVE_PSI_STATEMENT_DIGEST_INTERFACE
+static inline struct PSI_digest_locker *
+inline_mysql_add_token(PSI_digest_locker *locker, uint token,
+                       void *yylval)
+{
+  if (likely(locker != NULL))
+    locker= PSI_CALL(digest_add_token)(locker, token,
+                                      (OPAQUE_LEX_YYSTYPE*)yylval);
+  return locker;
+}
+#endif
+
+static inline struct PSI_statement_locker *
+inline_mysql_start_statement(PSI_statement_locker_state *state,
+                             PSI_statement_key key,
+                             const char *db, uint db_len,
+                             const char *src_file, int src_line)
+{
+  PSI_statement_locker *locker;
+  locker= PSI_CALL(get_thread_statement_locker)(state, key);
+  if (likely(locker != NULL))
+    PSI_CALL(start_statement)(locker, db, db_len, src_file, src_line);
+  return locker;
+}
+
+static inline struct PSI_statement_locker *
+inline_mysql_refine_statement(PSI_statement_locker *locker,
+                              PSI_statement_key key)
+{
+  if (likely(locker != NULL))
+  {
+    locker= PSI_CALL(refine_statement)(locker, key);
+  }
+  return locker;
+}
+
+static inline void
+inline_mysql_set_statement_text(PSI_statement_locker *locker,
+                                const char *text, uint text_len)
+{
+  if (likely(locker != NULL))
+  {
+    PSI_CALL(set_statement_text)(locker, text, text_len);
+  }
+}
+
+static inline void
+inline_mysql_set_statement_lock_time(PSI_statement_locker *locker,
+                                     ulonglong count)
+{
+  if (likely(locker != NULL))
+  {
+    PSI_CALL(set_statement_lock_time)(locker, count);
+  }
+}
+
+static inline void
+inline_mysql_set_statement_rows_sent(PSI_statement_locker *locker,
+                                     ulonglong count)
+{
+  if (likely(locker != NULL))
+  {
+    PSI_CALL(set_statement_rows_sent)(locker, count);
+  }
+}
+
+static inline void
+inline_mysql_set_statement_rows_examined(PSI_statement_locker *locker,
+                                         ulonglong count)
+{
+  if (likely(locker != NULL))
+  {
+    PSI_CALL(set_statement_rows_examined)(locker, count);
+  }
+}
+
+static inline void
+inline_mysql_end_statement(struct PSI_statement_locker *locker,
+                           Diagnostics_area *stmt_da)
+{
+  PSI_CALL(end_stage)();
+  if (likely(locker != NULL))
+    PSI_CALL(end_statement)(locker, stmt_da);
+}
+#endif
+
+/** @} (end of group Statement_instrumentation) */
+
+#endif
+
diff --git a/include/mysql/psi/mysql_table.h b/include/mysql/psi/mysql_table.h
new file mode 100644
index 00000000000..1796943096e
--- /dev/null
+++ b/include/mysql/psi/mysql_table.h
@@ -0,0 +1,188 @@
+/* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef MYSQL_TABLE_H
+#define MYSQL_TABLE_H
+
+/**
+  @file mysql/psi/mysql_table.h
+  Instrumentation helpers for table io.
+*/
+
+#include "mysql/psi/psi.h"
+
+/**
+  @defgroup Table_instrumentation Table Instrumentation
+  @ingroup Instrumentation_interface
+  @{
+*/
+
+/**
+  @def MYSQL_TABLE_WAIT_VARIABLES
+  Instrumentation helper for table waits.
+  This instrumentation declares local variables.
+  Do not use a ';' after this macro
+  @param LOCKER the locker
+  @param STATE the locker state
+  @sa MYSQL_START_TABLE_IO_WAIT.
+  @sa MYSQL_END_TABLE_IO_WAIT.
+  @sa MYSQL_START_TABLE_LOCK_WAIT.
+  @sa MYSQL_END_TABLE_LOCK_WAIT.
+*/
+#ifdef HAVE_PSI_TABLE_INTERFACE
+  #define MYSQL_TABLE_WAIT_VARIABLES(LOCKER, STATE) \
+    struct PSI_table_locker* LOCKER; \
+    PSI_table_locker_state STATE;
+#else
+  #define MYSQL_TABLE_WAIT_VARIABLES(LOCKER, STATE)
+#endif
+
+/**
+  @def MYSQL_TABLE_IO_WAIT
+  Instrumentation helper for table io_waits.
+  This instrumentation marks the start of a wait event.
+  @param PSI the instrumented table
+  @param OP the table operation to be performed
+  @param INDEX the table index used if any, or MAY_KEY.
+  @param FLAGS per table operation flags.
+  @sa MYSQL_END_TABLE_WAIT.
+*/
+#ifdef HAVE_PSI_TABLE_INTERFACE
+  #define MYSQL_TABLE_IO_WAIT(PSI, OP, INDEX, FLAGS, PAYLOAD)          \
+    {                                                                  \
+      if (PSI != NULL)                                                 \
+      {                                                                \
+        PSI_table_locker *locker;                                      \
+        PSI_table_locker_state state;                                  \
+        locker= PSI_CALL(start_table_io_wait)(& state, PSI, OP, INDEX, \
+                                              __FILE__, __LINE__);     \
+        PAYLOAD                                                        \
+        if (locker != NULL)                                            \
+          PSI_CALL(end_table_io_wait)(locker);                         \
+      }                                                                \
+      else                                                             \
+      {                                                                \
+        PAYLOAD                                                        \
+      }                                                                \
+    }
+#else
+  #define MYSQL_TABLE_IO_WAIT(PSI, OP, INDEX, FLAGS, PAYLOAD) \
+    PAYLOAD
+#endif
+
+/**
+  @def MYSQL_TABLE_LOCK_WAIT
+  Instrumentation helper for table io_waits.
+  This instrumentation marks the start of a wait event.
+  @param PSI the instrumented table
+  @param OP the table operation to be performed
+  @param INDEX the table index used if any, or MAY_KEY.
+  @param FLAGS per table operation flags.
+  @sa MYSQL_END_TABLE_WAIT.
+*/
+#ifdef HAVE_PSI_TABLE_INTERFACE
+  #define MYSQL_TABLE_LOCK_WAIT(PSI, OP, FLAGS, PAYLOAD)                 \
+    {                                                                    \
+      if (PSI != NULL)                                                   \
+      {                                                                  \
+        PSI_table_locker *locker;                                        \
+        PSI_table_locker_state state;                                    \
+        locker= PSI_CALL(start_table_lock_wait)(& state, PSI, OP, FLAGS, \
+                                                __FILE__, __LINE__);     \
+        PAYLOAD                                                          \
+        if (locker != NULL)                                              \
+          PSI_CALL(end_table_lock_wait)(locker);                         \
+      }                                                                  \
+      else                                                               \
+      {                                                                  \
+        PAYLOAD                                                          \
+      }                                                                  \
+    }
+#else
+  #define MYSQL_TABLE_LOCK_WAIT(PSI, OP, FLAGS, PAYLOAD) \
+    PAYLOAD
+#endif
+
+/**
+  @def MYSQL_START_TABLE_LOCK_WAIT
+  Instrumentation helper for table lock waits.
+  This instrumentation marks the start of a wait event.
+  @param LOCKER the locker
+  @param STATE the locker state
+  @param PSI the instrumented table
+  @param OP the table operation to be performed
+  @param FLAGS per table operation flags.
+  @sa MYSQL_END_TABLE_LOCK_WAIT.
+*/
+#ifdef HAVE_PSI_TABLE_INTERFACE
+  #define MYSQL_START_TABLE_LOCK_WAIT(LOCKER, STATE, PSI, OP, FLAGS) \
+    LOCKER= inline_mysql_start_table_lock_wait(STATE, PSI, \
+                                               OP, FLAGS, __FILE__, __LINE__)
+#else
+  #define MYSQL_START_TABLE_LOCK_WAIT(LOCKER, STATE, PSI, OP, FLAGS) \
+    do {} while (0)
+#endif
+
+/**
+  @def MYSQL_END_TABLE_LOCK_WAIT
+  Instrumentation helper for table lock waits.
+  This instrumentation marks the end of a wait event.
+  @param LOCKER the locker
+  @sa MYSQL_START_TABLE_LOCK_WAIT.
+*/
+#ifdef HAVE_PSI_TABLE_INTERFACE
+  #define MYSQL_END_TABLE_LOCK_WAIT(LOCKER) \
+    inline_mysql_end_table_lock_wait(LOCKER)
+#else
+  #define MYSQL_END_TABLE_LOCK_WAIT(LOCKER) \
+    do {} while (0)
+#endif
+
+#ifdef HAVE_PSI_TABLE_INTERFACE
+/**
+  Instrumentation calls for MYSQL_START_TABLE_LOCK_WAIT.
+  @sa MYSQL_END_TABLE_LOCK_WAIT.
+*/
+static inline struct PSI_table_locker *
+inline_mysql_start_table_lock_wait(PSI_table_locker_state *state,
+                                   struct PSI_table *psi,
+                                   enum PSI_table_lock_operation op,
+                                   ulong flags, const char *src_file, int src_line)
+{
+  if (psi != NULL)
+  {
+    struct PSI_table_locker *locker;
+    locker= PSI_CALL(start_table_lock_wait)(state, psi, op, flags, src_file, src_line);
+    return locker;
+  }
+  return NULL;
+}
+
+/**
+  Instrumentation calls for MYSQL_END_TABLE_LOCK_WAIT.
+  @sa MYSQL_START_TABLE_LOCK_WAIT.
+*/
+static inline void
+inline_mysql_end_table_lock_wait(struct PSI_table_locker *locker)
+{
+  if (locker != NULL)
+    PSI_CALL(end_table_lock_wait)(locker);
+}
+#endif
+
+/** @} (end of group Table_instrumentation) */
+
+#endif
+
diff --git a/include/mysql/psi/mysql_thread.h b/include/mysql/psi/mysql_thread.h
index 18b4fde8c5c..78175196fa2 100644
--- a/include/mysql/psi/mysql_thread.h
+++ b/include/mysql/psi/mysql_thread.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2012, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -69,7 +69,13 @@
 struct st_mysql_mutex
 {
   /** The real mutex. */
+#ifdef SAFE_MUTEX
+  safe_mutex_t m_mutex;
+#elif defined(MY_PTHREAD_FASTMUTEX)
+  my_pthread_fastmutex_t m_mutex;
+#else
   pthread_mutex_t m_mutex;
+#endif
   /**
     The instrumentation hook.
     Note that this hook is not conditionally defined,
@@ -225,6 +231,13 @@ typedef struct st_mysql_cond mysql_cond_t;
   rw_pr_lock_assert_not_write_owner(&(M)->m_prlock)
 
 /**
+  @def mysql_mutex_register(P1, P2, P3)
+  Mutex registration.
+*/
+#define mysql_mutex_register(P1, P2, P3) \
+  inline_mysql_mutex_register(P1, P2, P3)
+
+/**
   @def mysql_mutex_init(K, M, A)
   Instrumented mutex_init.
   @c mysql_mutex_init is a replacement for @c pthread_mutex_init.
@@ -233,7 +246,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @param A Mutex attributes
 */
 
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_MUTEX_INTERFACE
   #ifdef SAFE_MUTEX
     #define mysql_mutex_init(K, M, A) \
       inline_mysql_mutex_init(K, M, A, #M, __FILE__, __LINE__)
@@ -272,7 +285,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @param M The mutex to lock
 */
 
-#if defined(SAFE_MUTEX) || defined (HAVE_PSI_INTERFACE)
+#if defined(SAFE_MUTEX) || defined (HAVE_PSI_MUTEX_INTERFACE)
   #define mysql_mutex_lock(M) \
     inline_mysql_mutex_lock(M, __FILE__, __LINE__)
 #else
@@ -287,7 +300,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   for @c pthread_mutex_trylock.
 */
 
-#if defined(SAFE_MUTEX) || defined (HAVE_PSI_INTERFACE)
+#if defined(SAFE_MUTEX) || defined (HAVE_PSI_MUTEX_INTERFACE)
   #define mysql_mutex_trylock(M) \
     inline_mysql_mutex_trylock(M, __FILE__, __LINE__)
 #else
@@ -309,6 +322,13 @@ typedef struct st_mysql_cond mysql_cond_t;
 #endif
 
 /**
+  @def mysql_rwlock_register(P1, P2, P3)
+  Rwlock registration.
+*/
+#define mysql_rwlock_register(P1, P2, P3) \
+  inline_mysql_rwlock_register(P1, P2, P3)
+
+/**
   @def mysql_rwlock_init(K, RW)
   Instrumented rwlock_init.
   @c mysql_rwlock_init is a replacement for @c pthread_rwlock_init.
@@ -316,7 +336,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @param K The PSI_rwlock_key for this instrumented rwlock
   @param RW The rwlock to initialize
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   #define mysql_rwlock_init(K, RW) inline_mysql_rwlock_init(K, RW)
 #else
   #define mysql_rwlock_init(K, RW) inline_mysql_rwlock_init(RW)
@@ -329,7 +349,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @param K The PSI_rwlock_key for this instrumented prlock
   @param RW The prlock to initialize
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   #define mysql_prlock_init(K, RW) inline_mysql_prlock_init(K, RW)
 #else
   #define mysql_prlock_init(K, RW) inline_mysql_prlock_init(RW)
@@ -357,7 +377,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @c mysql_rwlock_rdlock is a drop-in replacement
   for @c pthread_rwlock_rdlock.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   #define mysql_rwlock_rdlock(RW) \
     inline_mysql_rwlock_rdlock(RW, __FILE__, __LINE__)
 #else
@@ -371,7 +391,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @c mysql_prlock_rdlock is a drop-in replacement
   for @c rw_pr_rdlock.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   #define mysql_prlock_rdlock(RW) \
     inline_mysql_prlock_rdlock(RW, __FILE__, __LINE__)
 #else
@@ -385,7 +405,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @c mysql_rwlock_wrlock is a drop-in replacement
   for @c pthread_rwlock_wrlock.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   #define mysql_rwlock_wrlock(RW) \
     inline_mysql_rwlock_wrlock(RW, __FILE__, __LINE__)
 #else
@@ -399,7 +419,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @c mysql_prlock_wrlock is a drop-in replacement
   for @c rw_pr_wrlock.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   #define mysql_prlock_wrlock(RW) \
     inline_mysql_prlock_wrlock(RW, __FILE__, __LINE__)
 #else
@@ -413,7 +433,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @c mysql_rwlock_tryrdlock is a drop-in replacement
   for @c pthread_rwlock_tryrdlock.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   #define mysql_rwlock_tryrdlock(RW) \
     inline_mysql_rwlock_tryrdlock(RW, __FILE__, __LINE__)
 #else
@@ -427,7 +447,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @c mysql_rwlock_trywrlock is a drop-in replacement
   for @c pthread_rwlock_trywrlock.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   #define mysql_rwlock_trywrlock(RW) \
     inline_mysql_rwlock_trywrlock(RW, __FILE__, __LINE__)
 #else
@@ -452,6 +472,13 @@ typedef struct st_mysql_cond mysql_cond_t;
 #define mysql_prlock_unlock(RW) inline_mysql_prlock_unlock(RW)
 
 /**
+  @def mysql_cond_register(P1, P2, P3)
+  Cond registration.
+*/
+#define mysql_cond_register(P1, P2, P3) \
+  inline_mysql_cond_register(P1, P2, P3)
+
+/**
   @def mysql_cond_init(K, C, A)
   Instrumented cond_init.
   @c mysql_cond_init is a replacement for @c pthread_cond_init.
@@ -459,7 +486,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @param K The PSI_cond_key for this instrumented cond
   @param A Condition attributes
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_COND_INTERFACE
   #define mysql_cond_init(K, C, A) inline_mysql_cond_init(K, C, A)
 #else
   #define mysql_cond_init(K, C, A) inline_mysql_cond_init(C, A)
@@ -477,7 +504,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   Instrumented cond_wait.
   @c mysql_cond_wait is a drop-in replacement for @c pthread_cond_wait.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_COND_INTERFACE
   #define mysql_cond_wait(C, M) \
     inline_mysql_cond_wait(C, M, __FILE__, __LINE__)
 #else
@@ -491,7 +518,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @c mysql_cond_timedwait is a drop-in replacement
   for @c pthread_cond_timedwait.
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_COND_INTERFACE
   #define mysql_cond_timedwait(C, M, W) \
     inline_mysql_cond_timedwait(C, M, W, __FILE__, __LINE__)
 #else
@@ -514,6 +541,12 @@ typedef struct st_mysql_cond mysql_cond_t;
 */
 #define mysql_cond_broadcast(C) inline_mysql_cond_broadcast(C)
 
+/**
+  @def mysql_thread_register(P1, P2, P3)
+  Thread registration.
+*/
+#define mysql_thread_register(P1, P2, P3) \
+  inline_mysql_thread_register(P1, P2, P3)
 
 /**
   @def mysql_thread_create(K, P1, P2, P3, P4)
@@ -532,7 +565,7 @@ typedef struct st_mysql_cond mysql_cond_t;
   @param P3 pthread_create parameter 3
   @param P4 pthread_create parameter 4
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_THREAD_INTERFACE
   #define mysql_thread_create(K, P1, P2, P3, P4) \
     inline_mysql_thread_create(K, P1, P2, P3, P4)
 #else
@@ -545,14 +578,31 @@ typedef struct st_mysql_cond mysql_cond_t;
   Set the thread indentifier for the instrumentation.
   @param I The thread identifier
 */
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_THREAD_INTERFACE
   #define mysql_thread_set_psi_id(I) inline_mysql_thread_set_psi_id(I)
 #else
   #define mysql_thread_set_psi_id(I) do {} while (0)
 #endif
 
+static inline void inline_mysql_mutex_register(
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+  const char *category,
+  PSI_mutex_info *info,
+  int count
+#else
+  const char *category __attribute__ ((unused)),
+  void *info __attribute__ ((unused)),
+  int count __attribute__ ((unused))
+#endif
+)
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+  PSI_CALL(register_mutex)(category, info, count);
+#endif
+}
+
 static inline int inline_mysql_mutex_init(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_MUTEX_INTERFACE
   PSI_mutex_key key,
 #endif
   mysql_mutex_t *that,
@@ -562,14 +612,15 @@ static inline int inline_mysql_mutex_init(
 #endif
   )
 {
-#ifdef HAVE_PSI_INTERFACE
-  that->m_psi= PSI_server ? PSI_server->init_mutex(key, &that->m_mutex)
-                          : NULL;
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+  that->m_psi= PSI_CALL(init_mutex)(key, &that->m_mutex);
 #else
   that->m_psi= NULL;
 #endif
 #ifdef SAFE_MUTEX
   return safe_mutex_init(&that->m_mutex, attr, src_name, src_file, src_line);
+#elif defined(MY_PTHREAD_FASTMUTEX)
+  return my_pthread_fastmutex_init(&that->m_mutex, attr);
 #else
   return pthread_mutex_init(&that->m_mutex, attr);
 #endif
@@ -582,15 +633,17 @@ static inline int inline_mysql_mutex_destroy(
 #endif
   )
 {
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(PSI_server && that->m_psi))
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+  if (that->m_psi != NULL)
   {
-    PSI_server->destroy_mutex(that->m_psi);
+    PSI_CALL(destroy_mutex)(that->m_psi);
     that->m_psi= NULL;
   }
 #endif
 #ifdef SAFE_MUTEX
   return safe_mutex_destroy(&that->m_mutex, src_file, src_line);
+#elif defined(MY_PTHREAD_FASTMUTEX)
+  return pthread_mutex_destroy(&that->m_mutex.mutex);
 #else
   return pthread_mutex_destroy(&that->m_mutex);
 #endif
@@ -598,61 +651,95 @@ static inline int inline_mysql_mutex_destroy(
 
 static inline int inline_mysql_mutex_lock(
   mysql_mutex_t *that
-#if defined(SAFE_MUTEX) || defined (HAVE_PSI_INTERFACE)
+#if defined(SAFE_MUTEX) || defined (HAVE_PSI_MUTEX_INTERFACE)
   , const char *src_file, uint src_line
 #endif
   )
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_mutex_locker *locker= NULL;
-  PSI_mutex_locker_state state;
-  if (likely(PSI_server && that->m_psi))
+
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+  if (that->m_psi != NULL)
   {
-    locker= PSI_server->get_thread_mutex_locker(&state, that->m_psi, PSI_MUTEX_LOCK);
-    if (likely(locker != NULL))
-      PSI_server->start_mutex_wait(locker, src_file, src_line);
+    /* Instrumentation start */
+    PSI_mutex_locker *locker;
+    PSI_mutex_locker_state state;
+    locker= PSI_CALL(start_mutex_wait)(&state, that->m_psi,
+                                       PSI_MUTEX_LOCK, src_file, src_line);
+
+    /* Instrumented code */
+#ifdef SAFE_MUTEX
+    result= safe_mutex_lock(&that->m_mutex, FALSE, src_file, src_line);
+#elif defined(MY_PTHREAD_FASTMUTEX)
+    result= my_pthread_fastmutex_lock(&that->m_mutex);
+#else
+    result= pthread_mutex_lock(&that->m_mutex);
+#endif
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_mutex_wait)(locker, result);
+
+    return result;
   }
 #endif
+
+  /* Non instrumented code */
 #ifdef SAFE_MUTEX
   result= safe_mutex_lock(&that->m_mutex, FALSE, src_file, src_line);
+#elif defined(MY_PTHREAD_FASTMUTEX)
+  result= my_pthread_fastmutex_lock(&that->m_mutex);
 #else
   result= pthread_mutex_lock(&that->m_mutex);
 #endif
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_mutex_wait(locker, result);
-#endif
+
   return result;
 }
 
 static inline int inline_mysql_mutex_trylock(
   mysql_mutex_t *that
-#if defined(SAFE_MUTEX) || defined (HAVE_PSI_INTERFACE)
+#if defined(SAFE_MUTEX) || defined (HAVE_PSI_MUTEX_INTERFACE)
   , const char *src_file, uint src_line
 #endif
   )
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_mutex_locker *locker= NULL;
-  PSI_mutex_locker_state state;
-  if (likely(PSI_server && that->m_psi))
+
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+  if (that->m_psi != NULL)
   {
-    locker= PSI_server->get_thread_mutex_locker(&state, that->m_psi, PSI_MUTEX_TRYLOCK);
-    if (likely(locker != NULL))
-      PSI_server->start_mutex_wait(locker, src_file, src_line);
+    /* Instrumentation start */
+    PSI_mutex_locker *locker;
+    PSI_mutex_locker_state state;
+    locker= PSI_CALL(start_mutex_wait)(&state, that->m_psi,
+                                       PSI_MUTEX_TRYLOCK, src_file, src_line);
+
+    /* Instrumented code */
+#ifdef SAFE_MUTEX
+    result= safe_mutex_lock(&that->m_mutex, TRUE, src_file, src_line);
+#elif defined(MY_PTHREAD_FASTMUTEX)
+    result= pthread_mutex_trylock(&that->m_mutex.mutex);
+#else
+    result= pthread_mutex_trylock(&that->m_mutex);
+#endif
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_mutex_wait)(locker, result);
+
+    return result;
   }
 #endif
+
+  /* Non instrumented code */
 #ifdef SAFE_MUTEX
   result= safe_mutex_lock(&that->m_mutex, TRUE, src_file, src_line);
+#elif defined(MY_PTHREAD_FASTMUTEX)
+  result= pthread_mutex_trylock(&that->m_mutex.mutex);
 #else
   result= pthread_mutex_trylock(&that->m_mutex);
 #endif
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_mutex_wait(locker, result);
-#endif
+
   return result;
 }
 
@@ -664,27 +751,48 @@ static inline int inline_mysql_mutex_unlock(
   )
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(PSI_server && that->m_psi))
-    PSI_server->unlock_mutex(that->m_psi);
+
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+  if (that->m_psi != NULL)
+    PSI_CALL(unlock_mutex)(that->m_psi);
 #endif
+
 #ifdef SAFE_MUTEX
   result= safe_mutex_unlock(&that->m_mutex, src_file, src_line);
+#elif defined(MY_PTHREAD_FASTMUTEX)
+  result= pthread_mutex_unlock(&that->m_mutex.mutex);
 #else
   result= pthread_mutex_unlock(&that->m_mutex);
 #endif
+
   return result;
 }
 
+static inline void inline_mysql_rwlock_register(
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  const char *category,
+  PSI_rwlock_info *info,
+  int count
+#else
+  const char *category __attribute__ ((unused)),
+  void *info __attribute__ ((unused)),
+  int count __attribute__ ((unused))
+#endif
+)
+{
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  PSI_CALL(register_rwlock)(category, info, count);
+#endif
+}
+
 static inline int inline_mysql_rwlock_init(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   PSI_rwlock_key key,
 #endif
   mysql_rwlock_t *that)
 {
-#ifdef HAVE_PSI_INTERFACE
-  that->m_psi= (PSI_server ? PSI_server->init_rwlock(key, &that->m_rwlock)
-                           : NULL);
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  that->m_psi= PSI_CALL(init_rwlock)(key, &that->m_rwlock);
 #else
   that->m_psi= NULL;
 #endif
@@ -696,14 +804,13 @@ static inline int inline_mysql_rwlock_init(
 
 #ifndef DISABLE_MYSQL_PRLOCK_H
 static inline int inline_mysql_prlock_init(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   PSI_rwlock_key key,
 #endif
   mysql_prlock_t *that)
 {
-#ifdef HAVE_PSI_INTERFACE
-  that->m_psi= (PSI_server ? PSI_server->init_rwlock(key, &that->m_prlock)
-                           : NULL);
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  that->m_psi= PSI_CALL(init_rwlock)(key, &that->m_prlock);
 #else
   that->m_psi= NULL;
 #endif
@@ -714,10 +821,10 @@ static inline int inline_mysql_prlock_init(
 static inline int inline_mysql_rwlock_destroy(
   mysql_rwlock_t *that)
 {
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(PSI_server && that->m_psi))
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  if (that->m_psi != NULL)
   {
-    PSI_server->destroy_rwlock(that->m_psi);
+    PSI_CALL(destroy_rwlock)(that->m_psi);
     that->m_psi= NULL;
   }
 #endif
@@ -728,10 +835,10 @@ static inline int inline_mysql_rwlock_destroy(
 static inline int inline_mysql_prlock_destroy(
   mysql_prlock_t *that)
 {
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(PSI_server && that->m_psi))
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  if (that->m_psi != NULL)
   {
-    PSI_server->destroy_rwlock(that->m_psi);
+    PSI_CALL(destroy_rwlock)(that->m_psi);
     that->m_psi= NULL;
   }
 #endif
@@ -741,167 +848,215 @@ static inline int inline_mysql_prlock_destroy(
 
 static inline int inline_mysql_rwlock_rdlock(
   mysql_rwlock_t *that
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   , const char *src_file, uint src_line
 #endif
   )
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_rwlock_locker *locker= NULL;
-  PSI_rwlock_locker_state state;
-  if (likely(PSI_server && that->m_psi))
+
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  if (that->m_psi != NULL)
   {
-    locker= PSI_server->get_thread_rwlock_locker(&state, that->m_psi,
-                                                 PSI_RWLOCK_READLOCK);
-    if (likely(locker != NULL))
-      PSI_server->start_rwlock_rdwait(locker, src_file, src_line);
+    /* Instrumentation start */
+    PSI_rwlock_locker *locker;
+    PSI_rwlock_locker_state state;
+    locker= PSI_CALL(start_rwlock_rdwait)(&state, that->m_psi,
+                                          PSI_RWLOCK_READLOCK, src_file, src_line);
+
+    /* Instrumented code */
+    result= rw_rdlock(&that->m_rwlock);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_rwlock_rdwait)(locker, result);
+
+    return result;
   }
 #endif
+
+  /* Non instrumented code */
   result= rw_rdlock(&that->m_rwlock);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_rwlock_rdwait(locker, result);
-#endif
+
   return result;
 }
 
 #ifndef DISABLE_MYSQL_PRLOCK_H
 static inline int inline_mysql_prlock_rdlock(
   mysql_prlock_t *that
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   , const char *src_file, uint src_line
 #endif
   )
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_rwlock_locker *locker= NULL;
-  PSI_rwlock_locker_state state;
-  if (likely(PSI_server && that->m_psi))
+
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  if (that->m_psi != NULL)
   {
-    locker= PSI_server->get_thread_rwlock_locker(&state, that->m_psi,
-                                                 PSI_RWLOCK_READLOCK);
-    if (likely(locker != NULL))
-      PSI_server->start_rwlock_rdwait(locker, src_file, src_line);
+    /* Instrumentation start */
+    PSI_rwlock_locker *locker;
+    PSI_rwlock_locker_state state;
+    locker= PSI_CALL(start_rwlock_rdwait)(&state, that->m_psi,
+                                          PSI_RWLOCK_READLOCK, src_file, src_line);
+
+    /* Instrumented code */
+    result= rw_pr_rdlock(&that->m_prlock);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_rwlock_rdwait)(locker, result);
+
+    return result;
   }
 #endif
+
+  /* Non instrumented code */
   result= rw_pr_rdlock(&that->m_prlock);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_rwlock_rdwait(locker, result);
-#endif
+
   return result;
 }
 #endif
 
 static inline int inline_mysql_rwlock_wrlock(
   mysql_rwlock_t *that
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   , const char *src_file, uint src_line
 #endif
   )
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_rwlock_locker *locker= NULL;
-  PSI_rwlock_locker_state state;
-  if (likely(PSI_server && that->m_psi))
+
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  if (that->m_psi != NULL)
   {
-    locker= PSI_server->get_thread_rwlock_locker(&state, that->m_psi,
-                                                 PSI_RWLOCK_WRITELOCK);
-    if (likely(locker != NULL))
-      PSI_server->start_rwlock_wrwait(locker, src_file, src_line);
+    /* Instrumentation start */
+    PSI_rwlock_locker *locker;
+    PSI_rwlock_locker_state state;
+    locker= PSI_CALL(start_rwlock_wrwait)(&state, that->m_psi,
+                                          PSI_RWLOCK_WRITELOCK, src_file, src_line);
+
+    /* Instrumented code */
+    result= rw_wrlock(&that->m_rwlock);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_rwlock_wrwait)(locker, result);
+
+    return result;
   }
 #endif
+
+  /* Non instrumented code */
   result= rw_wrlock(&that->m_rwlock);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_rwlock_wrwait(locker, result);
-#endif
+
   return result;
 }
 
 #ifndef DISABLE_MYSQL_PRLOCK_H
 static inline int inline_mysql_prlock_wrlock(
   mysql_prlock_t *that
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   , const char *src_file, uint src_line
 #endif
   )
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_rwlock_locker *locker= NULL;
-  PSI_rwlock_locker_state state;
-  if (likely(PSI_server && that->m_psi))
+
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  if (that->m_psi != NULL)
   {
-    locker= PSI_server->get_thread_rwlock_locker(&state, that->m_psi,
-                                                 PSI_RWLOCK_WRITELOCK);
-    if (likely(locker != NULL))
-      PSI_server->start_rwlock_wrwait(locker, src_file, src_line);
+    /* Instrumentation start */
+    PSI_rwlock_locker *locker;
+    PSI_rwlock_locker_state state;
+    locker= PSI_CALL(start_rwlock_wrwait)(&state, that->m_psi,
+                                          PSI_RWLOCK_WRITELOCK, src_file, src_line);
+
+    /* Instrumented code */
+    result= rw_pr_wrlock(&that->m_prlock);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_rwlock_wrwait)(locker, result);
+
+    return result;
   }
 #endif
+
+  /* Non instrumented code */
   result= rw_pr_wrlock(&that->m_prlock);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_rwlock_wrwait(locker, result);
-#endif
+
   return result;
 }
 #endif
 
 static inline int inline_mysql_rwlock_tryrdlock(
   mysql_rwlock_t *that
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   , const char *src_file, uint src_line
 #endif
   )
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_rwlock_locker *locker= NULL;
-  PSI_rwlock_locker_state state;
-  if (likely(PSI_server && that->m_psi))
+
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  if (that->m_psi != NULL)
   {
-    locker= PSI_server->get_thread_rwlock_locker(&state, that->m_psi,
-                                                 PSI_RWLOCK_TRYREADLOCK);
-    if (likely(locker != NULL))
-      PSI_server->start_rwlock_rdwait(locker, src_file, src_line);
+    /* Instrumentation start */
+    PSI_rwlock_locker *locker;
+    PSI_rwlock_locker_state state;
+    locker= PSI_CALL(start_rwlock_rdwait)(&state, that->m_psi,
+                                          PSI_RWLOCK_TRYREADLOCK, src_file, src_line);
+
+    /* Instrumented code */
+    result= rw_tryrdlock(&that->m_rwlock);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_rwlock_rdwait)(locker, result);
+
+    return result;
   }
 #endif
+
+  /* Non instrumented code */
   result= rw_tryrdlock(&that->m_rwlock);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_rwlock_rdwait(locker, result);
-#endif
+
   return result;
 }
 
 static inline int inline_mysql_rwlock_trywrlock(
   mysql_rwlock_t *that
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
   , const char *src_file, uint src_line
 #endif
   )
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_rwlock_locker *locker= NULL;
-  PSI_rwlock_locker_state state;
-  if (likely(PSI_server && that->m_psi))
+
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  if (that->m_psi != NULL)
   {
-    locker= PSI_server->get_thread_rwlock_locker(&state, that->m_psi,
-                                                 PSI_RWLOCK_TRYWRITELOCK);
-    if (likely(locker != NULL))
-      PSI_server->start_rwlock_wrwait(locker, src_file, src_line);
+    /* Instrumentation start */
+    PSI_rwlock_locker *locker;
+    PSI_rwlock_locker_state state;
+    locker= PSI_CALL(start_rwlock_wrwait)(&state, that->m_psi,
+                                          PSI_RWLOCK_TRYWRITELOCK, src_file, src_line);
+
+    /* Instrumented code */
+    result= rw_trywrlock(&that->m_rwlock);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_rwlock_wrwait)(locker, result);
+
+    return result;
   }
 #endif
+
+  /* Non instrumented code */
   result= rw_trywrlock(&that->m_rwlock);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_rwlock_wrwait(locker, result);
-#endif
+
   return result;
 }
 
@@ -909,9 +1064,9 @@ static inline int inline_mysql_rwlock_unlock(
   mysql_rwlock_t *that)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(PSI_server && that->m_psi))
-    PSI_server->unlock_rwlock(that->m_psi);
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  if (that->m_psi != NULL)
+    PSI_CALL(unlock_rwlock)(that->m_psi);
 #endif
   result= rw_unlock(&that->m_rwlock);
   return result;
@@ -922,25 +1077,41 @@ static inline int inline_mysql_prlock_unlock(
   mysql_prlock_t *that)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(PSI_server && that->m_psi))
-    PSI_server->unlock_rwlock(that->m_psi);
+#ifdef HAVE_PSI_RWLOCK_INTERFACE
+  if (that->m_psi != NULL)
+    PSI_CALL(unlock_rwlock)(that->m_psi);
 #endif
   result= rw_pr_unlock(&that->m_prlock);
   return result;
 }
 #endif
 
+static inline void inline_mysql_cond_register(
+#ifdef HAVE_PSI_COND_INTERFACE
+  const char *category,
+  PSI_cond_info *info,
+  int count
+#else
+  const char *category __attribute__ ((unused)),
+  void *info __attribute__ ((unused)),
+  int count __attribute__ ((unused))
+#endif
+)
+{
+#ifdef HAVE_PSI_COND_INTERFACE
+  PSI_CALL(register_cond)(category, info, count);
+#endif
+}
+
 static inline int inline_mysql_cond_init(
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_COND_INTERFACE
   PSI_cond_key key,
 #endif
   mysql_cond_t *that,
   const pthread_condattr_t *attr)
 {
-#ifdef HAVE_PSI_INTERFACE
-  that->m_psi= (PSI_server ? PSI_server->init_cond(key, &that->m_cond)
-                           : NULL);
+#ifdef HAVE_PSI_COND_INTERFACE
+  that->m_psi= PSI_CALL(init_cond)(key, &that->m_cond);
 #else
   that->m_psi= NULL;
 #endif
@@ -950,10 +1121,10 @@ static inline int inline_mysql_cond_init(
 static inline int inline_mysql_cond_destroy(
   mysql_cond_t *that)
 {
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(PSI_server && that->m_psi))
+#ifdef HAVE_PSI_COND_INTERFACE
+  if (that->m_psi != NULL)
   {
-    PSI_server->destroy_cond(that->m_psi);
+    PSI_CALL(destroy_cond)(that->m_psi);
     that->m_psi= NULL;
   }
 #endif
@@ -963,28 +1134,36 @@ static inline int inline_mysql_cond_destroy(
 static inline int inline_mysql_cond_wait(
   mysql_cond_t *that,
   mysql_mutex_t *mutex
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_COND_INTERFACE
   , const char *src_file, uint src_line
 #endif
   )
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_cond_locker *locker= NULL;
-  PSI_cond_locker_state state;
-  if (likely(PSI_server && that->m_psi))
+
+#ifdef HAVE_PSI_COND_INTERFACE
+  if (that->m_psi != NULL)
   {
-    locker= PSI_server->get_thread_cond_locker(&state, that->m_psi, mutex->m_psi,
-                                               PSI_COND_WAIT);
-    if (likely(locker != NULL))
-      PSI_server->start_cond_wait(locker, src_file, src_line);
+    /* Instrumentation start */
+    PSI_cond_locker *locker;
+    PSI_cond_locker_state state;
+    locker= PSI_CALL(start_cond_wait)(&state, that->m_psi, mutex->m_psi,
+                                      PSI_COND_WAIT, src_file, src_line);
+
+    /* Instrumented code */
+    result= my_cond_wait(&that->m_cond, &mutex->m_mutex);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_cond_wait)(locker, result);
+
+    return result;
   }
 #endif
-  result= pthread_cond_wait(&that->m_cond, &mutex->m_mutex);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_cond_wait(locker, result);
-#endif
+
+  /* Non instrumented code */
+  result= my_cond_wait(&that->m_cond, &mutex->m_mutex);
+
   return result;
 }
 
@@ -992,28 +1171,36 @@ static inline int inline_mysql_cond_timedwait(
   mysql_cond_t *that,
   mysql_mutex_t *mutex,
   struct timespec *abstime
-#ifdef HAVE_PSI_INTERFACE
+#ifdef HAVE_PSI_COND_INTERFACE
   , const char *src_file, uint src_line
 #endif
   )
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  struct PSI_cond_locker *locker= NULL;
-  PSI_cond_locker_state state;
-  if (likely(PSI_server && that->m_psi))
+
+#ifdef HAVE_PSI_COND_INTERFACE
+  if (that->m_psi != NULL)
   {
-    locker= PSI_server->get_thread_cond_locker(&state, that->m_psi, mutex->m_psi,
-                                               PSI_COND_TIMEDWAIT);
-    if (likely(locker != NULL))
-      PSI_server->start_cond_wait(locker, src_file, src_line);
+    /* Instrumentation start */
+    PSI_cond_locker *locker;
+    PSI_cond_locker_state state;
+    locker= PSI_CALL(start_cond_wait)(&state, that->m_psi, mutex->m_psi,
+                                      PSI_COND_TIMEDWAIT, src_file, src_line);
+
+    /* Instrumented code */
+    result= my_cond_timedwait(&that->m_cond, &mutex->m_mutex, abstime);
+
+    /* Instrumentation end */
+    if (locker != NULL)
+      PSI_CALL(end_cond_wait)(locker, result);
+
+    return result;
   }
 #endif
-  result= pthread_cond_timedwait(&that->m_cond, &mutex->m_mutex, abstime);
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(locker != NULL))
-    PSI_server->end_cond_wait(locker, result);
-#endif
+
+  /* Non instrumented code */
+  result= my_cond_timedwait(&that->m_cond, &mutex->m_mutex, abstime);
+
   return result;
 }
 
@@ -1021,9 +1208,9 @@ static inline int inline_mysql_cond_signal(
   mysql_cond_t *that)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(PSI_server && that->m_psi))
-    PSI_server->signal_cond(that->m_psi);
+#ifdef HAVE_PSI_COND_INTERFACE
+  if (that->m_psi != NULL)
+    PSI_CALL(signal_cond)(that->m_psi);
 #endif
   result= pthread_cond_signal(&that->m_cond);
   return result;
@@ -1033,36 +1220,46 @@ static inline int inline_mysql_cond_broadcast(
   mysql_cond_t *that)
 {
   int result;
-#ifdef HAVE_PSI_INTERFACE
-  if (likely(PSI_server && that->m_psi))
-    PSI_server->broadcast_cond(that->m_psi);
+#ifdef HAVE_PSI_COND_INTERFACE
+  if (that->m_psi != NULL)
+    PSI_CALL(broadcast_cond)(that->m_psi);
 #endif
   result= pthread_cond_broadcast(&that->m_cond);
   return result;
 }
 
-#ifdef HAVE_PSI_INTERFACE
+static inline void inline_mysql_thread_register(
+#ifdef HAVE_PSI_THREAD_INTERFACE
+  const char *category,
+  PSI_thread_info *info,
+  int count
+#else
+  const char *category __attribute__ ((unused)),
+  void *info __attribute__ ((unused)),
+  int count __attribute__ ((unused))
+#endif
+)
+{
+#ifdef HAVE_PSI_THREAD_INTERFACE
+  PSI_CALL(register_thread)(category, info, count);
+#endif
+}
+
+#ifdef HAVE_PSI_THREAD_INTERFACE
 static inline int inline_mysql_thread_create(
   PSI_thread_key key,
   pthread_t *thread, const pthread_attr_t *attr,
   void *(*start_routine)(void*), void *arg)
 {
   int result;
-  if (likely(PSI_server != NULL))
-    result= PSI_server->spawn_thread(key, thread, attr, start_routine, arg);
-  else
-    result= pthread_create(thread, attr, start_routine, arg);
+  result= PSI_CALL(spawn_thread)(key, thread, attr, start_routine, arg);
   return result;
 }
 
 static inline void inline_mysql_thread_set_psi_id(ulong id)
 {
-  if (likely(PSI_server != NULL))
-  {
-    struct PSI_thread *psi= PSI_server->get_thread();
-    if (likely(psi != NULL))
-      PSI_server->set_thread_id(psi, id);
-  }
+  struct PSI_thread *psi= PSI_CALL(get_thread)();
+  PSI_CALL(set_thread_id)(psi, id);
 }
 #endif
 
diff --git a/include/mysql/psi/psi.h b/include/mysql/psi/psi.h
index 562e4a80fd5..8d5e6db7307 100644
--- a/include/mysql/psi/psi.h
+++ b/include/mysql/psi/psi.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2012, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -16,7 +16,20 @@
 #ifndef MYSQL_PERFORMANCE_SCHEMA_INTERFACE_H
 #define MYSQL_PERFORMANCE_SCHEMA_INTERFACE_H
 
-#ifndef _global_h
+#ifdef EMBEDDED_LIBRARY
+#define DISABLE_PSI_MUTEX
+#define DISABLE_PSI_RWLOCK
+#define DISABLE_PSI_COND
+#define DISABLE_PSI_FILE
+#define DISABLE_PSI_TABLE
+#define DISABLE_PSI_SOCKET
+#define DISABLE_PSI_STAGE
+#define DISABLE_PSI_STATEMENT
+#define DISABLE_PSI_IDLE
+#define DISABLE_PSI_STATEMENT_DIGEST
+#endif /* EMBEDDED_LIBRARY */
+
+#ifndef MY_GLOBAL_INCLUDED
 /*
   Make sure a .c or .cc file contains an include to my_global.h first.
   When this include is missing, all the #ifdef HAVE_XXX have no effect,
@@ -29,6 +42,20 @@
 
 C_MODE_START
 
+struct TABLE_SHARE;
+/*
+  There are 3 known bison parsers in the server:
+  - (1) the SQL parser itself, sql/sql_yacc.yy
+  - (2) storage/innobase/fts/fts0pars.y
+  - (3) storage/innobase/pars/pars0grm.y
+  What is instrumented here are the tokens from the SQL query text (1),
+  to make digests.
+  Now, to avoid name pollution and conflicts with different YYSTYPE definitions,
+  an opaque structure is used here.
+  The real type to use when invoking the digest api is LEX_YYSTYPE.
+*/
+struct OPAQUE_LEX_YYSTYPE;
+
 /**
   @file mysql/psi/psi.h
   Performance schema instrumentation interface.
@@ -43,42 +70,84 @@ C_MODE_START
   This is an opaque structure.
 */
 struct PSI_mutex;
+typedef struct PSI_mutex PSI_mutex;
 
 /**
   Interface for an instrumented rwlock.
   This is an opaque structure.
 */
 struct PSI_rwlock;
+typedef struct PSI_rwlock PSI_rwlock;
 
 /**
   Interface for an instrumented condition.
   This is an opaque structure.
 */
 struct PSI_cond;
+typedef struct PSI_cond PSI_cond;
 
 /**
   Interface for an instrumented table share.
   This is an opaque structure.
 */
 struct PSI_table_share;
+typedef struct PSI_table_share PSI_table_share;
 
 /**
   Interface for an instrumented table handle.
   This is an opaque structure.
 */
 struct PSI_table;
+typedef struct PSI_table PSI_table;
 
 /**
   Interface for an instrumented thread.
   This is an opaque structure.
 */
 struct PSI_thread;
+typedef struct PSI_thread PSI_thread;
 
 /**
   Interface for an instrumented file handle.
   This is an opaque structure.
 */
 struct PSI_file;
+typedef struct PSI_file PSI_file;
+
+/**
+  Interface for an instrumented socket descriptor.
+  This is an opaque structure.
+*/
+struct PSI_socket;
+typedef struct PSI_socket PSI_socket;
+
+/**
+  Interface for an instrumented table operation.
+  This is an opaque structure.
+*/
+struct PSI_table_locker;
+typedef struct PSI_table_locker PSI_table_locker;
+
+/**
+  Interface for an instrumented statement.
+  This is an opaque structure.
+*/
+struct PSI_statement_locker;
+typedef struct PSI_statement_locker PSI_statement_locker;
+
+/**
+  Interface for an instrumented idle operation.
+  This is an opaque structure.
+*/
+struct PSI_idle_locker;
+typedef struct PSI_idle_locker PSI_idle_locker;
+
+/**
+  Interface for an instrumented statement digest operation.
+  This is an opaque structure.
+*/
+struct PSI_digest_locker;
+typedef struct PSI_digest_locker PSI_digest_locker;
 
 /** Entry point for the performance schema interface. */
 struct PSI_bootstrap
@@ -98,10 +167,134 @@ struct PSI_bootstrap
   */
   void* (*get_interface)(int version);
 };
+typedef struct PSI_bootstrap PSI_bootstrap;
 
 #ifdef HAVE_PSI_INTERFACE
 
 /**
+  @def DISABLE_PSI_MUTEX
+  Compiling option to disable the mutex instrumentation.
+  This option is mostly intended to be used during development,
+  when doing special builds with only a subset of the performance schema instrumentation,
+  for code analysis / profiling / performance tuning of a specific instrumentation alone.
+  For this reason, DISABLE_PSI_MUTEX is not advertised in the cmake general options.
+  To disable mutexes, add -DDISABLE_PSI_MUTEX to CFLAGS.
+  @sa DISABLE_PSI_RWLOCK
+  @sa DISABLE_PSI_COND
+  @sa DISABLE_PSI_FILE
+  @sa DISABLE_PSI_THREAD
+  @sa DISABLE_PSI_TABLE
+  @sa DISABLE_PSI_STAGE
+  @sa DISABLE_PSI_STATEMENT
+  @sa DISABLE_PSI_SOCKET
+  @sa DISABLE_PSI_IDLE
+*/
+
+#ifndef DISABLE_PSI_MUTEX
+#define HAVE_PSI_MUTEX_INTERFACE
+#endif
+
+/**
+  @def DISABLE_PSI_RWLOCK
+  Compiling option to disable the rwlock instrumentation.
+  @sa DISABLE_PSI_MUTEX
+*/
+
+#ifndef DISABLE_PSI_RWLOCK
+#define HAVE_PSI_RWLOCK_INTERFACE
+#endif
+
+/**
+  @def DISABLE_PSI_COND
+  Compiling option to disable the cond instrumentation.
+  @sa DISABLE_PSI_MUTEX
+*/
+
+#ifndef DISABLE_PSI_COND
+#define HAVE_PSI_COND_INTERFACE
+#endif
+
+/**
+  @def DISABLE_PSI_FILE
+  Compiling option to disable the file instrumentation.
+  @sa DISABLE_PSI_MUTEX
+*/
+
+#ifndef DISABLE_PSI_FILE
+#define HAVE_PSI_FILE_INTERFACE
+#endif
+
+/**
+  @def DISABLE_PSI_THREAD
+  Compiling option to disable the thread instrumentation.
+  @sa DISABLE_PSI_MUTEX
+*/
+#ifndef DISABLE_PSI_THREAD
+#define HAVE_PSI_THREAD_INTERFACE
+#endif
+
+/**
+  @def DISABLE_PSI_TABLE
+  Compiling option to disable the table instrumentation.
+  @sa DISABLE_PSI_MUTEX
+*/
+
+#ifndef DISABLE_PSI_TABLE
+#define HAVE_PSI_TABLE_INTERFACE
+#endif
+
+/**
+  @def DISABLE_PSI_STAGE
+  Compiling option to disable the stage instrumentation.
+  @sa DISABLE_PSI_MUTEX
+*/
+
+#ifndef DISABLE_PSI_STAGE
+#define HAVE_PSI_STAGE_INTERFACE
+#endif
+
+/**
+  @def DISABLE_PSI_STATEMENT
+  Compiling option to disable the statement instrumentation.
+  @sa DISABLE_PSI_MUTEX
+*/
+
+#ifndef DISABLE_PSI_STATEMENT
+#define HAVE_PSI_STATEMENT_INTERFACE
+#endif
+
+/**
+  @def DISABLE_PSI_STATEMENT_DIGEST
+  Compiling option to disable the statement digest instrumentation.
+*/
+
+#ifndef DISABLE_PSI_STATEMENT
+#ifndef DISABLE_PSI_STATEMENT_DIGEST
+#define HAVE_PSI_STATEMENT_DIGEST_INTERFACE
+#endif
+#endif
+
+/**
+  @def DISABLE_PSI_SOCKET
+  Compiling option to disable the statement instrumentation.
+  @sa DISABLE_PSI_MUTEX
+*/
+
+#ifndef DISABLE_PSI_SOCKET
+#define HAVE_PSI_SOCKET_INTERFACE
+#endif
+
+/**
+  @def DISABLE_PSI_IDLE
+  Compiling option to disable the idle instrumentation.
+  @sa DISABLE_PSI_MUTEX
+*/
+
+#ifndef DISABLE_PSI_IDLE
+#define HAVE_PSI_IDLE_INTERFACE
+#endif
+
+/**
   @def PSI_VERSION_1
   Performance Schema Interface number for version 1.
   This version is supported.
@@ -133,25 +326,35 @@ struct PSI_bootstrap
   This is an opaque structure.
 */
 struct PSI_mutex_locker;
+typedef struct PSI_mutex_locker PSI_mutex_locker;
 
 /**
   Interface for an instrumented rwlock operation.
   This is an opaque structure.
 */
-
 struct PSI_rwlock_locker;
+typedef struct PSI_rwlock_locker PSI_rwlock_locker;
+
 /**
   Interface for an instrumented condition operation.
   This is an opaque structure.
 */
-
 struct PSI_cond_locker;
+typedef struct PSI_cond_locker PSI_cond_locker;
 
 /**
   Interface for an instrumented file operation.
   This is an opaque structure.
 */
 struct PSI_file_locker;
+typedef struct PSI_file_locker PSI_file_locker;
+
+/**
+  Interface for an instrumented socket operation.
+  This is an opaque structure.
+*/
+struct PSI_socket_locker;
+typedef struct PSI_socket_locker PSI_socket_locker;
 
 /** Operation performed on an instrumented mutex. */
 enum PSI_mutex_operation
@@ -161,6 +364,7 @@ enum PSI_mutex_operation
   /** Lock attempt. */
   PSI_MUTEX_TRYLOCK= 1
 };
+typedef enum PSI_mutex_operation PSI_mutex_operation;
 
 /** Operation performed on an instrumented rwlock. */
 enum PSI_rwlock_operation
@@ -174,6 +378,7 @@ enum PSI_rwlock_operation
   /** Write lock attempt. */
   PSI_RWLOCK_TRYWRITELOCK= 3
 };
+typedef enum PSI_rwlock_operation PSI_rwlock_operation;
 
 /** Operation performed on an instrumented condition. */
 enum PSI_cond_operation
@@ -183,6 +388,7 @@ enum PSI_cond_operation
   /** Wait, with timeout. */
   PSI_COND_TIMEDWAIT= 1
 };
+typedef enum PSI_cond_operation PSI_cond_operation;
 
 /** Operation performed on an instrumented file. */
 enum PSI_file_operation
@@ -228,12 +434,77 @@ enum PSI_file_operation
   /** File sync, as in @c fsync() or @c my_sync(). */
   PSI_FILE_SYNC= 16
 };
+typedef enum PSI_file_operation PSI_file_operation;
 
-/**
-  Interface for an instrumented table operation.
-  This is an opaque structure.
-*/
-struct PSI_table_locker;
+/** IO operation performed on an instrumented table. */
+enum PSI_table_io_operation
+{
+  /** Row fetch. */
+  PSI_TABLE_FETCH_ROW= 0,
+  /** Row write. */
+  PSI_TABLE_WRITE_ROW= 1,
+  /** Row update. */
+  PSI_TABLE_UPDATE_ROW= 2,
+  /** Row delete. */
+  PSI_TABLE_DELETE_ROW= 3
+};
+typedef enum PSI_table_io_operation PSI_table_io_operation;
+
+/** Lock operation performed on an instrumented table. */
+enum PSI_table_lock_operation
+{
+  /** Table lock, in the server layer. */
+  PSI_TABLE_LOCK= 0,
+  /** Table lock, in the storage engine layer. */
+  PSI_TABLE_EXTERNAL_LOCK= 1
+};
+typedef enum PSI_table_lock_operation PSI_table_lock_operation;
+
+/** State of an instrumented socket. */
+enum PSI_socket_state
+{
+  /** Idle, waiting for the next command. */
+  PSI_SOCKET_STATE_IDLE= 1,
+  /** Active, executing a command. */
+  PSI_SOCKET_STATE_ACTIVE= 2
+};
+typedef enum PSI_socket_state PSI_socket_state;
+
+/** Operation performed on an instrumented socket. */
+enum PSI_socket_operation
+{
+  /** Socket creation, as in @c socket() or @c socketpair(). */
+  PSI_SOCKET_CREATE= 0,
+  /** Socket connection, as in @c connect(), @c listen() and @c accept(). */
+  PSI_SOCKET_CONNECT= 1,
+  /** Socket bind, as in @c bind(), @c getsockname() and @c getpeername(). */
+  PSI_SOCKET_BIND= 2,
+  /** Socket close, as in @c shutdown(). */
+  PSI_SOCKET_CLOSE= 3,
+  /** Socket send, @c send(). */
+  PSI_SOCKET_SEND= 4,
+  /** Socket receive, @c recv(). */
+  PSI_SOCKET_RECV= 5,
+  /** Socket send, @c sendto(). */
+  PSI_SOCKET_SENDTO= 6,
+  /** Socket receive, @c recvfrom). */
+  PSI_SOCKET_RECVFROM= 7,
+  /** Socket send, @c sendmsg(). */
+  PSI_SOCKET_SENDMSG= 8,
+  /** Socket receive, @c recvmsg(). */
+  PSI_SOCKET_RECVMSG= 9,
+  /** Socket seek, such as @c fseek() or @c seek(). */
+  PSI_SOCKET_SEEK= 10,
+  /** Socket options, as in @c getsockopt() and @c setsockopt(). */
+  PSI_SOCKET_OPT= 11,
+  /** Socket status, as in @c sockatmark() and @c isfdtype(). */
+  PSI_SOCKET_STAT= 12,
+  /** Socket shutdown, as in @c shutdown(). */
+  PSI_SOCKET_SHUTDOWN= 13,
+  /** Socket select, as in @c select() and @c poll(). */
+  PSI_SOCKET_SELECT= 14
+};
+typedef enum PSI_socket_operation PSI_socket_operation;
 
 /**
   Instrumented mutex key.
@@ -274,6 +545,27 @@ typedef unsigned int PSI_thread_key;
 typedef unsigned int PSI_file_key;
 
 /**
+  Instrumented stage key.
+  To instrument a stage, a stage key must be obtained using @c register_stage.
+  Using a zero key always disable the instrumentation.
+*/
+typedef unsigned int PSI_stage_key;
+
+/**
+  Instrumented statement key.
+  To instrument a statement, a statement key must be obtained using @c register_statement.
+  Using a zero key always disable the instrumentation.
+*/
+typedef unsigned int PSI_statement_key;
+
+/**
+  Instrumented socket key.
+  To instrument a socket, a socket key must be obtained using @c register_socket.
+  Using a zero key always disable the instrumentation.
+*/
+typedef unsigned int PSI_socket_key;
+
+/**
   @def USE_PSI_1
   Define USE_PSI_1 to use the interface version 1.
 */
@@ -300,6 +592,13 @@ typedef unsigned int PSI_file_key;
 */
 #define PSI_FLAG_GLOBAL (1 << 0)
 
+/**
+  Global flag.
+  This flag indicate that an instrumentation point is a general placeholder,
+  that can mutate into a more specific instrumentation point.
+*/
+#define PSI_FLAG_MUTABLE (1 << 1)
+
 #ifdef USE_PSI_1
 #define HAVE_PSI_1
 #endif
@@ -423,18 +722,95 @@ struct PSI_file_info_v1
 };
 
 /**
-  State data storage for @c get_thread_mutex_locker_v1_t.
+  Stage instrument information.
+  @since PSI_VERSION_1
+  This structure is used to register an instrumented stage.
+*/
+struct PSI_stage_info_v1
+{
+  /** The registered stage key. */
+  PSI_stage_key m_key;
+  /** The name of the stage instrument to register. */
+  const char *m_name;
+  /** The flags of the stage instrument to register. */
+  int m_flags;
+};
+
+/**
+  Statement instrument information.
+  @since PSI_VERSION_1
+  This structure is used to register an instrumented statement.
+*/
+struct PSI_statement_info_v1
+{
+  /** The registered statement key. */
+  PSI_statement_key m_key;
+  /** The name of the statement instrument to register. */
+  const char *m_name;
+  /** The flags of the statement instrument to register. */
+  int m_flags;
+};
+
+/**
+  Socket instrument information.
+  @since PSI_VERSION_1
+  This structure is used to register an instrumented socket.
+*/
+struct PSI_socket_info_v1
+{
+  /**
+    Pointer to the key assigned to the registered socket.
+  */
+  PSI_socket_key *m_key;
+  /**
+    The name of the socket instrument to register.
+  */
+  const char *m_name;
+  /**
+    The flags of the socket instrument to register.
+    @sa PSI_FLAG_GLOBAL
+  */
+  int m_flags;
+};
+
+/**
+  State data storage for @c start_idle_wait_v1_t.
+  This structure provide temporary storage to an idle locker.
+  The content of this structure is considered opaque,
+  the fields are only hints of what an implementation
+  of the psi interface can use.
+  This memory is provided by the instrumented code for performance reasons.
+  @sa start_idle_wait_v1_t.
+*/
+struct PSI_idle_locker_state_v1
+{
+  /** Internal state. */
+  uint m_flags;
+  /** Current thread. */
+  struct PSI_thread *m_thread;
+  /** Timer start. */
+  ulonglong m_timer_start;
+  /** Timer function. */
+  ulonglong (*m_timer)(void);
+  /** Internal data. */
+  void *m_wait;
+};
+
+/**
+  State data storage for @c start_mutex_wait_v1_t.
   This structure provide temporary storage to a mutex locker.
   The content of this structure is considered opaque,
   the fields are only hints of what an implementation
   of the psi interface can use.
   This memory is provided by the instrumented code for performance reasons.
-  @sa get_thread_mutex_locker_v1_t
+  @sa start_mutex_wait_v1_t
 */
 struct PSI_mutex_locker_state_v1
 {
   /** Internal state. */
   uint m_flags;
+  /** Current operation. */
+  enum PSI_mutex_operation m_operation;
   /** Current mutex. */
   struct PSI_mutex *m_mutex;
   /** Current thread. */
@@ -443,29 +819,26 @@ struct PSI_mutex_locker_state_v1
   ulonglong m_timer_start;
   /** Timer function. */
   ulonglong (*m_timer)(void);
-  /** Current operation. */
-  enum PSI_mutex_operation m_operation;
-  /** Source file. */
-  const char* m_src_file;
-  /** Source line number. */
-  int m_src_line;
   /** Internal data. */
   void *m_wait;
 };
 
 /**
-  State data storage for @c get_thread_rwlock_locker_v1_t.
+  State data storage for @c start_rwlock_rdwait_v1_t, @c start_rwlock_wrwait_v1_t.
   This structure provide temporary storage to a rwlock locker.
   The content of this structure is considered opaque,
   the fields are only hints of what an implementation
   of the psi interface can use.
   This memory is provided by the instrumented code for performance reasons.
-  @sa get_thread_rwlock_locker_v1_t
+  @sa start_rwlock_rdwait_v1_t
+  @sa start_rwlock_wrwait_v1_t
 */
 struct PSI_rwlock_locker_state_v1
 {
   /** Internal state. */
   uint m_flags;
+  /** Current operation. */
+  enum PSI_rwlock_operation m_operation;
   /** Current rwlock. */
   struct PSI_rwlock *m_rwlock;
   /** Current thread. */
@@ -474,29 +847,25 @@ struct PSI_rwlock_locker_state_v1
   ulonglong m_timer_start;
   /** Timer function. */
   ulonglong (*m_timer)(void);
-  /** Current operation. */
-  enum PSI_rwlock_operation m_operation;
-  /** Source file. */
-  const char* m_src_file;
-  /** Source line number. */
-  int m_src_line;
   /** Internal data. */
   void *m_wait;
 };
 
 /**
-  State data storage for @c get_thread_cond_locker_v1_t.
+  State data storage for @c start_cond_wait_v1_t.
   This structure provide temporary storage to a condition locker.
   The content of this structure is considered opaque,
   the fields are only hints of what an implementation
   of the psi interface can use.
   This memory is provided by the instrumented code for performance reasons.
-  @sa get_thread_cond_locker_v1_t
+  @sa start_cond_wait_v1_t
 */
 struct PSI_cond_locker_state_v1
 {
   /** Internal state. */
   uint m_flags;
+  /** Current operation. */
+  enum PSI_cond_operation m_operation;
   /** Current condition. */
   struct PSI_cond *m_cond;
   /** Current mutex. */
@@ -507,12 +876,6 @@ struct PSI_cond_locker_state_v1
   ulonglong m_timer_start;
   /** Timer function. */
   ulonglong (*m_timer)(void);
-  /** Current operation. */
-  enum PSI_cond_operation m_operation;
-  /** Source file. */
-  const char* m_src_file;
-  /** Source line number. */
-  int m_src_line;
   /** Internal data. */
   void *m_wait;
 };
@@ -532,6 +895,8 @@ struct PSI_file_locker_state_v1
 {
   /** Internal state. */
   uint m_flags;
+  /** Current operation. */
+  enum PSI_file_operation m_operation;
   /** Current file. */
   struct PSI_file *m_file;
   /** Current thread. */
@@ -542,33 +907,88 @@ struct PSI_file_locker_state_v1
   ulonglong m_timer_start;
   /** Timer function. */
   ulonglong (*m_timer)(void);
-  /** Current operation. */
-  enum PSI_file_operation m_operation;
-  /** Source file. */
-  const char* m_src_file;
-  /** Source line number. */
-  int m_src_line;
   /** Internal data. */
   void *m_wait;
 };
 
 /**
-  State data storage for @c get_thread_table_locker_v1_t.
+  State data storage for @c start_table_io_wait_v1_t,
+  @c start_table_lock_wait_v1_t.
   This structure provide temporary storage to a table locker.
   The content of this structure is considered opaque,
   the fields are only hints of what an implementation
   of the psi interface can use.
   This memory is provided by the instrumented code for performance reasons.
-  @sa get_thread_table_locker_v1_t
+  @sa start_table_io_wait_v1_t
+  @sa start_table_lock_wait_v1_t
 */
 struct PSI_table_locker_state_v1
 {
   /** Internal state. */
   uint m_flags;
+  /** Current io operation. */
+  enum PSI_table_io_operation m_io_operation;
   /** Current table handle. */
   struct PSI_table *m_table;
   /** Current table share. */
   struct PSI_table_share *m_table_share;
+  /** Current thread. */
+  struct PSI_thread *m_thread;
+  /** Timer start. */
+  ulonglong m_timer_start;
+  /** Timer function. */
+  ulonglong (*m_timer)(void);
+  /** Internal data. */
+  void *m_wait;
+  /**
+    Implementation specific.
+    For table io, the table io index.
+    For table lock, the lock type.
+  */
+  uint m_index;
+};
+
+#define PSI_MAX_DIGEST_STORAGE_SIZE 1024
+
+/**
+  Structure to store token count/array for a statement
+  on which digest is to be calculated.
+*/
+struct PSI_digest_storage
+{
+  my_bool m_full;
+  int m_byte_count;
+  unsigned char m_token_array[PSI_MAX_DIGEST_STORAGE_SIZE];
+};
+typedef struct PSI_digest_storage PSI_digest_storage;
+
+struct PSI_digest_locker_state
+{
+  int m_last_id_index;
+  PSI_digest_storage m_digest_storage;
+};
+typedef struct PSI_digest_locker_state PSI_digest_locker_state;
+
+/**
+  State data storage for @c get_thread_statement_locker_v1_t,
+  @c get_thread_statement_locker_v1_t.
+  This structure provide temporary storage to a statement locker.
+  The content of this structure is considered opaque,
+  the fields are only hints of what an implementation
+  of the psi interface can use.
+  This memory is provided by the instrumented code for performance reasons.
+  @sa get_thread_statement_locker_v1_t
+*/
+struct PSI_statement_locker_state_v1
+{
+  /** Discarded flag. */
+  my_bool m_discarded;
+  /** Metric, no index used flag. */
+  uchar m_no_index_used;
+  /** Metric, no good index used flag. */
+  uchar m_no_good_index_used;
+  /** Internal state. */
+  uint m_flags;
   /** Instrumentation class. */
   void *m_class;
   /** Current thread. */
@@ -577,12 +997,65 @@ struct PSI_table_locker_state_v1
   ulonglong m_timer_start;
   /** Timer function. */
   ulonglong (*m_timer)(void);
-  /* Current operation (waiting for WL#4895). */
-  /* enum PSI_table_operation m_operation; */
-  /** Current table io index. */
-  uint m_index;
-  /** Current table lock index. */
-  uint m_lock_index;
+  /** Internal data. */
+  void *m_statement;
+  /** Locked time. */
+  ulonglong m_lock_time;
+  /** Rows sent. */
+  ulonglong m_rows_sent;
+  /** Rows examined. */
+  ulonglong m_rows_examined;
+  /** Metric, temporary tables created on disk. */
+  ulong m_created_tmp_disk_tables;
+  /** Metric, temporary tables created. */
+  ulong m_created_tmp_tables;
+  /** Metric, number of select full join. */
+  ulong m_select_full_join;
+  /** Metric, number of select full range join. */
+  ulong m_select_full_range_join;
+  /** Metric, number of select range. */
+  ulong m_select_range;
+  /** Metric, number of select range check. */
+  ulong m_select_range_check;
+  /** Metric, number of select scan. */
+  ulong m_select_scan;
+  /** Metric, number of sort merge passes. */
+  ulong m_sort_merge_passes;
+  /** Metric, number of sort merge. */
+  ulong m_sort_range;
+  /** Metric, number of sort rows. */
+  ulong m_sort_rows;
+  /** Metric, number of sort scans. */
+  ulong m_sort_scan;
+  /** Statement digest. */
+  PSI_digest_locker_state m_digest_state;
+};
+
+/**
+  State data storage for @c start_socket_wait_v1_t.
+  This structure provide temporary storage to a socket locker.
+  The content of this structure is considered opaque,
+  the fields are only hints of what an implementation
+  of the psi interface can use.
+  This memory is provided by the instrumented code for performance reasons.
+  @sa start_socket_wait_v1_t
+*/
+struct PSI_socket_locker_state_v1
+{
+  /** Internal state. */
+  uint m_flags;
+  /** Current socket. */
+  struct PSI_socket *m_socket;
+  /** Current thread. */
+  struct PSI_thread *m_thread;
+  /** Operation number of bytes. */
+  size_t m_number_of_bytes;
+  /** Timer start. */
+  ulonglong m_timer_start;
+  /** Timer function. */
+  ulonglong (*m_timer)(void);
+  /** Current operation. */
+  enum PSI_socket_operation m_operation;
   /** Source file. */
   const char* m_src_file;
   /** Source line number. */
@@ -639,6 +1112,33 @@ typedef void (*register_file_v1_t)
   (const char *category, struct PSI_file_info_v1 *info, int count);
 
 /**
+  Stage registration API.
+  @param category a category name
+  @param info an array of stage info to register
+  @param count the size of the info array
+*/
+typedef void (*register_stage_v1_t)
+  (const char *category, struct PSI_stage_info_v1 **info, int count);
+
+/**
+  Statement registration API.
+  @param category a category name
+  @param info an array of stage info to register
+  @param count the size of the info array
+*/
+typedef void (*register_statement_v1_t)
+  (const char *category, struct PSI_statement_info_v1 *info, int count);
+
+/**
+  Socket registration API.
+  @param category a category name (typically a plugin name)
+  @param info an array of socket info to register
+  @param count the size of the info array
+*/
+typedef void (*register_socket_v1_t)
+  (const char *category, struct PSI_socket_info_v1 *info, int count);
+
+/**
   Mutex instrumentation initialisation API.
   @param key the registered mutex key
   @param identity the address of the mutex itself
@@ -684,17 +1184,28 @@ typedef struct PSI_cond* (*init_cond_v1_t)
 typedef void (*destroy_cond_v1_t)(struct PSI_cond *cond);
 
 /**
-  Acquire a table info by name.
-  @param schema_name name of the table schema
-  @param schema_name_length length of schema_name
-  @param table_name name of the table
-  @param table_name_length length of table_name
-  @param identity table identity pointer, typically the table share
-  @return a table info, or NULL if the table is not instrumented
+  Socket instrumentation initialisation API.
+  @param key the registered mutex key
+  @param socket descriptor
+  @return an instrumented socket
+*/
+typedef struct PSI_socket* (*init_socket_v1_t)
+  (PSI_socket_key key, const my_socket *fd);
+
+/**
+  socket instrumentation destruction API.
+  @param socket the socket to destroy
+*/
+typedef void (*destroy_socket_v1_t)(struct PSI_socket *socket);
+
+/**
+  Acquire a table share instrumentation.
+  @param temporary True for temporary tables
+  @param share The SQL layer table share
+  @return a table share instrumentation, or NULL
 */
 typedef struct PSI_table_share* (*get_table_share_v1_t)
-  (const char *schema_name, int schema_name_length, const char *table_name,
-   int table_name_length, const void *identity);
+  (my_bool temporary, struct TABLE_SHARE *share);
 
 /**
   Release a table share.
@@ -703,6 +1214,18 @@ typedef struct PSI_table_share* (*get_table_share_v1_t)
 typedef void (*release_table_share_v1_t)(struct PSI_table_share *share);
 
 /**
+  Drop a table share.
+  @param temporary True for temporary tables
+  @param schema_name the table schema name
+  @param schema_name_length the table schema name length
+  @param table_name the table name
+  @param table_name_length the table name length
+*/
+typedef void (*drop_table_share_v1_t)
+  (my_bool temporary, const char *schema_name, int schema_name_length,
+   const char *table_name, int table_name_length);
+
+/**
   Open an instrumentation table handle.
   @param share the table to open
   @param identity table handle identity
@@ -712,6 +1235,23 @@ typedef struct PSI_table* (*open_table_v1_t)
   (struct PSI_table_share *share, const void *identity);
 
 /**
+  Unbind a table handle from the current thread.
+  This operation happens when an opened table is added to the open table cache.
+  @param table the table to unbind
+*/
+typedef void (*unbind_table_v1_t)
+  (struct PSI_table *table);
+
+/**
+  Rebind a table handle to the current thread.
+  This operation happens when a table from the open table cache
+  is reused for a thread.
+  @param table the table to unbind
+*/
+typedef PSI_table* (*rebind_table_v1_t)
+  (PSI_table_share *share, const void *identity, PSI_table *table);
+
+/**
   Close an instrumentation table handle.
   Note that the table handle is invalid after this call.
   @param table the table handle to close
@@ -770,65 +1310,70 @@ typedef void (*set_thread_id_v1_t)(struct PSI_thread *thread,
 typedef struct PSI_thread* (*get_thread_v1_t)(void);
 
 /**
-  Attach a thread instrumentation to the running thread.
-  In case of thread pools, this method should be called when
-  a worker thread picks a work item and runs it.
-  Also, this method should be called if the instrumented code does not
-  keep the pointer returned by @c new_thread() and relies on @c get_thread()
-  instead.
-  @param thread the thread instrumentation
+  Assign a user name to the instrumented thread.
+  @param user the user name
+  @param user_len the user name length
 */
-typedef void (*set_thread_v1_t)(struct PSI_thread *thread);
+typedef void (*set_thread_user_v1_t)(const char *user, int user_len);
 
-/** Delete the current thread instrumentation. */
-typedef void (*delete_current_thread_v1_t)(void);
+/**
+  Assign a user name and host name to the instrumented thread.
+  @param user the user name
+  @param user_len the user name length
+  @param host the host name
+  @param host_len the host name length
+*/
+typedef void (*set_thread_user_host_v1_t)(const char *user, int user_len,
+                                          const char *host, int host_len);
 
-/** Delete a thread instrumentation. */
-typedef void (*delete_thread_v1_t)(struct PSI_thread *thread);
+/**
+  Assign a current database to the instrumented thread.
+  @param db the database name
+  @param db_len the database name length
+*/
+typedef void (*set_thread_db_v1_t)(const char* db, int db_len);
 
 /**
-  Get a mutex instrumentation locker.
-  @param state data storage for the locker
-  @param mutex the instrumented mutex to lock
-  @return a mutex locker, or NULL
+  Assign a current command to the instrumented thread.
+  @param command the current command
 */
-typedef struct PSI_mutex_locker* (*get_thread_mutex_locker_v1_t)
-  (struct PSI_mutex_locker_state_v1 *state,
-   struct PSI_mutex *mutex,
-   enum PSI_mutex_operation op);
+typedef void (*set_thread_command_v1_t)(int command);
 
 /**
-  Get a rwlock instrumentation locker.
-  @param state data storage for the locker
-  @param rwlock the instrumented rwlock to lock
-  @return a rwlock locker, or NULL
+  Assign a start time to the instrumented thread.
+  @param start_time the thread start time
 */
-typedef struct PSI_rwlock_locker* (*get_thread_rwlock_locker_v1_t)
-  (struct PSI_rwlock_locker_state_v1 *state,
-   struct PSI_rwlock *rwlock,
-   enum PSI_rwlock_operation op);
+typedef void (*set_thread_start_time_v1_t)(time_t start_time);
 
 /**
-  Get a cond instrumentation locker.
-  @param state data storage for the locker
-  @param cond the instrumented condition to wait on
-  @param mutex the instrumented mutex associated with the condition
-  @return a condition locker, or NULL
+  Assign a state to the instrumented thread.
+  @param state the thread state
 */
-typedef struct PSI_cond_locker* (*get_thread_cond_locker_v1_t)
-  (struct PSI_cond_locker_state_v1 *state,
-   struct PSI_cond *cond, struct PSI_mutex *mutex,
-   enum PSI_cond_operation op);
+typedef void (*set_thread_state_v1_t)(const char* state);
 
 /**
-  Get a table instrumentation locker.
-  @param state data storage for the locker
-  @param table the instrumented table to lock
-  @return a table locker, or NULL
+  Assign a process info to the instrumented thread.
+  @param info the process into string
+  @param info_len the process into string length
 */
-typedef struct PSI_table_locker* (*get_thread_table_locker_v1_t)
-  (struct PSI_table_locker_state_v1 *state,
-   struct PSI_table *table);
+typedef void (*set_thread_info_v1_t)(const char* info, int info_len);
+
+/**
+  Attach a thread instrumentation to the running thread.
+  In case of thread pools, this method should be called when
+  a worker thread picks a work item and runs it.
+  Also, this method should be called if the instrumented code does not
+  keep the pointer returned by @c new_thread() and relies on @c get_thread()
+  instead.
+  @param thread the thread instrumentation
+*/
+typedef void (*set_thread_v1_t)(struct PSI_thread *thread);
+
+/** Delete the current thread instrumentation. */
+typedef void (*delete_current_thread_v1_t)(void);
+
+/** Delete a thread instrumentation. */
+typedef void (*delete_thread_v1_t)(struct PSI_thread *thread);
 
 /**
   Get a file instrumentation locker, for opening or creating a file.
@@ -894,12 +1439,26 @@ typedef void (*signal_cond_v1_t)
 typedef void (*broadcast_cond_v1_t)
   (struct PSI_cond *cond);
 
+typedef struct PSI_idle_locker* (*start_idle_wait_v1_t)
+  (struct PSI_idle_locker_state_v1 *state, const char *src_file, uint src_line);
+
+typedef void (*end_idle_wait_v1_t)
+  (struct PSI_idle_locker *locker);
+
 /**
   Record a mutex instrumentation wait start event.
-  @param locker a thread locker for the running thread
+  @param state data storage for the locker
+  @param mutex the instrumented mutex to lock
+  @param op the operation to perform
+  @param file the source file name
+  @param line the source line number
+  @return a mutex locker, or NULL
 */
-typedef void (*start_mutex_wait_v1_t)
-  (struct PSI_mutex_locker *locker, const char *src_file, uint src_line);
+typedef struct PSI_mutex_locker* (*start_mutex_wait_v1_t)
+  (struct PSI_mutex_locker_state_v1 *state,
+   struct PSI_mutex *mutex,
+   enum PSI_mutex_operation op,
+   const char *src_file, uint src_line);
 
 /**
   Record a mutex instrumentation wait end event.
@@ -914,8 +1473,11 @@ typedef void (*end_mutex_wait_v1_t)
   @param locker a thread locker for the running thread
   @param must must block: 1 for lock, 0 for trylock
 */
-typedef void (*start_rwlock_rdwait_v1_t)
-  (struct PSI_rwlock_locker *locker, const char *src_file, uint src_line);
+typedef struct PSI_rwlock_locker* (*start_rwlock_rdwait_v1_t)
+  (struct PSI_rwlock_locker_state_v1 *state,
+   struct PSI_rwlock *rwlock,
+   enum PSI_rwlock_operation op,
+   const char *src_file, uint src_line);
 
 /**
   Record a rwlock instrumentation read wait end event.
@@ -930,8 +1492,11 @@ typedef void (*end_rwlock_rdwait_v1_t)
   @param locker a thread locker for the running thread
   @param must must block: 1 for lock, 0 for trylock
 */
-typedef void (*start_rwlock_wrwait_v1_t)
-  (struct PSI_rwlock_locker *locker, const char *src_file, uint src_line);
+typedef struct PSI_rwlock_locker* (*start_rwlock_wrwait_v1_t)
+  (struct PSI_rwlock_locker_state_v1 *state,
+   struct PSI_rwlock *rwlock,
+   enum PSI_rwlock_operation op,
+   const char *src_file, uint src_line);
 
 /**
   Record a rwlock instrumentation write wait end event.
@@ -946,8 +1511,12 @@ typedef void (*end_rwlock_wrwait_v1_t)
   @param locker a thread locker for the running thread
   @param must must block: 1 for wait, 0 for timedwait
 */
-typedef void (*start_cond_wait_v1_t)
-  (struct PSI_cond_locker *locker, const char *src_file, uint src_line);
+typedef struct PSI_cond_locker* (*start_cond_wait_v1_t)
+  (struct PSI_cond_locker_state_v1 *state,
+   struct PSI_cond *cond,
+   struct PSI_mutex *mutex,
+   enum PSI_cond_operation op,
+   const char *src_file, uint src_line);
 
 /**
   Record a condition instrumentation wait end event.
@@ -958,19 +1527,42 @@ typedef void (*end_cond_wait_v1_t)
   (struct PSI_cond_locker *locker, int rc);
 
 /**
-  Record a table instrumentation wait start event.
+  Record a table instrumentation io wait start event.
   @param locker a table locker for the running thread
   @param file the source file name
   @param line the source line number
 */
-typedef void (*start_table_wait_v1_t)
-  (struct PSI_table_locker *locker, const char *src_file, uint src_line);
+typedef struct PSI_table_locker* (*start_table_io_wait_v1_t)
+  (struct PSI_table_locker_state_v1 *state,
+   struct PSI_table *table,
+   enum PSI_table_io_operation op,
+   uint index,
+   const char *src_file, uint src_line);
+
+/**
+  Record a table instrumentation io wait end event.
+  @param locker a table locker for the running thread
+*/
+typedef void (*end_table_io_wait_v1_t)(struct PSI_table_locker *locker);
+
+/**
+  Record a table instrumentation lock wait start event.
+  @param locker a table locker for the running thread
+  @param file the source file name
+  @param line the source line number
+*/
+typedef struct PSI_table_locker* (*start_table_lock_wait_v1_t)
+  (struct PSI_table_locker_state_v1 *state,
+   struct PSI_table *table,
+   enum PSI_table_lock_operation op,
+   ulong flags,
+   const char *src_file, uint src_line);
 
 /**
-  Record a table instrumentation wait end event.
+  Record a table instrumentation lock wait end event.
   @param locker a table locker for the running thread
 */
-typedef void (*end_table_wait_v1_t)(struct PSI_table_locker *locker);
+typedef void (*end_table_lock_wait_v1_t)(struct PSI_table_locker *locker);
 
 /**
   Start a file instrumentation open operation.
@@ -1025,6 +1617,260 @@ typedef void (*end_file_wait_v1_t)
   (struct PSI_file_locker *locker, size_t count);
 
 /**
+  Start a new stage, and implicitly end the previous stage.
+  @param key the key of the new stage
+  @param src_file the source file name
+  @param src_line the source line number
+*/
+typedef void (*start_stage_v1_t)
+  (PSI_stage_key key, const char *src_file, int src_line);
+
+/** End the current stage. */
+typedef void (*end_stage_v1_t) (void);
+
+/**
+  Get a statement instrumentation locker.
+  @param state data storage for the locker
+  @param key the statement instrumentation key
+  @return a statement locker, or NULL
+*/
+typedef struct PSI_statement_locker* (*get_thread_statement_locker_v1_t)
+  (struct PSI_statement_locker_state_v1 *state,
+   PSI_statement_key key);
+
+/**
+  Refine a statement locker to a more specific key.
+  Note that only events declared mutable can be refined.
+  @param the statement locker for the current event
+  @param key the new key for the event
+  @sa PSI_FLAG_MUTABLE
+*/
+typedef struct PSI_statement_locker* (*refine_statement_v1_t)
+  (struct PSI_statement_locker *locker,
+   PSI_statement_key key);
+
+/**
+  Start a new statement event.
+  @param locker the statement locker for this event
+  @param db the active database name for this statement
+  @param db_length the active database name length for this statement
+  @param src_file source file name
+  @param src_line source line number
+*/
+typedef void (*start_statement_v1_t)
+  (struct PSI_statement_locker *locker,
+   const char *db, uint db_length,
+   const char *src_file, uint src_line);
+
+/**
+  Set the statement text for a statement event.
+  @param locker the current statement locker
+  @param text the statement text
+  @param text_len the statement text length
+*/
+typedef void (*set_statement_text_v1_t)
+  (struct PSI_statement_locker *locker,
+   const char *text, uint text_len);
+
+/**
+  Set a statement event lock time.
+  @param locker the statement locker
+  @param lock_time the locked time, in microseconds
+*/
+typedef void (*set_statement_lock_time_t)
+  (struct PSI_statement_locker *locker, ulonglong lock_time);
+
+/**
+  Set a statement event rows sent metric.
+  @param locker the statement locker
+  @param count the number of rows sent
+*/
+typedef void (*set_statement_rows_sent_t)
+  (struct PSI_statement_locker *locker, ulonglong count);
+
+/**
+  Set a statement event rows examined metric.
+  @param locker the statement locker
+  @param count the number of rows examined
+*/
+typedef void (*set_statement_rows_examined_t)
+  (struct PSI_statement_locker *locker, ulonglong count);
+
+/**
+  Increment a statement event "created tmp disk tables" metric.
+  @param locker the statement locker
+  @param count the metric increment value
+*/
+typedef void (*inc_statement_created_tmp_disk_tables_t)
+  (struct PSI_statement_locker *locker, ulong count);
+
+/**
+  Increment a statement event "created tmp tables" metric.
+  @param locker the statement locker
+  @param count the metric increment value
+*/
+typedef void (*inc_statement_created_tmp_tables_t)
+  (struct PSI_statement_locker *locker, ulong count);
+
+/**
+  Increment a statement event "select full join" metric.
+  @param locker the statement locker
+  @param count the metric increment value
+*/
+typedef void (*inc_statement_select_full_join_t)
+  (struct PSI_statement_locker *locker, ulong count);
+
+/**
+  Increment a statement event "select full range join" metric.
+  @param locker the statement locker
+  @param count the metric increment value
+*/
+typedef void (*inc_statement_select_full_range_join_t)
+  (struct PSI_statement_locker *locker, ulong count);
+
+/**
+  Increment a statement event "select range join" metric.
+  @param locker the statement locker
+  @param count the metric increment value
+*/
+typedef void (*inc_statement_select_range_t)
+  (struct PSI_statement_locker *locker, ulong count);
+
+/**
+  Increment a statement event "select range check" metric.
+  @param locker the statement locker
+  @param count the metric increment value
+*/
+typedef void (*inc_statement_select_range_check_t)
+  (struct PSI_statement_locker *locker, ulong count);
+
+/**
+  Increment a statement event "select scan" metric.
+  @param locker the statement locker
+  @param count the metric increment value
+*/
+typedef void (*inc_statement_select_scan_t)
+  (struct PSI_statement_locker *locker, ulong count);
+
+/**
+  Increment a statement event "sort merge passes" metric.
+  @param locker the statement locker
+  @param count the metric increment value
+*/
+typedef void (*inc_statement_sort_merge_passes_t)
+  (struct PSI_statement_locker *locker, ulong count);
+
+/**
+  Increment a statement event "sort range" metric.
+  @param locker the statement locker
+  @param count the metric increment value
+*/
+typedef void (*inc_statement_sort_range_t)
+  (struct PSI_statement_locker *locker, ulong count);
+
+/**
+  Increment a statement event "sort rows" metric.
+  @param locker the statement locker
+  @param count the metric increment value
+*/
+typedef void (*inc_statement_sort_rows_t)
+  (struct PSI_statement_locker *locker, ulong count);
+
+/**
+  Increment a statement event "sort scan" metric.
+  @param locker the statement locker
+  @param count the metric increment value
+*/
+typedef void (*inc_statement_sort_scan_t)
+  (struct PSI_statement_locker *locker, ulong count);
+
+/**
+  Set a statement event "no index used" metric.
+  @param locker the statement locker
+  @param count the metric value
+*/
+typedef void (*set_statement_no_index_used_t)
+  (struct PSI_statement_locker *locker);
+
+/**
+  Set a statement event "no good index used" metric.
+  @param locker the statement locker
+  @param count the metric value
+*/
+typedef void (*set_statement_no_good_index_used_t)
+  (struct PSI_statement_locker *locker);
+
+/**
+  End a statement event.
+  @param locker the statement locker
+  @param stmt_da the statement diagnostics area.
+  @sa Diagnostics_area
+*/
+typedef void (*end_statement_v1_t)
+  (struct PSI_statement_locker *locker, void *stmt_da);
+
+/**
+  Record a socket instrumentation start event.
+  @param locker a socket locker for the running thread
+  @param op socket operation to be performed
+  @param count the number of bytes requested, or 0 if not applicable
+  @param src_file the source file name
+  @param src_line the source line number
+*/
+typedef struct PSI_socket_locker* (*start_socket_wait_v1_t)
+  (struct PSI_socket_locker_state_v1 *state,
+   struct PSI_socket *socket,
+   enum PSI_socket_operation op,
+   size_t count,
+   const char *src_file, uint src_line);
+
+/**
+  Record a socket instrumentation end event.
+  Note that for socket close operations, the instrumented socket handle
+  associated with the socket (which was provided to obtain a locker)
+  is invalid after this call.
+  @param locker a socket locker for the running thread
+  @param count the number of bytes actually used in the operation,
+  or 0 if not applicable, or -1 if the operation failed
+  @sa get_thread_socket_locker
+*/
+typedef void (*end_socket_wait_v1_t)
+  (struct PSI_socket_locker *locker, size_t count);
+
+/**
+  Set the socket state for an instrumented socket.
+    @param socket the instrumented socket
+    @param state socket state
+  */
+typedef void (*set_socket_state_v1_t)(struct PSI_socket *socket,
+                                      enum PSI_socket_state state);
+
+/**
+  Set the socket info for an instrumented socket.
+  @param socket the instrumented socket
+  @param fd the socket descriptor
+  @param addr the socket ip address
+  @param addr_len length of socket ip address
+  @param thread_id associated thread id
+*/
+typedef void (*set_socket_info_v1_t)(struct PSI_socket *socket,
+                                     const my_socket *fd,
+                                     const struct sockaddr *addr,
+                                     socklen_t addr_len);
+
+/**
+  Bind a socket to the thread that owns it.
+  @param socket instrumented socket
+*/
+typedef void (*set_socket_thread_owner_v1_t)(struct PSI_socket *socket);
+
+typedef struct PSI_digest_locker * (*digest_start_v1_t)
+  (struct PSI_statement_locker *locker);
+
+typedef struct PSI_digest_locker* (*digest_add_token_v1_t)
+  (struct PSI_digest_locker *locker, uint token, struct OPAQUE_LEX_YYSTYPE *yylval);
+
+/**
   Performance Schema Interface, version 1.
   @since PSI_VERSION_1
 */
@@ -1040,6 +1886,12 @@ struct PSI_v1
   register_thread_v1_t register_thread;
   /** @sa register_file_v1_t. */
   register_file_v1_t register_file;
+  /** @sa register_stage_v1_t. */
+  register_stage_v1_t register_stage;
+  /** @sa register_statement_v1_t. */
+  register_statement_v1_t register_statement;
+  /** @sa register_socket_v1_t. */
+  register_socket_v1_t register_socket;
   /** @sa init_mutex_v1_t. */
   init_mutex_v1_t init_mutex;
   /** @sa destroy_mutex_v1_t. */
@@ -1052,12 +1904,22 @@ struct PSI_v1
   init_cond_v1_t init_cond;
   /** @sa destroy_cond_v1_t. */
   destroy_cond_v1_t destroy_cond;
+  /** @sa init_socket_v1_t. */
+  init_socket_v1_t init_socket;
+  /** @sa destroy_socket_v1_t. */
+  destroy_socket_v1_t destroy_socket;
   /** @sa get_table_share_v1_t. */
   get_table_share_v1_t get_table_share;
   /** @sa release_table_share_v1_t. */
   release_table_share_v1_t release_table_share;
+  /** @sa drop_table_share_v1_t. */
+  drop_table_share_v1_t drop_table_share;
   /** @sa open_table_v1_t. */
   open_table_v1_t open_table;
+  /** @sa unbind_table_v1_t. */
+  unbind_table_v1_t unbind_table;
+  /** @sa rebind_table_v1_t. */
+  rebind_table_v1_t rebind_table;
   /** @sa close_table_v1_t. */
   close_table_v1_t close_table;
   /** @sa create_file_v1_t. */
@@ -1070,20 +1932,26 @@ struct PSI_v1
   set_thread_id_v1_t set_thread_id;
   /** @sa get_thread_v1_t. */
   get_thread_v1_t get_thread;
+  /** @sa set_thread_user_v1_t. */
+  set_thread_user_v1_t set_thread_user;
+  /** @sa set_thread_user_host_v1_t. */
+  set_thread_user_host_v1_t set_thread_user_host;
+  /** @sa set_thread_db_v1_t. */
+  set_thread_db_v1_t set_thread_db;
+  /** @sa set_thread_command_v1_t. */
+  set_thread_command_v1_t set_thread_command;
+  /** @sa set_thread_start_time_v1_t. */
+  set_thread_start_time_v1_t set_thread_start_time;
+  /** @sa set_thread_state_v1_t. */
+  set_thread_state_v1_t set_thread_state;
+  /** @sa set_thread_info_v1_t. */
+  set_thread_info_v1_t set_thread_info;
   /** @sa set_thread_v1_t. */
   set_thread_v1_t set_thread;
   /** @sa delete_current_thread_v1_t. */
   delete_current_thread_v1_t delete_current_thread;
   /** @sa delete_thread_v1_t. */
   delete_thread_v1_t delete_thread;
-  /** @sa get_thread_mutex_locker_v1_t. */
-  get_thread_mutex_locker_v1_t get_thread_mutex_locker;
-  /** @sa get_thread_rwlock_locker_v1_t. */
-  get_thread_rwlock_locker_v1_t get_thread_rwlock_locker;
-  /** @sa get_thread_cond_locker_v1_t. */
-  get_thread_cond_locker_v1_t get_thread_cond_locker;
-  /** @sa get_thread_table_locker_v1_t. */
-  get_thread_table_locker_v1_t get_thread_table_locker;
   /** @sa get_thread_file_name_locker_v1_t. */
   get_thread_file_name_locker_v1_t get_thread_file_name_locker;
   /** @sa get_thread_file_stream_locker_v1_t. */
@@ -1098,6 +1966,10 @@ struct PSI_v1
   signal_cond_v1_t signal_cond;
   /** @sa broadcast_cond_v1_t. */
   broadcast_cond_v1_t broadcast_cond;
+  /** @sa start_idle_wait_v1_t. */
+  start_idle_wait_v1_t start_idle_wait;
+  /** @sa end_idle_wait_v1_t. */
+  end_idle_wait_v1_t end_idle_wait;
   /** @sa start_mutex_wait_v1_t. */
   start_mutex_wait_v1_t start_mutex_wait;
   /** @sa end_mutex_wait_v1_t. */
@@ -1114,10 +1986,14 @@ struct PSI_v1
   start_cond_wait_v1_t start_cond_wait;
   /** @sa end_cond_wait_v1_t. */
   end_cond_wait_v1_t end_cond_wait;
-  /** @sa start_table_wait_v1_t. */
-  start_table_wait_v1_t start_table_wait;
-  /** @sa end_table_wait_v1_t. */
-  end_table_wait_v1_t end_table_wait;
+  /** @sa start_table_io_wait_v1_t. */
+  start_table_io_wait_v1_t start_table_io_wait;
+  /** @sa end_table_io_wait_v1_t. */
+  end_table_io_wait_v1_t end_table_io_wait;
+  /** @sa start_table_lock_wait_v1_t. */
+  start_table_lock_wait_v1_t start_table_lock_wait;
+  /** @sa end_table_lock_wait_v1_t. */
+  end_table_lock_wait_v1_t end_table_lock_wait;
   /** @sa start_file_open_wait_v1_t. */
   start_file_open_wait_v1_t start_file_open_wait;
   /** @sa end_file_open_wait_v1_t. */
@@ -1129,6 +2005,66 @@ struct PSI_v1
   start_file_wait_v1_t start_file_wait;
   /** @sa end_file_wait_v1_t. */
   end_file_wait_v1_t end_file_wait;
+  /** @sa start_stage_v1_t. */
+  start_stage_v1_t start_stage;
+  /** @sa end_stage_v1_t. */
+  end_stage_v1_t end_stage;
+  /** @sa get_thread_statement_locker_v1_t. */
+  get_thread_statement_locker_v1_t get_thread_statement_locker;
+  /** @sa refine_statement_v1_t. */
+  refine_statement_v1_t refine_statement;
+  /** @sa start_statement_v1_t. */
+  start_statement_v1_t start_statement;
+  /** @sa set_statement_text_v1_t. */
+  set_statement_text_v1_t set_statement_text;
+  /** @sa set_statement_lock_time_t. */
+  set_statement_lock_time_t set_statement_lock_time;
+  /** @sa set_statement_rows_sent_t. */
+  set_statement_rows_sent_t set_statement_rows_sent;
+  /** @sa set_statement_rows_examined_t. */
+  set_statement_rows_examined_t set_statement_rows_examined;
+  /** @sa inc_statement_created_tmp_disk_tables. */
+  inc_statement_created_tmp_disk_tables_t inc_statement_created_tmp_disk_tables;
+  /** @sa inc_statement_created_tmp_tables. */
+  inc_statement_created_tmp_tables_t inc_statement_created_tmp_tables;
+  /** @sa inc_statement_select_full_join. */
+  inc_statement_select_full_join_t inc_statement_select_full_join;
+  /** @sa inc_statement_select_full_range_join. */
+  inc_statement_select_full_range_join_t inc_statement_select_full_range_join;
+  /** @sa inc_statement_select_range. */
+  inc_statement_select_range_t inc_statement_select_range;
+  /** @sa inc_statement_select_range_check. */
+  inc_statement_select_range_check_t inc_statement_select_range_check;
+  /** @sa inc_statement_select_scan. */
+  inc_statement_select_scan_t inc_statement_select_scan;
+  /** @sa inc_statement_sort_merge_passes. */
+  inc_statement_sort_merge_passes_t inc_statement_sort_merge_passes;
+  /** @sa inc_statement_sort_range. */
+  inc_statement_sort_range_t inc_statement_sort_range;
+  /** @sa inc_statement_sort_rows. */
+  inc_statement_sort_rows_t inc_statement_sort_rows;
+  /** @sa inc_statement_sort_scan. */
+  inc_statement_sort_scan_t inc_statement_sort_scan;
+  /** @sa set_statement_no_index_used. */
+  set_statement_no_index_used_t set_statement_no_index_used;
+  /** @sa set_statement_no_good_index_used. */
+  set_statement_no_good_index_used_t set_statement_no_good_index_used;
+  /** @sa end_statement_v1_t. */
+  end_statement_v1_t end_statement;
+  /** @sa start_socket_wait_v1_t. */
+  start_socket_wait_v1_t start_socket_wait;
+  /** @sa end_socket_wait_v1_t. */
+  end_socket_wait_v1_t end_socket_wait;
+  /** @sa set_socket_state_v1_t. */
+  set_socket_state_v1_t set_socket_state;
+  /** @sa set_socket_info_v1_t. */
+  set_socket_info_v1_t set_socket_info;
+  /** @sa set_socket_thread_owner_v1_t. */
+  set_socket_thread_owner_v1_t set_socket_thread_owner;
+  /** @sa digest_start_v1_t. */
+  digest_start_v1_t digest_start;
+  /** @sa digest_add_token_v1_t. */
+  digest_add_token_v1_t digest_add_token;
 };
 
 /** @} (end of group Group_PSI_v1) */
@@ -1194,36 +2130,76 @@ struct PSI_file_info_v2
   int placeholder;
 };
 
+/** Placeholder */
+struct PSI_stage_info_v2
+{
+  /** Placeholder */
+  int placeholder;
+};
+
+/** Placeholder */
+struct PSI_statement_info_v2
+{
+  /** Placeholder */
+  int placeholder;
+};
+
+/** Placeholder */
+struct PSI_idle_locker_state_v2
+{
+  /** Placeholder */
+  int placeholder;
+};
+
+/** Placeholder */
 struct PSI_mutex_locker_state_v2
 {
   /** Placeholder */
   int placeholder;
 };
 
+/** Placeholder */
 struct PSI_rwlock_locker_state_v2
 {
   /** Placeholder */
   int placeholder;
 };
 
+/** Placeholder */
 struct PSI_cond_locker_state_v2
 {
   /** Placeholder */
   int placeholder;
 };
 
+/** Placeholder */
 struct PSI_file_locker_state_v2
 {
   /** Placeholder */
   int placeholder;
 };
 
+/** Placeholder */
 struct PSI_table_locker_state_v2
 {
   /** Placeholder */
   int placeholder;
 };
 
+/** Placeholder */
+struct PSI_statement_locker_state_v2
+{
+  /** Placeholder */
+  int placeholder;
+};
+
+/** Placeholder */
+struct PSI_socket_locker_state_v2
+{
+  /** Placeholder */
+  int placeholder;
+};
+
 /** @} (end of group Group_PSI_v2) */
 
 #endif /* HAVE_PSI_2 */
@@ -1267,11 +2243,17 @@ typedef struct PSI_rwlock_info_v1 PSI_rwlock_info;
 typedef struct PSI_cond_info_v1 PSI_cond_info;
 typedef struct PSI_thread_info_v1 PSI_thread_info;
 typedef struct PSI_file_info_v1 PSI_file_info;
+typedef struct PSI_stage_info_v1 PSI_stage_info;
+typedef struct PSI_statement_info_v1 PSI_statement_info;
+typedef struct PSI_socket_info_v1 PSI_socket_info;
+typedef struct PSI_idle_locker_state_v1 PSI_idle_locker_state;
 typedef struct PSI_mutex_locker_state_v1 PSI_mutex_locker_state;
 typedef struct PSI_rwlock_locker_state_v1 PSI_rwlock_locker_state;
 typedef struct PSI_cond_locker_state_v1 PSI_cond_locker_state;
 typedef struct PSI_file_locker_state_v1 PSI_file_locker_state;
 typedef struct PSI_table_locker_state_v1 PSI_table_locker_state;
+typedef struct PSI_statement_locker_state_v1 PSI_statement_locker_state;
+typedef struct PSI_socket_locker_state_v1 PSI_socket_locker_state;
 #endif
 
 #ifdef USE_PSI_2
@@ -1281,11 +2263,17 @@ typedef struct PSI_rwlock_info_v2 PSI_rwlock_info;
 typedef struct PSI_cond_info_v2 PSI_cond_info;
 typedef struct PSI_thread_info_v2 PSI_thread_info;
 typedef struct PSI_file_info_v2 PSI_file_info;
+typedef struct PSI_stage_info_v2 PSI_stage_info;
+typedef struct PSI_statement_info_v2 PSI_statement_info;
+typedef struct PSI_socket_info_v2 PSI_socket_info;
+typedef struct PSI_idle_locker_state_v2 PSI_idle_locker_state;
 typedef struct PSI_mutex_locker_state_v2 PSI_mutex_locker_state;
 typedef struct PSI_rwlock_locker_state_v2 PSI_rwlock_locker_state;
 typedef struct PSI_cond_locker_state_v2 PSI_cond_locker_state;
 typedef struct PSI_file_locker_state_v2 PSI_file_locker_state;
 typedef struct PSI_table_locker_state_v2 PSI_table_locker_state;
+typedef struct PSI_statement_locker_state_v2 PSI_statement_locker_state;
+typedef struct PSI_socket_locker_state_v2 PSI_socket_locker_state;
 #endif
 
 #else /* HAVE_PSI_INTERFACE */
@@ -1301,10 +2289,37 @@ struct PSI_none
 };
 typedef struct PSI_none PSI;
 
+/**
+  Stage instrument information.
+  @since PSI_VERSION_1
+  This structure is used to register an instrumented stage.
+*/
+struct PSI_stage_info_none
+{
+  /** Unused stage key. */
+  unsigned int m_key;
+  /** The name of the stage instrument. */
+  const char *m_name;
+  /** Unused stage flags. */
+  int m_flags;
+};
+
+/**
+  The stage instrumentation has to co exist with the legacy
+  THD::set_proc_info instrumentation.
+  To avoid duplication of the instrumentation in the server,
+  the common PSI_stage_info structure is used,
+  so we export it here, even when not building
+  with HAVE_PSI_INTERFACE.
+*/
+typedef struct PSI_stage_info_none PSI_stage_info;
+
 #endif /* HAVE_PSI_INTERFACE */
 
 extern MYSQL_PLUGIN_IMPORT PSI *PSI_server;
 
+#define PSI_CALL(M) PSI_server->M
+
 /** @} */
 
 C_MODE_END
diff --git a/include/mysql/psi/psi_abi_v0.h b/include/mysql/psi/psi_abi_v0.h
new file mode 100644
index 00000000000..7444cb0b276
--- /dev/null
+++ b/include/mysql/psi/psi_abi_v0.h
@@ -0,0 +1,24 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file mysql/psi/psi_abi_v0.h
+  ABI check for mysql/psi/psi.h, when compiling without instrumentation.
+  This file is only used to automate detection of changes between versions.
+  Do not include this file, include mysql/psi/psi.h instead.
+*/
+#define _global_h
+#include "mysql/psi/psi.h"
+
diff --git a/include/mysql/psi/psi_abi_v0.h.pp b/include/mysql/psi/psi_abi_v0.h.pp
new file mode 100644
index 00000000000..b46b38ed144
--- /dev/null
+++ b/include/mysql/psi/psi_abi_v0.h.pp
@@ -0,0 +1,47 @@
+#include "mysql/psi/psi.h"
+C_MODE_START
+struct TABLE_SHARE;
+struct OPAQUE_LEX_YYSTYPE;
+struct PSI_mutex;
+typedef struct PSI_mutex PSI_mutex;
+struct PSI_rwlock;
+typedef struct PSI_rwlock PSI_rwlock;
+struct PSI_cond;
+typedef struct PSI_cond PSI_cond;
+struct PSI_table_share;
+typedef struct PSI_table_share PSI_table_share;
+struct PSI_table;
+typedef struct PSI_table PSI_table;
+struct PSI_thread;
+typedef struct PSI_thread PSI_thread;
+struct PSI_file;
+typedef struct PSI_file PSI_file;
+struct PSI_socket;
+typedef struct PSI_socket PSI_socket;
+struct PSI_table_locker;
+typedef struct PSI_table_locker PSI_table_locker;
+struct PSI_statement_locker;
+typedef struct PSI_statement_locker PSI_statement_locker;
+struct PSI_idle_locker;
+typedef struct PSI_idle_locker PSI_idle_locker;
+struct PSI_digest_locker;
+typedef struct PSI_digest_locker PSI_digest_locker;
+struct PSI_bootstrap
+{
+  void* (*get_interface)(int version);
+};
+typedef struct PSI_bootstrap PSI_bootstrap;
+struct PSI_none
+{
+  int opaque;
+};
+typedef struct PSI_none PSI;
+struct PSI_stage_info_none
+{
+  unsigned int m_key;
+  const char *m_name;
+  int m_flags;
+};
+typedef struct PSI_stage_info_none PSI_stage_info;
+extern MYSQL_PLUGIN_IMPORT PSI *PSI_server;
+C_MODE_END
diff --git a/include/mysql/psi/psi_abi_v1.h b/include/mysql/psi/psi_abi_v1.h
index 0f62291696f..54c49f0c518 100644
--- a/include/mysql/psi/psi_abi_v1.h
+++ b/include/mysql/psi/psi_abi_v1.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -21,6 +21,6 @@
 */
 #define USE_PSI_1
 #define HAVE_PSI_INTERFACE
-#define _global_h
+#define MY_GLOBAL_INCLUDED
 #include "mysql/psi/psi.h"
 
diff --git a/include/mysql/psi/psi_abi_v1.h.pp b/include/mysql/psi/psi_abi_v1.h.pp
index adb3010469b..b0559213998 100644
--- a/include/mysql/psi/psi_abi_v1.h.pp
+++ b/include/mysql/psi/psi_abi_v1.h.pp
@@ -1,25 +1,52 @@
 #include "mysql/psi/psi.h"
 C_MODE_START
+struct TABLE_SHARE;
+struct OPAQUE_LEX_YYSTYPE;
 struct PSI_mutex;
+typedef struct PSI_mutex PSI_mutex;
 struct PSI_rwlock;
+typedef struct PSI_rwlock PSI_rwlock;
 struct PSI_cond;
+typedef struct PSI_cond PSI_cond;
 struct PSI_table_share;
+typedef struct PSI_table_share PSI_table_share;
 struct PSI_table;
+typedef struct PSI_table PSI_table;
 struct PSI_thread;
+typedef struct PSI_thread PSI_thread;
 struct PSI_file;
+typedef struct PSI_file PSI_file;
+struct PSI_socket;
+typedef struct PSI_socket PSI_socket;
+struct PSI_table_locker;
+typedef struct PSI_table_locker PSI_table_locker;
+struct PSI_statement_locker;
+typedef struct PSI_statement_locker PSI_statement_locker;
+struct PSI_idle_locker;
+typedef struct PSI_idle_locker PSI_idle_locker;
+struct PSI_digest_locker;
+typedef struct PSI_digest_locker PSI_digest_locker;
 struct PSI_bootstrap
 {
   void* (*get_interface)(int version);
 };
+typedef struct PSI_bootstrap PSI_bootstrap;
 struct PSI_mutex_locker;
+typedef struct PSI_mutex_locker PSI_mutex_locker;
 struct PSI_rwlock_locker;
+typedef struct PSI_rwlock_locker PSI_rwlock_locker;
 struct PSI_cond_locker;
+typedef struct PSI_cond_locker PSI_cond_locker;
 struct PSI_file_locker;
+typedef struct PSI_file_locker PSI_file_locker;
+struct PSI_socket_locker;
+typedef struct PSI_socket_locker PSI_socket_locker;
 enum PSI_mutex_operation
 {
   PSI_MUTEX_LOCK= 0,
   PSI_MUTEX_TRYLOCK= 1
 };
+typedef enum PSI_mutex_operation PSI_mutex_operation;
 enum PSI_rwlock_operation
 {
   PSI_RWLOCK_READLOCK= 0,
@@ -27,11 +54,13 @@ enum PSI_rwlock_operation
   PSI_RWLOCK_TRYREADLOCK= 2,
   PSI_RWLOCK_TRYWRITELOCK= 3
 };
+typedef enum PSI_rwlock_operation PSI_rwlock_operation;
 enum PSI_cond_operation
 {
   PSI_COND_WAIT= 0,
   PSI_COND_TIMEDWAIT= 1
 };
+typedef enum PSI_cond_operation PSI_cond_operation;
 enum PSI_file_operation
 {
   PSI_FILE_CREATE= 0,
@@ -52,12 +81,54 @@ enum PSI_file_operation
   PSI_FILE_RENAME= 15,
   PSI_FILE_SYNC= 16
 };
-struct PSI_table_locker;
+typedef enum PSI_file_operation PSI_file_operation;
+enum PSI_table_io_operation
+{
+  PSI_TABLE_FETCH_ROW= 0,
+  PSI_TABLE_WRITE_ROW= 1,
+  PSI_TABLE_UPDATE_ROW= 2,
+  PSI_TABLE_DELETE_ROW= 3
+};
+typedef enum PSI_table_io_operation PSI_table_io_operation;
+enum PSI_table_lock_operation
+{
+  PSI_TABLE_LOCK= 0,
+  PSI_TABLE_EXTERNAL_LOCK= 1
+};
+typedef enum PSI_table_lock_operation PSI_table_lock_operation;
+enum PSI_socket_state
+{
+  PSI_SOCKET_STATE_IDLE= 1,
+  PSI_SOCKET_STATE_ACTIVE= 2
+};
+typedef enum PSI_socket_state PSI_socket_state;
+enum PSI_socket_operation
+{
+  PSI_SOCKET_CREATE= 0,
+  PSI_SOCKET_CONNECT= 1,
+  PSI_SOCKET_BIND= 2,
+  PSI_SOCKET_CLOSE= 3,
+  PSI_SOCKET_SEND= 4,
+  PSI_SOCKET_RECV= 5,
+  PSI_SOCKET_SENDTO= 6,
+  PSI_SOCKET_RECVFROM= 7,
+  PSI_SOCKET_SENDMSG= 8,
+  PSI_SOCKET_RECVMSG= 9,
+  PSI_SOCKET_SEEK= 10,
+  PSI_SOCKET_OPT= 11,
+  PSI_SOCKET_STAT= 12,
+  PSI_SOCKET_SHUTDOWN= 13,
+  PSI_SOCKET_SELECT= 14
+};
+typedef enum PSI_socket_operation PSI_socket_operation;
 typedef unsigned int PSI_mutex_key;
 typedef unsigned int PSI_rwlock_key;
 typedef unsigned int PSI_cond_key;
 typedef unsigned int PSI_thread_key;
 typedef unsigned int PSI_file_key;
+typedef unsigned int PSI_stage_key;
+typedef unsigned int PSI_statement_key;
+typedef unsigned int PSI_socket_key;
 struct PSI_mutex_info_v1
 {
   PSI_mutex_key *m_key;
@@ -88,67 +159,135 @@ struct PSI_file_info_v1
   const char *m_name;
   int m_flags;
 };
+struct PSI_stage_info_v1
+{
+  PSI_stage_key m_key;
+  const char *m_name;
+  int m_flags;
+};
+struct PSI_statement_info_v1
+{
+  PSI_statement_key m_key;
+  const char *m_name;
+  int m_flags;
+};
+struct PSI_socket_info_v1
+{
+  PSI_socket_key *m_key;
+  const char *m_name;
+  int m_flags;
+};
+struct PSI_idle_locker_state_v1
+{
+  uint m_flags;
+  struct PSI_thread *m_thread;
+  ulonglong m_timer_start;
+  ulonglong (*m_timer)(void);
+  void *m_wait;
+};
 struct PSI_mutex_locker_state_v1
 {
   uint m_flags;
+  enum PSI_mutex_operation m_operation;
   struct PSI_mutex *m_mutex;
   struct PSI_thread *m_thread;
   ulonglong m_timer_start;
   ulonglong (*m_timer)(void);
-  enum PSI_mutex_operation m_operation;
-  const char* m_src_file;
-  int m_src_line;
   void *m_wait;
 };
 struct PSI_rwlock_locker_state_v1
 {
   uint m_flags;
+  enum PSI_rwlock_operation m_operation;
   struct PSI_rwlock *m_rwlock;
   struct PSI_thread *m_thread;
   ulonglong m_timer_start;
   ulonglong (*m_timer)(void);
-  enum PSI_rwlock_operation m_operation;
-  const char* m_src_file;
-  int m_src_line;
   void *m_wait;
 };
 struct PSI_cond_locker_state_v1
 {
   uint m_flags;
+  enum PSI_cond_operation m_operation;
   struct PSI_cond *m_cond;
   struct PSI_mutex *m_mutex;
   struct PSI_thread *m_thread;
   ulonglong m_timer_start;
   ulonglong (*m_timer)(void);
-  enum PSI_cond_operation m_operation;
-  const char* m_src_file;
-  int m_src_line;
   void *m_wait;
 };
 struct PSI_file_locker_state_v1
 {
   uint m_flags;
+  enum PSI_file_operation m_operation;
   struct PSI_file *m_file;
   struct PSI_thread *m_thread;
   size_t m_number_of_bytes;
   ulonglong m_timer_start;
   ulonglong (*m_timer)(void);
-  enum PSI_file_operation m_operation;
-  const char* m_src_file;
-  int m_src_line;
   void *m_wait;
 };
 struct PSI_table_locker_state_v1
 {
   uint m_flags;
+  enum PSI_table_io_operation m_io_operation;
   struct PSI_table *m_table;
   struct PSI_table_share *m_table_share;
-  void *m_class;
   struct PSI_thread *m_thread;
   ulonglong m_timer_start;
   ulonglong (*m_timer)(void);
+  void *m_wait;
   uint m_index;
-  uint m_lock_index;
+};
+struct PSI_digest_storage
+{
+  my_bool m_full;
+  int m_byte_count;
+  unsigned char m_token_array[1024];
+};
+typedef struct PSI_digest_storage PSI_digest_storage;
+struct PSI_digest_locker_state
+{
+  int m_last_id_index;
+  PSI_digest_storage m_digest_storage;
+};
+typedef struct PSI_digest_locker_state PSI_digest_locker_state;
+struct PSI_statement_locker_state_v1
+{
+  my_bool m_discarded;
+  uchar m_no_index_used;
+  uchar m_no_good_index_used;
+  uint m_flags;
+  void *m_class;
+  struct PSI_thread *m_thread;
+  ulonglong m_timer_start;
+  ulonglong (*m_timer)(void);
+  void *m_statement;
+  ulonglong m_lock_time;
+  ulonglong m_rows_sent;
+  ulonglong m_rows_examined;
+  ulong m_created_tmp_disk_tables;
+  ulong m_created_tmp_tables;
+  ulong m_select_full_join;
+  ulong m_select_full_range_join;
+  ulong m_select_range;
+  ulong m_select_range_check;
+  ulong m_select_scan;
+  ulong m_sort_merge_passes;
+  ulong m_sort_range;
+  ulong m_sort_rows;
+  ulong m_sort_scan;
+  PSI_digest_locker_state m_digest_state;
+};
+struct PSI_socket_locker_state_v1
+{
+  uint m_flags;
+  struct PSI_socket *m_socket;
+  struct PSI_thread *m_thread;
+  size_t m_number_of_bytes;
+  ulonglong m_timer_start;
+  ulonglong (*m_timer)(void);
+  enum PSI_socket_operation m_operation;
   const char* m_src_file;
   int m_src_line;
   void *m_wait;
@@ -163,6 +302,12 @@ typedef void (*register_thread_v1_t)
   (const char *category, struct PSI_thread_info_v1 *info, int count);
 typedef void (*register_file_v1_t)
   (const char *category, struct PSI_file_info_v1 *info, int count);
+typedef void (*register_stage_v1_t)
+  (const char *category, struct PSI_stage_info_v1 **info, int count);
+typedef void (*register_statement_v1_t)
+  (const char *category, struct PSI_statement_info_v1 *info, int count);
+typedef void (*register_socket_v1_t)
+  (const char *category, struct PSI_socket_info_v1 *info, int count);
 typedef struct PSI_mutex* (*init_mutex_v1_t)
   (PSI_mutex_key key, const void *identity);
 typedef void (*destroy_mutex_v1_t)(struct PSI_mutex *mutex);
@@ -172,12 +317,21 @@ typedef void (*destroy_rwlock_v1_t)(struct PSI_rwlock *rwlock);
 typedef struct PSI_cond* (*init_cond_v1_t)
   (PSI_cond_key key, const void *identity);
 typedef void (*destroy_cond_v1_t)(struct PSI_cond *cond);
+typedef struct PSI_socket* (*init_socket_v1_t)
+  (PSI_socket_key key, const my_socket *fd);
+typedef void (*destroy_socket_v1_t)(struct PSI_socket *socket);
 typedef struct PSI_table_share* (*get_table_share_v1_t)
-  (const char *schema_name, int schema_name_length, const char *table_name,
-   int table_name_length, const void *identity);
+  (my_bool temporary, struct TABLE_SHARE *share);
 typedef void (*release_table_share_v1_t)(struct PSI_table_share *share);
+typedef void (*drop_table_share_v1_t)
+  (my_bool temporary, const char *schema_name, int schema_name_length,
+   const char *table_name, int table_name_length);
 typedef struct PSI_table* (*open_table_v1_t)
   (struct PSI_table_share *share, const void *identity);
+typedef void (*unbind_table_v1_t)
+  (struct PSI_table *table);
+typedef PSI_table* (*rebind_table_v1_t)
+  (PSI_table_share *share, const void *identity, PSI_table *table);
 typedef void (*close_table_v1_t)(struct PSI_table *table);
 typedef void (*create_file_v1_t)(PSI_file_key key, const char *name,
                                  File file);
@@ -190,24 +344,17 @@ typedef struct PSI_thread* (*new_thread_v1_t)
 typedef void (*set_thread_id_v1_t)(struct PSI_thread *thread,
                                    unsigned long id);
 typedef struct PSI_thread* (*get_thread_v1_t)(void);
+typedef void (*set_thread_user_v1_t)(const char *user, int user_len);
+typedef void (*set_thread_user_host_v1_t)(const char *user, int user_len,
+                                          const char *host, int host_len);
+typedef void (*set_thread_db_v1_t)(const char* db, int db_len);
+typedef void (*set_thread_command_v1_t)(int command);
+typedef void (*set_thread_start_time_v1_t)(time_t start_time);
+typedef void (*set_thread_state_v1_t)(const char* state);
+typedef void (*set_thread_info_v1_t)(const char* info, int info_len);
 typedef void (*set_thread_v1_t)(struct PSI_thread *thread);
 typedef void (*delete_current_thread_v1_t)(void);
 typedef void (*delete_thread_v1_t)(struct PSI_thread *thread);
-typedef struct PSI_mutex_locker* (*get_thread_mutex_locker_v1_t)
-  (struct PSI_mutex_locker_state_v1 *state,
-   struct PSI_mutex *mutex,
-   enum PSI_mutex_operation op);
-typedef struct PSI_rwlock_locker* (*get_thread_rwlock_locker_v1_t)
-  (struct PSI_rwlock_locker_state_v1 *state,
-   struct PSI_rwlock *rwlock,
-   enum PSI_rwlock_operation op);
-typedef struct PSI_cond_locker* (*get_thread_cond_locker_v1_t)
-  (struct PSI_cond_locker_state_v1 *state,
-   struct PSI_cond *cond, struct PSI_mutex *mutex,
-   enum PSI_cond_operation op);
-typedef struct PSI_table_locker* (*get_thread_table_locker_v1_t)
-  (struct PSI_table_locker_state_v1 *state,
-   struct PSI_table *table);
 typedef struct PSI_file_locker* (*get_thread_file_name_locker_v1_t)
   (struct PSI_file_locker_state_v1 *state,
    PSI_file_key key, enum PSI_file_operation op, const char *name,
@@ -226,25 +373,53 @@ typedef void (*signal_cond_v1_t)
   (struct PSI_cond *cond);
 typedef void (*broadcast_cond_v1_t)
   (struct PSI_cond *cond);
-typedef void (*start_mutex_wait_v1_t)
-  (struct PSI_mutex_locker *locker, const char *src_file, uint src_line);
+typedef struct PSI_idle_locker* (*start_idle_wait_v1_t)
+  (struct PSI_idle_locker_state_v1 *state, const char *src_file, uint src_line);
+typedef void (*end_idle_wait_v1_t)
+  (struct PSI_idle_locker *locker);
+typedef struct PSI_mutex_locker* (*start_mutex_wait_v1_t)
+  (struct PSI_mutex_locker_state_v1 *state,
+   struct PSI_mutex *mutex,
+   enum PSI_mutex_operation op,
+   const char *src_file, uint src_line);
 typedef void (*end_mutex_wait_v1_t)
   (struct PSI_mutex_locker *locker, int rc);
-typedef void (*start_rwlock_rdwait_v1_t)
-  (struct PSI_rwlock_locker *locker, const char *src_file, uint src_line);
+typedef struct PSI_rwlock_locker* (*start_rwlock_rdwait_v1_t)
+  (struct PSI_rwlock_locker_state_v1 *state,
+   struct PSI_rwlock *rwlock,
+   enum PSI_rwlock_operation op,
+   const char *src_file, uint src_line);
 typedef void (*end_rwlock_rdwait_v1_t)
   (struct PSI_rwlock_locker *locker, int rc);
-typedef void (*start_rwlock_wrwait_v1_t)
-  (struct PSI_rwlock_locker *locker, const char *src_file, uint src_line);
+typedef struct PSI_rwlock_locker* (*start_rwlock_wrwait_v1_t)
+  (struct PSI_rwlock_locker_state_v1 *state,
+   struct PSI_rwlock *rwlock,
+   enum PSI_rwlock_operation op,
+   const char *src_file, uint src_line);
 typedef void (*end_rwlock_wrwait_v1_t)
   (struct PSI_rwlock_locker *locker, int rc);
-typedef void (*start_cond_wait_v1_t)
-  (struct PSI_cond_locker *locker, const char *src_file, uint src_line);
+typedef struct PSI_cond_locker* (*start_cond_wait_v1_t)
+  (struct PSI_cond_locker_state_v1 *state,
+   struct PSI_cond *cond,
+   struct PSI_mutex *mutex,
+   enum PSI_cond_operation op,
+   const char *src_file, uint src_line);
 typedef void (*end_cond_wait_v1_t)
   (struct PSI_cond_locker *locker, int rc);
-typedef void (*start_table_wait_v1_t)
-  (struct PSI_table_locker *locker, const char *src_file, uint src_line);
-typedef void (*end_table_wait_v1_t)(struct PSI_table_locker *locker);
+typedef struct PSI_table_locker* (*start_table_io_wait_v1_t)
+  (struct PSI_table_locker_state_v1 *state,
+   struct PSI_table *table,
+   enum PSI_table_io_operation op,
+   uint index,
+   const char *src_file, uint src_line);
+typedef void (*end_table_io_wait_v1_t)(struct PSI_table_locker *locker);
+typedef struct PSI_table_locker* (*start_table_lock_wait_v1_t)
+  (struct PSI_table_locker_state_v1 *state,
+   struct PSI_table *table,
+   enum PSI_table_lock_operation op,
+   ulong flags,
+   const char *src_file, uint src_line);
+typedef void (*end_table_lock_wait_v1_t)(struct PSI_table_locker *locker);
 typedef struct PSI_file* (*start_file_open_wait_v1_t)
   (struct PSI_file_locker *locker, const char *src_file, uint src_line);
 typedef void (*end_file_open_wait_v1_t)(struct PSI_file_locker *locker);
@@ -255,6 +430,75 @@ typedef void (*start_file_wait_v1_t)
    const char *src_file, uint src_line);
 typedef void (*end_file_wait_v1_t)
   (struct PSI_file_locker *locker, size_t count);
+typedef void (*start_stage_v1_t)
+  (PSI_stage_key key, const char *src_file, int src_line);
+typedef void (*end_stage_v1_t) (void);
+typedef struct PSI_statement_locker* (*get_thread_statement_locker_v1_t)
+  (struct PSI_statement_locker_state_v1 *state,
+   PSI_statement_key key);
+typedef struct PSI_statement_locker* (*refine_statement_v1_t)
+  (struct PSI_statement_locker *locker,
+   PSI_statement_key key);
+typedef void (*start_statement_v1_t)
+  (struct PSI_statement_locker *locker,
+   const char *db, uint db_length,
+   const char *src_file, uint src_line);
+typedef void (*set_statement_text_v1_t)
+  (struct PSI_statement_locker *locker,
+   const char *text, uint text_len);
+typedef void (*set_statement_lock_time_t)
+  (struct PSI_statement_locker *locker, ulonglong lock_time);
+typedef void (*set_statement_rows_sent_t)
+  (struct PSI_statement_locker *locker, ulonglong count);
+typedef void (*set_statement_rows_examined_t)
+  (struct PSI_statement_locker *locker, ulonglong count);
+typedef void (*inc_statement_created_tmp_disk_tables_t)
+  (struct PSI_statement_locker *locker, ulong count);
+typedef void (*inc_statement_created_tmp_tables_t)
+  (struct PSI_statement_locker *locker, ulong count);
+typedef void (*inc_statement_select_full_join_t)
+  (struct PSI_statement_locker *locker, ulong count);
+typedef void (*inc_statement_select_full_range_join_t)
+  (struct PSI_statement_locker *locker, ulong count);
+typedef void (*inc_statement_select_range_t)
+  (struct PSI_statement_locker *locker, ulong count);
+typedef void (*inc_statement_select_range_check_t)
+  (struct PSI_statement_locker *locker, ulong count);
+typedef void (*inc_statement_select_scan_t)
+  (struct PSI_statement_locker *locker, ulong count);
+typedef void (*inc_statement_sort_merge_passes_t)
+  (struct PSI_statement_locker *locker, ulong count);
+typedef void (*inc_statement_sort_range_t)
+  (struct PSI_statement_locker *locker, ulong count);
+typedef void (*inc_statement_sort_rows_t)
+  (struct PSI_statement_locker *locker, ulong count);
+typedef void (*inc_statement_sort_scan_t)
+  (struct PSI_statement_locker *locker, ulong count);
+typedef void (*set_statement_no_index_used_t)
+  (struct PSI_statement_locker *locker);
+typedef void (*set_statement_no_good_index_used_t)
+  (struct PSI_statement_locker *locker);
+typedef void (*end_statement_v1_t)
+  (struct PSI_statement_locker *locker, void *stmt_da);
+typedef struct PSI_socket_locker* (*start_socket_wait_v1_t)
+  (struct PSI_socket_locker_state_v1 *state,
+   struct PSI_socket *socket,
+   enum PSI_socket_operation op,
+   size_t count,
+   const char *src_file, uint src_line);
+typedef void (*end_socket_wait_v1_t)
+  (struct PSI_socket_locker *locker, size_t count);
+typedef void (*set_socket_state_v1_t)(struct PSI_socket *socket,
+                                      enum PSI_socket_state state);
+typedef void (*set_socket_info_v1_t)(struct PSI_socket *socket,
+                                     const my_socket *fd,
+                                     const struct sockaddr *addr,
+                                     socklen_t addr_len);
+typedef void (*set_socket_thread_owner_v1_t)(struct PSI_socket *socket);
+typedef struct PSI_digest_locker * (*digest_start_v1_t)
+  (struct PSI_statement_locker *locker);
+typedef struct PSI_digest_locker* (*digest_add_token_v1_t)
+  (struct PSI_digest_locker *locker, uint token, struct OPAQUE_LEX_YYSTYPE *yylval);
 struct PSI_v1
 {
   register_mutex_v1_t register_mutex;
@@ -262,28 +506,39 @@ struct PSI_v1
   register_cond_v1_t register_cond;
   register_thread_v1_t register_thread;
   register_file_v1_t register_file;
+  register_stage_v1_t register_stage;
+  register_statement_v1_t register_statement;
+  register_socket_v1_t register_socket;
   init_mutex_v1_t init_mutex;
   destroy_mutex_v1_t destroy_mutex;
   init_rwlock_v1_t init_rwlock;
   destroy_rwlock_v1_t destroy_rwlock;
   init_cond_v1_t init_cond;
   destroy_cond_v1_t destroy_cond;
+  init_socket_v1_t init_socket;
+  destroy_socket_v1_t destroy_socket;
   get_table_share_v1_t get_table_share;
   release_table_share_v1_t release_table_share;
+  drop_table_share_v1_t drop_table_share;
   open_table_v1_t open_table;
+  unbind_table_v1_t unbind_table;
+  rebind_table_v1_t rebind_table;
   close_table_v1_t close_table;
   create_file_v1_t create_file;
   spawn_thread_v1_t spawn_thread;
   new_thread_v1_t new_thread;
   set_thread_id_v1_t set_thread_id;
   get_thread_v1_t get_thread;
+  set_thread_user_v1_t set_thread_user;
+  set_thread_user_host_v1_t set_thread_user_host;
+  set_thread_db_v1_t set_thread_db;
+  set_thread_command_v1_t set_thread_command;
+  set_thread_start_time_v1_t set_thread_start_time;
+  set_thread_state_v1_t set_thread_state;
+  set_thread_info_v1_t set_thread_info;
   set_thread_v1_t set_thread;
   delete_current_thread_v1_t delete_current_thread;
   delete_thread_v1_t delete_thread;
-  get_thread_mutex_locker_v1_t get_thread_mutex_locker;
-  get_thread_rwlock_locker_v1_t get_thread_rwlock_locker;
-  get_thread_cond_locker_v1_t get_thread_cond_locker;
-  get_thread_table_locker_v1_t get_thread_table_locker;
   get_thread_file_name_locker_v1_t get_thread_file_name_locker;
   get_thread_file_stream_locker_v1_t get_thread_file_stream_locker;
   get_thread_file_descriptor_locker_v1_t get_thread_file_descriptor_locker;
@@ -291,6 +546,8 @@ struct PSI_v1
   unlock_rwlock_v1_t unlock_rwlock;
   signal_cond_v1_t signal_cond;
   broadcast_cond_v1_t broadcast_cond;
+  start_idle_wait_v1_t start_idle_wait;
+  end_idle_wait_v1_t end_idle_wait;
   start_mutex_wait_v1_t start_mutex_wait;
   end_mutex_wait_v1_t end_mutex_wait;
   start_rwlock_rdwait_v1_t start_rwlock_rdwait;
@@ -299,14 +556,46 @@ struct PSI_v1
   end_rwlock_wrwait_v1_t end_rwlock_wrwait;
   start_cond_wait_v1_t start_cond_wait;
   end_cond_wait_v1_t end_cond_wait;
-  start_table_wait_v1_t start_table_wait;
-  end_table_wait_v1_t end_table_wait;
+  start_table_io_wait_v1_t start_table_io_wait;
+  end_table_io_wait_v1_t end_table_io_wait;
+  start_table_lock_wait_v1_t start_table_lock_wait;
+  end_table_lock_wait_v1_t end_table_lock_wait;
   start_file_open_wait_v1_t start_file_open_wait;
   end_file_open_wait_v1_t end_file_open_wait;
   end_file_open_wait_and_bind_to_descriptor_v1_t
     end_file_open_wait_and_bind_to_descriptor;
   start_file_wait_v1_t start_file_wait;
   end_file_wait_v1_t end_file_wait;
+  start_stage_v1_t start_stage;
+  end_stage_v1_t end_stage;
+  get_thread_statement_locker_v1_t get_thread_statement_locker;
+  refine_statement_v1_t refine_statement;
+  start_statement_v1_t start_statement;
+  set_statement_text_v1_t set_statement_text;
+  set_statement_lock_time_t set_statement_lock_time;
+  set_statement_rows_sent_t set_statement_rows_sent;
+  set_statement_rows_examined_t set_statement_rows_examined;
+  inc_statement_created_tmp_disk_tables_t inc_statement_created_tmp_disk_tables;
+  inc_statement_created_tmp_tables_t inc_statement_created_tmp_tables;
+  inc_statement_select_full_join_t inc_statement_select_full_join;
+  inc_statement_select_full_range_join_t inc_statement_select_full_range_join;
+  inc_statement_select_range_t inc_statement_select_range;
+  inc_statement_select_range_check_t inc_statement_select_range_check;
+  inc_statement_select_scan_t inc_statement_select_scan;
+  inc_statement_sort_merge_passes_t inc_statement_sort_merge_passes;
+  inc_statement_sort_range_t inc_statement_sort_range;
+  inc_statement_sort_rows_t inc_statement_sort_rows;
+  inc_statement_sort_scan_t inc_statement_sort_scan;
+  set_statement_no_index_used_t set_statement_no_index_used;
+  set_statement_no_good_index_used_t set_statement_no_good_index_used;
+  end_statement_v1_t end_statement;
+  start_socket_wait_v1_t start_socket_wait;
+  end_socket_wait_v1_t end_socket_wait;
+  set_socket_state_v1_t set_socket_state;
+  set_socket_info_v1_t set_socket_info;
+  set_socket_thread_owner_v1_t set_socket_thread_owner;
+  digest_start_v1_t digest_start;
+  digest_add_token_v1_t digest_add_token;
 };
 typedef struct PSI_v1 PSI;
 typedef struct PSI_mutex_info_v1 PSI_mutex_info;
@@ -314,10 +603,16 @@ typedef struct PSI_rwlock_info_v1 PSI_rwlock_info;
 typedef struct PSI_cond_info_v1 PSI_cond_info;
 typedef struct PSI_thread_info_v1 PSI_thread_info;
 typedef struct PSI_file_info_v1 PSI_file_info;
+typedef struct PSI_stage_info_v1 PSI_stage_info;
+typedef struct PSI_statement_info_v1 PSI_statement_info;
+typedef struct PSI_socket_info_v1 PSI_socket_info;
+typedef struct PSI_idle_locker_state_v1 PSI_idle_locker_state;
 typedef struct PSI_mutex_locker_state_v1 PSI_mutex_locker_state;
 typedef struct PSI_rwlock_locker_state_v1 PSI_rwlock_locker_state;
 typedef struct PSI_cond_locker_state_v1 PSI_cond_locker_state;
 typedef struct PSI_file_locker_state_v1 PSI_file_locker_state;
 typedef struct PSI_table_locker_state_v1 PSI_table_locker_state;
+typedef struct PSI_statement_locker_state_v1 PSI_statement_locker_state;
+typedef struct PSI_socket_locker_state_v1 PSI_socket_locker_state;
 extern MYSQL_PLUGIN_IMPORT PSI *PSI_server;
 C_MODE_END
diff --git a/include/mysql/psi/psi_abi_v2.h.pp b/include/mysql/psi/psi_abi_v2.h.pp
index 63f8c52c50a..458013b43e4 100644
--- a/include/mysql/psi/psi_abi_v2.h.pp
+++ b/include/mysql/psi/psi_abi_v2.h.pp
@@ -1,25 +1,52 @@
 #include "mysql/psi/psi.h"
 C_MODE_START
+struct TABLE_SHARE;
+struct OPAQUE_LEX_YYSTYPE;
 struct PSI_mutex;
+typedef struct PSI_mutex PSI_mutex;
 struct PSI_rwlock;
+typedef struct PSI_rwlock PSI_rwlock;
 struct PSI_cond;
+typedef struct PSI_cond PSI_cond;
 struct PSI_table_share;
+typedef struct PSI_table_share PSI_table_share;
 struct PSI_table;
+typedef struct PSI_table PSI_table;
 struct PSI_thread;
+typedef struct PSI_thread PSI_thread;
 struct PSI_file;
+typedef struct PSI_file PSI_file;
+struct PSI_socket;
+typedef struct PSI_socket PSI_socket;
+struct PSI_table_locker;
+typedef struct PSI_table_locker PSI_table_locker;
+struct PSI_statement_locker;
+typedef struct PSI_statement_locker PSI_statement_locker;
+struct PSI_idle_locker;
+typedef struct PSI_idle_locker PSI_idle_locker;
+struct PSI_digest_locker;
+typedef struct PSI_digest_locker PSI_digest_locker;
 struct PSI_bootstrap
 {
   void* (*get_interface)(int version);
 };
+typedef struct PSI_bootstrap PSI_bootstrap;
 struct PSI_mutex_locker;
+typedef struct PSI_mutex_locker PSI_mutex_locker;
 struct PSI_rwlock_locker;
+typedef struct PSI_rwlock_locker PSI_rwlock_locker;
 struct PSI_cond_locker;
+typedef struct PSI_cond_locker PSI_cond_locker;
 struct PSI_file_locker;
+typedef struct PSI_file_locker PSI_file_locker;
+struct PSI_socket_locker;
+typedef struct PSI_socket_locker PSI_socket_locker;
 enum PSI_mutex_operation
 {
   PSI_MUTEX_LOCK= 0,
   PSI_MUTEX_TRYLOCK= 1
 };
+typedef enum PSI_mutex_operation PSI_mutex_operation;
 enum PSI_rwlock_operation
 {
   PSI_RWLOCK_READLOCK= 0,
@@ -27,11 +54,13 @@ enum PSI_rwlock_operation
   PSI_RWLOCK_TRYREADLOCK= 2,
   PSI_RWLOCK_TRYWRITELOCK= 3
 };
+typedef enum PSI_rwlock_operation PSI_rwlock_operation;
 enum PSI_cond_operation
 {
   PSI_COND_WAIT= 0,
   PSI_COND_TIMEDWAIT= 1
 };
+typedef enum PSI_cond_operation PSI_cond_operation;
 enum PSI_file_operation
 {
   PSI_FILE_CREATE= 0,
@@ -52,12 +81,54 @@ enum PSI_file_operation
   PSI_FILE_RENAME= 15,
   PSI_FILE_SYNC= 16
 };
-struct PSI_table_locker;
+typedef enum PSI_file_operation PSI_file_operation;
+enum PSI_table_io_operation
+{
+  PSI_TABLE_FETCH_ROW= 0,
+  PSI_TABLE_WRITE_ROW= 1,
+  PSI_TABLE_UPDATE_ROW= 2,
+  PSI_TABLE_DELETE_ROW= 3
+};
+typedef enum PSI_table_io_operation PSI_table_io_operation;
+enum PSI_table_lock_operation
+{
+  PSI_TABLE_LOCK= 0,
+  PSI_TABLE_EXTERNAL_LOCK= 1
+};
+typedef enum PSI_table_lock_operation PSI_table_lock_operation;
+enum PSI_socket_state
+{
+  PSI_SOCKET_STATE_IDLE= 1,
+  PSI_SOCKET_STATE_ACTIVE= 2
+};
+typedef enum PSI_socket_state PSI_socket_state;
+enum PSI_socket_operation
+{
+  PSI_SOCKET_CREATE= 0,
+  PSI_SOCKET_CONNECT= 1,
+  PSI_SOCKET_BIND= 2,
+  PSI_SOCKET_CLOSE= 3,
+  PSI_SOCKET_SEND= 4,
+  PSI_SOCKET_RECV= 5,
+  PSI_SOCKET_SENDTO= 6,
+  PSI_SOCKET_RECVFROM= 7,
+  PSI_SOCKET_SENDMSG= 8,
+  PSI_SOCKET_RECVMSG= 9,
+  PSI_SOCKET_SEEK= 10,
+  PSI_SOCKET_OPT= 11,
+  PSI_SOCKET_STAT= 12,
+  PSI_SOCKET_SHUTDOWN= 13,
+  PSI_SOCKET_SELECT= 14
+};
+typedef enum PSI_socket_operation PSI_socket_operation;
 typedef unsigned int PSI_mutex_key;
 typedef unsigned int PSI_rwlock_key;
 typedef unsigned int PSI_cond_key;
 typedef unsigned int PSI_thread_key;
 typedef unsigned int PSI_file_key;
+typedef unsigned int PSI_stage_key;
+typedef unsigned int PSI_statement_key;
+typedef unsigned int PSI_socket_key;
 struct PSI_v2
 {
   int placeholder;
@@ -82,6 +153,18 @@ struct PSI_file_info_v2
 {
   int placeholder;
 };
+struct PSI_stage_info_v2
+{
+  int placeholder;
+};
+struct PSI_statement_info_v2
+{
+  int placeholder;
+};
+struct PSI_idle_locker_state_v2
+{
+  int placeholder;
+};
 struct PSI_mutex_locker_state_v2
 {
   int placeholder;
@@ -102,16 +185,30 @@ struct PSI_table_locker_state_v2
 {
   int placeholder;
 };
+struct PSI_statement_locker_state_v2
+{
+  int placeholder;
+};
+struct PSI_socket_locker_state_v2
+{
+  int placeholder;
+};
 typedef struct PSI_v2 PSI;
 typedef struct PSI_mutex_info_v2 PSI_mutex_info;
 typedef struct PSI_rwlock_info_v2 PSI_rwlock_info;
 typedef struct PSI_cond_info_v2 PSI_cond_info;
 typedef struct PSI_thread_info_v2 PSI_thread_info;
 typedef struct PSI_file_info_v2 PSI_file_info;
+typedef struct PSI_stage_info_v2 PSI_stage_info;
+typedef struct PSI_statement_info_v2 PSI_statement_info;
+typedef struct PSI_socket_info_v2 PSI_socket_info;
+typedef struct PSI_idle_locker_state_v2 PSI_idle_locker_state;
 typedef struct PSI_mutex_locker_state_v2 PSI_mutex_locker_state;
 typedef struct PSI_rwlock_locker_state_v2 PSI_rwlock_locker_state;
 typedef struct PSI_cond_locker_state_v2 PSI_cond_locker_state;
 typedef struct PSI_file_locker_state_v2 PSI_file_locker_state;
 typedef struct PSI_table_locker_state_v2 PSI_table_locker_state;
+typedef struct PSI_statement_locker_state_v2 PSI_statement_locker_state;
+typedef struct PSI_socket_locker_state_v2 PSI_socket_locker_state;
 extern MYSQL_PLUGIN_IMPORT PSI *PSI_server;
 C_MODE_END
diff --git a/include/mysql/service_debug_sync.h b/include/mysql/service_debug_sync.h
index f7bab99ac97..55615b8f198 100644
--- a/include/mysql/service_debug_sync.h
+++ b/include/mysql/service_debug_sync.h
@@ -337,10 +337,16 @@ extern void (*debug_sync_C_callback_ptr)(MYSQL_THD, const char *, size_t);
 #define DEBUG_SYNC(thd, name)                           \
   do {                                                  \
     if (debug_sync_service)                             \
-      debug_sync_service(thd, name, sizeof(name)-1);    \
+      debug_sync_service(thd, STRING_WITH_LEN(name));   \
   } while(0)
+#define DEBUG_SYNC_C_IF_THD(thd, name)                  \
+  do {                                                  \
+    if (debug_sync_service && thd)                      \
+      (*debug_sync_service)(thd, STRING_WITH_LEN(name)); } \
+  while(0)
 #else
-#define DEBUG_SYNC(thd,name)            do { } while(0)
+#define DEBUG_SYNC(thd,name)           do { } while(0)
+#define DEBUG_SYNC_C_IF_THD(thd, name) do { } while(0)
 #endif
 
 /* compatibility macro */
diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt
index c40beb5f9a1..ea27a3569e2 100644
--- a/libmysqld/CMakeLists.txt
+++ b/libmysqld/CMakeLists.txt
@@ -69,7 +69,8 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc
            ../sql/sql_lex.cc ../sql/keycaches.cc
            ../sql/sql_list.cc ../sql/sql_load.cc ../sql/sql_locale.cc 
            ../sql/sql_binlog.cc ../sql/sql_manager.cc
-           ../sql/sql_parse.cc ../sql/sql_partition.cc ../sql/sql_plugin.cc 
+           ../sql/sql_parse.cc ../sql/sql_bootstrap.cc
+           ../sql/sql_partition.cc ../sql/sql_plugin.cc 
            ../sql/debug_sync.cc ../sql/opt_table_elimination.cc
            ../sql/sql_prepare.cc ../sql/sql_rename.cc ../sql/sql_repl.cc 
            ../sql/sql_select.cc ../sql/sql_servers.cc
diff --git a/mysql-test/mysql-test-run.pl b/mysql-test/mysql-test-run.pl
index 35d428a9681..cee25edb44a 100755
--- a/mysql-test/mysql-test-run.pl
+++ b/mysql-test/mysql-test-run.pl
@@ -3516,7 +3516,7 @@ sub mysql_install_db {
   {
     my $sql_dir= dirname($path_sql);
     # Use the mysql database for system tables
-    mtr_tofile($bootstrap_sql_file, "use mysql\n");
+    mtr_tofile($bootstrap_sql_file, "use mysql;\n");
 
     # Add the offical mysql system tables
     # for a production system
diff --git a/mysys/CMakeLists.txt b/mysys/CMakeLists.txt
index 832cd01e263..05606942d8e 100644
--- a/mysys/CMakeLists.txt
+++ b/mysys/CMakeLists.txt
@@ -16,7 +16,9 @@
 INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/mysys)
 
 SET(MYSYS_SOURCES  array.c charset-def.c charset.c checksum.c default.c
-				errors.c hash.c list.c md5.c mf_cache.c mf_dirname.c mf_fn_ext.c
+				errors.c hash.c list.c
+                                md5.c md5_compute.cc
+                                mf_cache.c mf_dirname.c mf_fn_ext.c
 				mf_format.c mf_getdate.c mf_iocache.c mf_iocache2.c mf_keycache.c 
 				mf_keycaches.c mf_loadpath.c mf_pack.c mf_path.c mf_qsort.c mf_qsort2.c
 				mf_radix.c mf_same.c mf_sort.c mf_soundex.c mf_arr_appstr.c mf_tempdir.c
@@ -37,7 +39,7 @@ SET(MYSYS_SOURCES  array.c charset-def.c charset.c checksum.c default.c
                 safemalloc.c my_new.cc
 				my_atomic.c my_getncpus.c my_safehash.c my_chmod.c my_rnd.c
                                 my_uuid.c wqueue.c waiting_threads.c ma_dyncol.c
-				my_rdtsc.c my_context.c)
+				my_rdtsc.c my_context.c psi_noop.c)
 
 IF (WIN32)
  SET (MYSYS_SOURCES ${MYSYS_SOURCES} my_winthread.c my_wincond.c my_winerr.c my_winfile.c my_windac.c my_conio.c)
diff --git a/mysys/array.c b/mysys/array.c
index c969da83586..94f1cb8de34 100644
--- a/mysys/array.c
+++ b/mysys/array.c
@@ -87,9 +87,9 @@ my_bool init_dynamic_array(DYNAMIC_ARRAY *array, uint element_size,
     FALSE	Ok
 */
 
-my_bool insert_dynamic(DYNAMIC_ARRAY *array, const uchar* element)
+my_bool insert_dynamic(DYNAMIC_ARRAY *array, const void * element)
 {
-  uchar* buffer;
+  void *buffer;
   if (array->elements == array->max_element)
   {						/* Call only when nessesary */
     if (!(buffer=alloc_dynamic(array)))
@@ -122,7 +122,7 @@ my_bool insert_dynamic(DYNAMIC_ARRAY *array, const uchar* element)
     0		Error
 */
 
-uchar *alloc_dynamic(DYNAMIC_ARRAY *array)
+void *alloc_dynamic(DYNAMIC_ARRAY *array)
 {
   DBUG_ENTER("alloc_dynamic");
   if (array->elements == array->max_element)
@@ -167,7 +167,7 @@ uchar *alloc_dynamic(DYNAMIC_ARRAY *array)
     0		Array is empty
 */
 
-uchar *pop_dynamic(DYNAMIC_ARRAY *array)
+void *pop_dynamic(DYNAMIC_ARRAY *array)
 {
   if (array->elements)
     return array->buffer+(--array->elements * array->size_of_element);
@@ -192,7 +192,7 @@ uchar *pop_dynamic(DYNAMIC_ARRAY *array)
     FALSE	Ok
 */
 
-my_bool set_dynamic(DYNAMIC_ARRAY *array, uchar* element, uint idx)
+my_bool set_dynamic(DYNAMIC_ARRAY *array, const void *element, uint idx)
 {
   if (idx >= array->elements)
   {
@@ -268,7 +268,7 @@ my_bool allocate_dynamic(DYNAMIC_ARRAY *array, uint max_elements)
       idx	Index of element wanted.
 */
 
-void get_dynamic(DYNAMIC_ARRAY *array, uchar* element, uint idx)
+void get_dynamic(DYNAMIC_ARRAY *array, void *element, uint idx)
 {
   if (idx >= array->elements)
   {
@@ -363,13 +363,13 @@ void freeze_size(DYNAMIC_ARRAY *array)
 
 */
 
-int get_index_dynamic(DYNAMIC_ARRAY *array, uchar* element)
+int get_index_dynamic(DYNAMIC_ARRAY *array, void* element)
 {
   size_t ret;
-  if (array->buffer > element)
+  if (array->buffer > (uchar*) element)
     return -1;
 
-  ret= (element - array->buffer) /  array->size_of_element;
+  ret= ((uchar*) element - array->buffer) /  array->size_of_element;
   if (ret > array->elements)
     return -1;
 
diff --git a/mysys/md5.c b/mysys/md5.c
index 22a5e409a09..b3b0a470369 100644
--- a/mysys/md5.c
+++ b/mysys/md5.c
@@ -38,13 +38,15 @@
    copyright in any changes I have made; this code remains in the
    public domain.  */
 
+/*
+  Skip entirely if built with OpenSSL/YaSSL support.
+*/
+#if !defined(HAVE_OPENSSL) && !defined(HAVE_YASSL)
+
 #include <my_global.h>
 #include <m_string.h>
 #include "my_md5.h"
 
-#include <string.h>	/* for memcpy() and memset() */
-
-
 static void 
 my_MD5Transform (cvs_uint32 buf[4], const unsigned char in[64]);
 
@@ -323,3 +325,5 @@ main (int argc, char **argv)
   return 0;
 }
 #endif /* TEST */
+
+#endif /* !defined(HAVE_OPENSSL) && !defined(HAVE_YASSL) */
diff --git a/mysys/md5_compute.cc b/mysys/md5_compute.cc
new file mode 100644
index 00000000000..7b1591d91b7
--- /dev/null
+++ b/mysys/md5_compute.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+
+/**
+  @file
+
+  @brief
+  Wrapper functions for OpenSSL, YaSSL and MySQL's MD5
+  implementations. Also provides a Compatibility layer
+  to make available YaSSL's MD5 implementation.
+*/
+
+#include <my_global.h>
+#include <my_md5.h>
+
+#ifdef HAVE_YASSL
+
+#include "md5.hpp"
+
+/**
+  Compute MD5 message digest.
+
+  @param digest [out]  Computed MD5 digest
+  @param buf    [in]   Message to be computed
+  @param len    [in]   Length of the message
+
+  @return              void
+*/
+void my_md5_hash(char *digest, const char *buf, int len)
+{
+  TaoCrypt::MD5 hasher;
+  hasher.Update((TaoCrypt::byte *) buf, len);
+  hasher.Final((TaoCrypt::byte *) digest);
+}
+#endif /* HAVE_YASSL */
+
+
+/**
+    Wrapper function to compute MD5 message digest.
+
+    @param digest [out]  Computed MD5 digest
+    @param buf    [in]   Message to be computed
+    @param len    [in]   Length of the message
+
+    @return              void
+*/
+void compute_md5_hash(char *digest, const char *buf, int len)
+{
+#ifdef HAVE_YASSL
+  my_md5_hash(digest, buf, len);
+#else
+  MY_MD5_HASH((unsigned char *) digest, (unsigned char const *) buf, len);
+#endif /* HAVE_YASSL */
+}
+
diff --git a/mysys/my_init.c b/mysys/my_init.c
index 193c8281577..324f0f86f4d 100644
--- a/mysys/my_init.c
+++ b/mysys/my_init.c
@@ -21,6 +21,7 @@
 #include <m_string.h>
 #include <m_ctype.h>
 #include <signal.h>
+#include <mysql/psi/mysql_stage.h>
 #ifdef __WIN__
 #ifdef _MSC_VER
 #include <locale.h>
@@ -442,6 +443,9 @@ static my_bool win32_init_tcp_ip()
 
 #ifdef HAVE_PSI_INTERFACE
 
+PSI_stage_info stage_waiting_for_table_level_lock=
+{0, "Waiting for table level lock", 0};
+
 #if !defined(HAVE_PREAD) && !defined(_WIN32)
 PSI_mutex_key key_my_file_info_mutex;
 #endif /* !defined(HAVE_PREAD) && !defined(_WIN32) */
@@ -531,30 +535,34 @@ static PSI_file_info all_mysys_files[]=
   { &key_file_cnf, "cnf", 0}
 };
 
+PSI_stage_info *all_mysys_stages[]=
+{
+  & stage_waiting_for_table_level_lock
+};
+
 void my_init_mysys_psi_keys()
 {
   const char* category= "mysys";
   int count;
 
-  if (PSI_server == NULL)
-    return;
-
   count= sizeof(all_mysys_mutexes)/sizeof(all_mysys_mutexes[0]);
-  PSI_server->register_mutex(category, all_mysys_mutexes, count);
+  mysql_mutex_register(category, all_mysys_mutexes, count);
 
   count= sizeof(all_mysys_conds)/sizeof(all_mysys_conds[0]);
-  PSI_server->register_cond(category, all_mysys_conds, count);
+  mysql_cond_register(category, all_mysys_conds, count);
 
   count= sizeof(all_mysys_rwlocks)/sizeof(all_mysys_rwlocks[0]);
-  PSI_server->register_rwlock(category, all_mysys_rwlocks, count);
+  mysql_rwlock_register(category, all_mysys_rwlocks, count);
 
 #ifdef USE_ALARM_THREAD
   count= sizeof(all_mysys_threads)/sizeof(all_mysys_threads[0]);
-  PSI_server->register_thread(category, all_mysys_threads, count);
+  mysql_thread_register(category, all_mysys_threads, count);
 #endif /* USE_ALARM_THREAD */
 
   count= sizeof(all_mysys_files)/sizeof(all_mysys_files[0]);
-  PSI_server->register_file(category, all_mysys_files, count);
+  mysql_file_register(category, all_mysys_files, count);
+
+  count= array_elements(all_mysys_stages);
+  mysql_stage_register(category, all_mysys_stages, count);
 }
 #endif /* HAVE_PSI_INTERFACE */
-
diff --git a/mysys/my_static.c b/mysys/my_static.c
index bc2d8beac83..f9d019c8dc1 100644
--- a/mysys/my_static.c
+++ b/mysys/my_static.c
@@ -98,31 +98,3 @@ my_bool my_disable_sync=0;
 my_bool my_disable_async_io=0;
 my_bool my_disable_flush_key_blocks=0;
 my_bool my_disable_symlinks=0;
-
-/*
-  Note that PSI_hook and PSI_server are unconditionally
-  (no ifdef HAVE_PSI_INTERFACE) defined.
-  This is to ensure binary compatibility between the server and plugins,
-  in the case when:
-  - the server is not compiled with HAVE_PSI_INTERFACE
-  - a plugin is compiled with HAVE_PSI_INTERFACE
-  See the doxygen documentation for the performance schema.
-*/
-
-/**
-  Hook for the instrumentation interface.
-  Code implementing the instrumentation interface should register here.
-*/
-struct PSI_bootstrap *PSI_hook= NULL;
-
-/**
-  Instance of the instrumentation interface for the MySQL server.
-  @todo This is currently a global variable, which is handy when
-  compiling instrumented code that is bundled with the server.
-  When dynamic plugin are truly supported, this variable will need
-  to be replaced by a macro, so that each XYZ plugin can have it's own
-  xyz_psi_server variable, obtained from PSI_bootstrap::get_interface()
-  with the version used at compile time for plugin XYZ.
-*/
-PSI *PSI_server= NULL;
-
diff --git a/mysys/psi_noop.c b/mysys/psi_noop.c
new file mode 100644
index 00000000000..6d26f5a2178
--- /dev/null
+++ b/mysys/psi_noop.c
@@ -0,0 +1,746 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/*
+  Always provide the noop performance interface, for plugins.
+*/
+
+#define USE_PSI_V1
+#define HAVE_PSI_INTERFACE
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "my_sys.h"
+#include "mysql/psi/psi.h"
+
+C_MODE_START
+
+#define NNN __attribute__((unused))
+
+static void register_mutex_noop(const char *category NNN,
+                                PSI_mutex_info *info NNN,
+                                int count NNN)
+{
+  return;
+}
+
+static void register_rwlock_noop(const char *category NNN,
+                                 PSI_rwlock_info *info NNN,
+                                 int count NNN)
+{
+  return;
+}
+
+static void register_cond_noop(const char *category NNN,
+                               PSI_cond_info *info NNN,
+                               int count NNN)
+{
+  return;
+}
+
+static void register_thread_noop(const char *category NNN,
+                                 PSI_thread_info *info NNN,
+                                 int count NNN)
+{
+  return;
+}
+
+static void register_file_noop(const char *category NNN,
+                               PSI_file_info *info NNN,
+                               int count NNN)
+{
+  return;
+}
+
+static void register_stage_noop(const char *category NNN,
+                                PSI_stage_info **info_array NNN,
+                                int count NNN)
+{
+  return;
+}
+
+static void register_statement_noop(const char *category NNN,
+                                    PSI_statement_info *info NNN,
+                                    int count NNN)
+{
+  return;
+}
+
+static void register_socket_noop(const char *category NNN,
+                                 PSI_socket_info *info NNN,
+                                 int count NNN)
+{
+  return;
+}
+
+static PSI_mutex*
+init_mutex_noop(PSI_mutex_key key NNN, const void *identity NNN)
+{
+  return NULL;
+}
+
+static void destroy_mutex_noop(PSI_mutex* mutex NNN)
+{
+  return;
+}
+
+static PSI_rwlock*
+init_rwlock_noop(PSI_rwlock_key key NNN, const void *identity NNN)
+{
+  return NULL;
+}
+
+static void destroy_rwlock_noop(PSI_rwlock* rwlock NNN)
+{
+  return;
+}
+
+static PSI_cond*
+init_cond_noop(PSI_cond_key key NNN, const void *identity NNN)
+{
+  return NULL;
+}
+
+static void destroy_cond_noop(PSI_cond* cond NNN)
+{
+  return;
+}
+
+static PSI_socket*
+init_socket_noop(PSI_socket_key key NNN, const my_socket *fd NNN)
+{
+  return NULL;
+}
+
+static void destroy_socket_noop(PSI_socket* socket NNN)
+{
+  return;
+}
+
+static PSI_table_share*
+get_table_share_noop(my_bool temporary NNN, struct TABLE_SHARE *share NNN)
+{
+  return NULL;
+}
+
+static void release_table_share_noop(PSI_table_share* share NNN)
+{
+  return;
+}
+
+static void
+drop_table_share_noop(my_bool temporary NNN, const char *schema_name NNN,
+                      int schema_name_length NNN, const char *table_name NNN,
+                      int table_name_length NNN)
+{
+  return;
+}
+
+static PSI_table*
+open_table_noop(PSI_table_share *share NNN, const void *identity NNN)
+{
+  return NULL;
+}
+
+static void unbind_table_noop(PSI_table *table NNN)
+{
+  return;
+}
+
+static PSI_table*
+rebind_table_noop(PSI_table_share *share NNN,
+                  const void *identity NNN,
+                  PSI_table *table NNN)
+{
+  return NULL;
+}
+
+static void close_table_noop(PSI_table *table NNN)
+{
+  return;
+}
+
+static void create_file_noop(PSI_file_key key NNN,
+                             const char *name NNN, File file NNN)
+{
+  return;
+}
+
+static int spawn_thread_noop(PSI_thread_key key NNN,
+                             pthread_t *thread NNN,
+                             const pthread_attr_t *attr NNN,
+                             void *(*start_routine)(void*) NNN, void *arg NNN)
+{
+  return pthread_create(thread, attr, start_routine, arg);
+}
+
+static PSI_thread*
+new_thread_noop(PSI_thread_key key NNN,
+                const void *identity NNN, ulong thread_id NNN)
+{
+  return NULL;
+}
+
+static void set_thread_id_noop(PSI_thread *thread NNN, unsigned long id NNN)
+{
+  return;
+}
+
+static PSI_thread*
+get_thread_noop(void NNN)
+{
+  return NULL;
+}
+
+static void set_thread_user_noop(const char *user NNN, int user_len NNN)
+{
+  return;
+}
+
+static void set_thread_user_host_noop(const char *user NNN, int user_len NNN,
+                                      const char *host NNN, int host_len NNN)
+{
+  return;
+}
+
+static void set_thread_db_noop(const char* db NNN, int db_len NNN)
+{
+  return;
+}
+
+static void set_thread_command_noop(int command NNN)
+{
+  return;
+}
+
+static void set_thread_start_time_noop(time_t start_time NNN)
+{
+  return;
+}
+
+static void set_thread_state_noop(const char* state NNN)
+{
+  return;
+}
+
+static void set_thread_info_noop(const char* info NNN, int info_len NNN)
+{
+  return;
+}
+
+static void set_thread_noop(PSI_thread* thread NNN)
+{
+  return;
+}
+
+static void delete_current_thread_noop(void)
+{
+  return;
+}
+
+static void delete_thread_noop(PSI_thread *thread NNN)
+{
+  return;
+}
+
+static PSI_file_locker*
+get_thread_file_name_locker_noop(PSI_file_locker_state *state NNN,
+                                 PSI_file_key key NNN,
+                                 enum PSI_file_operation op NNN,
+                                 const char *name NNN, const void *identity NNN)
+{
+  return NULL;
+}
+
+static PSI_file_locker*
+get_thread_file_stream_locker_noop(PSI_file_locker_state *state NNN,
+                                   PSI_file *file NNN,
+                                   enum PSI_file_operation op NNN)
+{
+  return NULL;
+}
+
+
+static PSI_file_locker*
+get_thread_file_descriptor_locker_noop(PSI_file_locker_state *state NNN,
+                                       File file NNN,
+                                       enum PSI_file_operation op NNN)
+{
+  return NULL;
+}
+
+static void unlock_mutex_noop(PSI_mutex *mutex NNN)
+{
+  return;
+}
+
+static void unlock_rwlock_noop(PSI_rwlock *rwlock NNN)
+{
+  return;
+}
+
+static void signal_cond_noop(PSI_cond* cond NNN)
+{
+  return;
+}
+
+static void broadcast_cond_noop(PSI_cond* cond NNN)
+{
+  return;
+}
+
+static PSI_idle_locker*
+start_idle_wait_noop(PSI_idle_locker_state* state NNN,
+                     const char *src_file NNN, uint src_line NNN)
+{
+  return NULL;
+}
+
+static void end_idle_wait_noop(PSI_idle_locker* locker NNN)
+{
+  return;
+}
+
+static PSI_mutex_locker*
+start_mutex_wait_noop(PSI_mutex_locker_state *state NNN,
+                      PSI_mutex *mutex NNN,
+                      PSI_mutex_operation op NNN,
+                      const char *src_file NNN, uint src_line NNN)
+{
+  return NULL;
+}
+
+static void end_mutex_wait_noop(PSI_mutex_locker* locker NNN, int rc NNN)
+{
+  return;
+}
+
+
+static PSI_rwlock_locker*
+start_rwlock_rdwait_noop(struct PSI_rwlock_locker_state_v1 *state NNN,
+                         struct PSI_rwlock *rwlock NNN,
+                         enum PSI_rwlock_operation op NNN,
+                         const char *src_file NNN, uint src_line NNN)
+{
+  return NULL;
+}
+
+static void end_rwlock_rdwait_noop(PSI_rwlock_locker* locker NNN, int rc NNN)
+{
+  return;
+}
+
+static struct PSI_rwlock_locker*
+start_rwlock_wrwait_noop(struct PSI_rwlock_locker_state_v1 *state NNN,
+                         struct PSI_rwlock *rwlock NNN,
+                         enum PSI_rwlock_operation op NNN,
+                         const char *src_file NNN, uint src_line NNN)
+{
+  return NULL;
+}
+
+static void end_rwlock_wrwait_noop(PSI_rwlock_locker* locker NNN, int rc NNN)
+{
+  return;
+}
+
+static struct PSI_cond_locker*
+start_cond_wait_noop(struct PSI_cond_locker_state_v1 *state NNN,
+                     struct PSI_cond *cond NNN,
+                     struct PSI_mutex *mutex NNN,
+                     enum PSI_cond_operation op NNN,
+                     const char *src_file NNN, uint src_line NNN)
+{
+  return NULL;
+}
+
+static void end_cond_wait_noop(PSI_cond_locker* locker NNN, int rc NNN)
+{
+  return;
+}
+
+static struct PSI_table_locker*
+start_table_io_wait_noop(struct PSI_table_locker_state_v1 *state NNN,
+                         struct PSI_table *table NNN,
+                         enum PSI_table_io_operation op NNN,
+                         uint index NNN,
+                         const char *src_file NNN, uint src_line NNN)
+{
+  return NULL;
+}
+
+static void end_table_io_wait_noop(PSI_table_locker* locker NNN)
+{
+  return;
+}
+
+static struct PSI_table_locker*
+start_table_lock_wait_noop(struct PSI_table_locker_state_v1 *state NNN,
+                           struct PSI_table *table NNN,
+                           enum PSI_table_lock_operation op NNN,
+                           ulong flags NNN,
+                           const char *src_file NNN, uint src_line NNN)
+{
+  return NULL;
+}
+
+static void end_table_lock_wait_noop(PSI_table_locker* locker NNN)
+{
+  return;
+}
+
+static PSI_file* start_file_open_wait_noop(PSI_file_locker *locker NNN,
+                                           const char *src_file NNN,
+                                           uint src_line NNN)
+{
+  return NULL;
+}
+
+static void end_file_open_wait_noop(PSI_file_locker *locker NNN)
+{
+  return;
+}
+
+static void end_file_open_wait_and_bind_to_descriptor_noop
+  (PSI_file_locker *locker NNN, File file NNN)
+{
+  return;
+}
+
+static void start_file_wait_noop(PSI_file_locker *locker NNN,
+                                 size_t count NNN,
+                                 const char *src_file NNN,
+                                 uint src_line NNN)
+{
+  return;
+}
+
+static void end_file_wait_noop(PSI_file_locker *locker NNN,
+                               size_t count NNN)
+{
+  return;
+}
+
+static void start_stage_noop(PSI_stage_key key NNN,
+                             const char *src_file NNN, int src_line NNN)
+{
+  return;
+}
+
+static void end_stage_noop(void)
+{
+  return;
+}
+
+static PSI_statement_locker*
+get_thread_statement_locker_noop(PSI_statement_locker_state *state NNN,
+                                 PSI_statement_key key NNN)
+{
+  return NULL;
+}
+
+static PSI_statement_locker*
+refine_statement_noop(PSI_statement_locker *locker NNN,
+                      PSI_statement_key key NNN)
+{
+  return NULL;
+}
+
+static void start_statement_noop(PSI_statement_locker *locker NNN,
+                                 const char *db NNN, uint db_len NNN,
+                                 const char *src_file NNN, uint src_line NNN)
+{
+  return;
+}
+
+static void set_statement_text_noop(PSI_statement_locker *locker NNN,
+                                    const char *text NNN, uint text_len NNN)
+{
+  return;
+}
+
+static void set_statement_lock_time_noop(PSI_statement_locker *locker NNN,
+                                         ulonglong count NNN)
+{
+  return;
+}
+
+static void set_statement_rows_sent_noop(PSI_statement_locker *locker NNN,
+                                         ulonglong count NNN)
+{
+  return;
+}
+
+static void set_statement_rows_examined_noop(PSI_statement_locker *locker NNN,
+                                             ulonglong count NNN)
+{
+  return;
+}
+
+static void inc_statement_created_tmp_disk_tables_noop(PSI_statement_locker *locker NNN,
+                                                       ulong count NNN)
+{
+  return;
+}
+
+static void inc_statement_created_tmp_tables_noop(PSI_statement_locker *locker NNN,
+                                                  ulong count NNN)
+{
+  return;
+}
+
+static void inc_statement_select_full_join_noop(PSI_statement_locker *locker NNN,
+                                                ulong count NNN)
+{
+  return;
+}
+
+static void inc_statement_select_full_range_join_noop(PSI_statement_locker *locker NNN,
+                                                      ulong count NNN)
+{
+  return;
+}
+
+static void inc_statement_select_range_noop(PSI_statement_locker *locker NNN,
+                                            ulong count NNN)
+{
+  return;
+}
+
+static void inc_statement_select_range_check_noop(PSI_statement_locker *locker NNN,
+                                                  ulong count NNN)
+{
+  return;
+}
+
+static void inc_statement_select_scan_noop(PSI_statement_locker *locker NNN,
+                                           ulong count NNN)
+{
+  return;
+}
+
+static void inc_statement_sort_merge_passes_noop(PSI_statement_locker *locker NNN,
+                                                 ulong count NNN)
+{
+  return;
+}
+
+static void inc_statement_sort_range_noop(PSI_statement_locker *locker NNN,
+                                          ulong count NNN)
+{
+  return;
+}
+
+static void inc_statement_sort_rows_noop(PSI_statement_locker *locker NNN,
+                                         ulong count NNN)
+{
+  return;
+}
+
+static void inc_statement_sort_scan_noop(PSI_statement_locker *locker NNN,
+                                         ulong count NNN)
+{
+  return;
+}
+
+static void set_statement_no_index_used_noop(PSI_statement_locker *locker NNN)
+{
+  return;
+}
+
+static void set_statement_no_good_index_used_noop(PSI_statement_locker *locker NNN)
+{
+  return;
+}
+
+static void end_statement_noop(PSI_statement_locker *locker NNN,
+                               void *stmt_da NNN)
+{
+  return;
+}
+
+static PSI_socket_locker*
+start_socket_wait_noop(PSI_socket_locker_state *state NNN,
+                       PSI_socket *socket NNN,
+                       PSI_socket_operation op NNN,
+                       size_t count NNN,
+                       const char *src_file NNN,
+                       uint src_line NNN)
+{
+  return NULL;
+}
+
+static void end_socket_wait_noop(PSI_socket_locker *locker NNN,
+                                 size_t count NNN)
+{
+  return;
+}
+
+static void set_socket_state_noop(PSI_socket *socket NNN,
+                                  enum PSI_socket_state state NNN)
+{
+  return;
+}
+
+static void set_socket_info_noop(PSI_socket *socket NNN,
+                                 const my_socket *fd NNN,
+                                 const struct sockaddr *addr NNN,
+                                 socklen_t addr_len NNN)
+{
+  return;
+}
+
+static void set_socket_thread_owner_noop(PSI_socket *socket NNN)
+{
+  return;
+}
+
+static struct PSI_digest_locker*
+digest_start_noop(PSI_statement_locker *locker NNN)
+{
+  return NULL;
+}
+
+static PSI_digest_locker*
+digest_add_token_noop(PSI_digest_locker *locker NNN,
+                      uint token NNN,
+                      struct OPAQUE_LEX_YYSTYPE *yylval NNN)
+{
+  return NULL;
+}
+
+static PSI PSI_noop=
+{
+  register_mutex_noop,
+  register_rwlock_noop,
+  register_cond_noop,
+  register_thread_noop,
+  register_file_noop,
+  register_stage_noop,
+  register_statement_noop,
+  register_socket_noop,
+  init_mutex_noop,
+  destroy_mutex_noop,
+  init_rwlock_noop,
+  destroy_rwlock_noop,
+  init_cond_noop,
+  destroy_cond_noop,
+  init_socket_noop,
+  destroy_socket_noop,
+  get_table_share_noop,
+  release_table_share_noop,
+  drop_table_share_noop,
+  open_table_noop,
+  unbind_table_noop,
+  rebind_table_noop,
+  close_table_noop,
+  create_file_noop,
+  spawn_thread_noop,
+  new_thread_noop,
+  set_thread_id_noop,
+  get_thread_noop,
+  set_thread_user_noop,
+  set_thread_user_host_noop,
+  set_thread_db_noop,
+  set_thread_command_noop,
+  set_thread_start_time_noop,
+  set_thread_state_noop,
+  set_thread_info_noop,
+  set_thread_noop,
+  delete_current_thread_noop,
+  delete_thread_noop,
+  get_thread_file_name_locker_noop,
+  get_thread_file_stream_locker_noop,
+  get_thread_file_descriptor_locker_noop,
+  unlock_mutex_noop,
+  unlock_rwlock_noop,
+  signal_cond_noop,
+  broadcast_cond_noop,
+  start_idle_wait_noop,
+  end_idle_wait_noop,
+  start_mutex_wait_noop,
+  end_mutex_wait_noop,
+  start_rwlock_rdwait_noop,
+  end_rwlock_rdwait_noop,
+  start_rwlock_wrwait_noop,
+  end_rwlock_wrwait_noop,
+  start_cond_wait_noop,
+  end_cond_wait_noop,
+  start_table_io_wait_noop,
+  end_table_io_wait_noop,
+  start_table_lock_wait_noop,
+  end_table_lock_wait_noop,
+  start_file_open_wait_noop,
+  end_file_open_wait_noop,
+  end_file_open_wait_and_bind_to_descriptor_noop,
+  start_file_wait_noop,
+  end_file_wait_noop,
+  start_stage_noop,
+  end_stage_noop,
+  get_thread_statement_locker_noop,
+  refine_statement_noop,
+  start_statement_noop,
+  set_statement_text_noop,
+  set_statement_lock_time_noop,
+  set_statement_rows_sent_noop,
+  set_statement_rows_examined_noop,
+  inc_statement_created_tmp_disk_tables_noop,
+  inc_statement_created_tmp_tables_noop,
+  inc_statement_select_full_join_noop,
+  inc_statement_select_full_range_join_noop,
+  inc_statement_select_range_noop,
+  inc_statement_select_range_check_noop,
+  inc_statement_select_scan_noop,
+  inc_statement_sort_merge_passes_noop,
+  inc_statement_sort_range_noop,
+  inc_statement_sort_rows_noop,
+  inc_statement_sort_scan_noop,
+  set_statement_no_index_used_noop,
+  set_statement_no_good_index_used_noop,
+  end_statement_noop,
+  start_socket_wait_noop,
+  end_socket_wait_noop,
+  set_socket_state_noop,
+  set_socket_info_noop,
+  set_socket_thread_owner_noop,
+  digest_start_noop,
+  digest_add_token_noop
+};
+
+/**
+  Hook for the instrumentation interface.
+  Code implementing the instrumentation interface should register here.
+*/
+struct PSI_bootstrap *PSI_hook= NULL;
+
+/**
+  Instance of the instrumentation interface for the MySQL server.
+  @todo This is currently a global variable, which is handy when
+  compiling instrumented code that is bundled with the server.
+  When dynamic plugin are truly supported, this variable will need
+  to be replaced by a macro, so that each XYZ plugin can have it's own
+  xyz_psi_server variable, obtained from PSI_bootstrap::get_interface()
+  with the version used at compile time for plugin XYZ.
+*/
+
+PSI *PSI_server= & PSI_noop;
+
+void set_psi_server(PSI *psi)
+{
+  PSI_server= psi;
+}
+
+C_MODE_END
+
diff --git a/scripts/comp_sql.c b/scripts/comp_sql.c
index e067d0757bf..f1b327b24ac 100644
--- a/scripts/comp_sql.c
+++ b/scripts/comp_sql.c
@@ -1,4 +1,5 @@
-/* Copyright (C) 2004 MySQL AB
+/* Copyright (c) 2004, 2010, Oracle and/or its affiliates. 
+   Copyright (c) 2012 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -11,10 +12,10 @@
 
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+   51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
 
 /*
-  Written by Magnus Svensson
+  Originally written by Magnus Svensson
 */
 
 /*
@@ -25,10 +26,18 @@
 #include <stdarg.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include <sys/stat.h>
 
-/* Compiler-dependent constant for maximum string constant */
-#define MAX_STRING_CONSTANT_LENGTH 65535
+#include "../sql/sql_bootstrap.h"
+
+/*
+  This is an internal tool used during the build process only,
+  - do not make a library just for this,
+    which would make the Makefiles and the server link
+    more complex than necessary,
+  - do not duplicate the code either.
+ so just add the sql_bootstrap.cc code as is.
+*/
+#include "../sql/sql_bootstrap.cc"
 
 FILE *in, *out;
 
@@ -58,15 +67,60 @@ static void die(const char *fmt, ...)
   exit(1);
 }
 
+char *fgets_fn(char *buffer, size_t size, fgets_input_t input)
+{
+  return fgets(buffer, size, (FILE*) input);
+}
+
+static void print_query(FILE *out, const char *query)
+{
+  const char *ptr= query;
+  int column= 0;
+
+  fprintf(out, "\"");
+  while (*ptr)
+  {
+    if (column >= 120)
+    {
+      /* Wrap to the next line, tabulated. */
+      fprintf(out, "\"\n  \"");
+      column= 2;
+    }
+    switch(*ptr)
+    {
+    case '\n':
+      /*
+        Preserve the \n character in the query text,
+        and wrap to the next line, tabulated.
+      */
+      fprintf(out, "\\n\"\n  \"");
+      column= 2;
+      break;
+    case '\r':
+      /* Skipped */
+      break;
+    case '\"':
+      fprintf(out, "\\\"");
+      column++;
+      break;
+    default:
+      putc(*ptr, out);
+      column++;
+      break;
+    }
+    ptr++;
+  }
+  fprintf(out, "\\n\",\n");
+}
 
 int main(int argc, char *argv[])
 {
-  char buff[512];
-  struct stat st;
+  char query[MAX_BOOTSTRAP_QUERY_SIZE];
   char* struct_name= argv[1];
   char* infile_name= argv[2];
   char* outfile_name= argv[3];
-
+  int rc;
+  int query_length;
 
   if (argc != 4)
     die("Usage: comp_sql <struct_name> <sql_filename> <c_filename>");
@@ -74,86 +128,31 @@ int main(int argc, char *argv[])
   /* Open input and output file */
   if (!(in= fopen(infile_name, "r")))
     die("Failed to open SQL file '%s'", infile_name);
-
-  
   if (!(out= fopen(outfile_name, "w")))
     die("Failed to open output file '%s'", outfile_name);
-  fprintf(out, "const char %s[]={\n",struct_name);
-
-  /* 
-    Some compilers have limitations how long a string constant can be.
-    We'll output very long strings as hexadecimal arrays, and short ones
-    as strings (prettier)
-  */
-  stat(infile_name, &st);
-  if (st.st_size > MAX_STRING_CONSTANT_LENGTH)
-  {
-    int cnt=0;
-    int c;
-    for(cnt=0;;cnt++)
-    {
-      c= fgetc(in);
-      if (c== -1)
-        break;
 
-      if(cnt != 0)
-        fputc(',', out);
+  fprintf(out, "/*\n");
+  fprintf(out, "  Do not edit this file, it is automatically generated from:\n");
+  fprintf(out, "  <%s>\n", infile_name);
+  fprintf(out, "*/\n");
+  fprintf(out, "const char* %s[]={\n", struct_name);
 
-      /* Put line break after each 16 hex characters */
-      if(cnt && (cnt%16 == 0))
-        fputc('\n', out);
-
-      fprintf(out,"0x%02x",c);
-    }
-    fprintf(out,",0x00");
-  }
-  else
+  for ( ; ; )
   {
-    fprintf(out,"\"");
-    while (fgets(buff, sizeof(buff), in))
-    {
-      char *curr= buff;
-      while (*curr)
-      {
-        if (*curr == '\n')
-        {
-          /*
-            Reached end of line, add escaped newline, escaped
-            backslash and a newline to outfile
-          */
-          fprintf(out, "\\n \"\n\"");
-          curr++;
-        }
-        else if (*curr == '\r')
-        {
-          curr++; /* Skip */
-        }
-        else
-        {
-          if (*curr == '"')
-          {
-            /* Needs escape */
-            fputc('\\', out);
-          }
-
-          fputc(*curr, out);
-          curr++;
-        }
-      }
-      if (*(curr-1) != '\n')
-      {
-        /*
-          Some compilers have a max string length,
-          insert a newline at every 512th char in long
-          strings
-        */
-        fprintf(out, "\"\n\"");
-      }
-    }
-    fprintf(out, "\\\n\"");
+    rc= read_bootstrap_query(query, &query_length,
+                             (fgets_input_t) in, fgets_fn);
+
+    if (rc == READ_BOOTSTRAP_ERROR)
+      die("Failed to read the bootstrap input file.\n");
+    
+    if (rc == READ_BOOTSTRAP_EOF)
+      break;
+
+    print_query(out, query);
   }
-  
-  fprintf(out, "};\n");
+
+  fprintf(out, "NULL\n};\n");
+
   fclose(in);
   fclose(out);
 
diff --git a/scripts/mysql_system_tables.sql b/scripts/mysql_system_tables.sql
index 2e97d2b8272..a01ce84dfa5 100644
--- a/scripts/mysql_system_tables.sql
+++ b/scripts/mysql_system_tables.sql
@@ -1,6 +1,6 @@
+-- Copyright (c) 2007, 2011, Oracle and/or its affiliates.
 -- Copyright (c) 2007, 2008 MySQL AB, 2009 Sun Microsystems, Inc.
--- Copyright (c) 2008-2011 Monty Program Ab
--- Use is subject to license terms.
+-- Copyright (c) 2008-2012 Monty Program Ab
 -- 
 -- This program is free software; you can redistribute it and/or modify
 -- it under the terms of the GNU General Public License as published by
@@ -113,9 +113,7 @@ CREATE TABLE IF NOT EXISTS ndb_binlog_index (Position BIGINT UNSIGNED NOT NULL,
 
 set @have_old_pfs= (select count(*) from information_schema.schemata where schema_name='performance_schema');
 
-SET @l1="SET @broken_tables = (select count(*) from information_schema.tables";
-SET @l2=" where engine != \'PERFORMANCE_SCHEMA\' and table_schema=\'performance_schema\')";
-SET @cmd=concat(@l1,@l2);
+SET @cmd="SET @broken_tables = (select count(*) from information_schema.tables  where engine != 'PERFORMANCE_SCHEMA' and table_schema='performance_schema')";
 
 -- Work around for bug#49542
 SET @str = IF(@have_old_pfs = 1, @cmd, 'SET @broken_tables = 0');
@@ -123,9 +121,7 @@ PREPARE stmt FROM @str;
 EXECUTE stmt;
 DROP PREPARE stmt;
 
-SET @l1="SET @broken_views = (select count(*) from information_schema.views";
-SET @l2=" where table_schema='performance_schema')";
-SET @cmd=concat(@l1,@l2);
+SET @cmd="SET @broken_views = (select count(*) from information_schema.views where table_schema='performance_schema')";
 
 -- Work around for bug#49542
 SET @str = IF(@have_old_pfs = 1, @cmd, 'SET @broken_views = 0');
@@ -162,7 +158,7 @@ DROP PREPARE stmt;
 
 --
 -- From this point, only create the performance schema tables
--- if the server is build with performance schema
+-- if the server is built with performance schema
 --
 
 set @have_pfs= (select count(engine) from information_schema.engines where engine='PERFORMANCE_SCHEMA' and support != 'NO');
@@ -171,12 +167,10 @@ set @have_pfs= (select count(engine) from information_schema.engines where engin
 -- TABLE COND_INSTANCES
 --
 
-SET @l1="CREATE TABLE performance_schema.cond_instances(";
-SET @l2="NAME VARCHAR(128) not null,";
-SET @l3="OBJECT_INSTANCE_BEGIN BIGINT not null";
-SET @l4=")ENGINE=PERFORMANCE_SCHEMA;";
-
-SET @cmd=concat(@l1,@l2,@l3,@l4);
+SET @cmd="CREATE TABLE performance_schema.cond_instances("
+  "NAME VARCHAR(128) not null,"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -187,26 +181,27 @@ DROP PREPARE stmt;
 -- TABLE EVENTS_WAITS_CURRENT
 --
 
-SET @l1="CREATE TABLE performance_schema.events_waits_current(";
-SET @l2="THREAD_ID INTEGER not null,";
-SET @l3="EVENT_ID BIGINT unsigned not null,";
-SET @l4="EVENT_NAME VARCHAR(128) not null,";
-SET @l5="SOURCE VARCHAR(64),";
-SET @l6="TIMER_START BIGINT unsigned,";
-SET @l7="TIMER_END BIGINT unsigned,";
-SET @l8="TIMER_WAIT BIGINT unsigned,";
-SET @l9="SPINS INTEGER unsigned,";
-SET @l10="OBJECT_SCHEMA VARCHAR(64),";
-SET @l11="OBJECT_NAME VARCHAR(512),";
-SET @l12="OBJECT_TYPE VARCHAR(64),";
-SET @l13="OBJECT_INSTANCE_BEGIN BIGINT not null,";
-SET @l14="NESTING_EVENT_ID BIGINT unsigned,";
-SET @l15="OPERATION VARCHAR(16) not null,";
-SET @l16="NUMBER_OF_BYTES BIGINT unsigned,";
-SET @l17="FLAGS INTEGER unsigned";
-SET @l18=")ENGINE=PERFORMANCE_SCHEMA;";
-
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5,@l6,@l7,@l8,@l9,@l10,@l11,@l12,@l13,@l14,@l15,@l16,@l17,@l18);
+SET @cmd="CREATE TABLE performance_schema.events_waits_current("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_ID BIGINT unsigned not null,"
+  "END_EVENT_ID BIGINT unsigned,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "SOURCE VARCHAR(64),"
+  "TIMER_START BIGINT unsigned,"
+  "TIMER_END BIGINT unsigned,"
+  "TIMER_WAIT BIGINT unsigned,"
+  "SPINS INTEGER unsigned,"
+  "OBJECT_SCHEMA VARCHAR(64),"
+  "OBJECT_NAME VARCHAR(512),"
+  "INDEX_NAME VARCHAR(64),"
+  "OBJECT_TYPE VARCHAR(64),"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null,"
+  "NESTING_EVENT_ID BIGINT unsigned,"
+  "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT'),"
+  "OPERATION VARCHAR(32) not null,"
+  "NUMBER_OF_BYTES BIGINT,"
+  "FLAGS INTEGER unsigned"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -217,10 +212,27 @@ DROP PREPARE stmt;
 -- TABLE EVENTS_WAITS_HISTORY
 --
 
-SET @l1="CREATE TABLE performance_schema.events_waits_history(";
--- lines 2 to 18 are unchanged from EVENTS_WAITS_CURRENT
-
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5,@l6,@l7,@l8,@l9,@l10,@l11,@l12,@l13,@l14,@l15,@l16,@l17,@l18);
+SET @cmd="CREATE TABLE performance_schema.events_waits_history("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_ID BIGINT unsigned not null,"
+  "END_EVENT_ID BIGINT unsigned,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "SOURCE VARCHAR(64),"
+  "TIMER_START BIGINT unsigned,"
+  "TIMER_END BIGINT unsigned,"
+  "TIMER_WAIT BIGINT unsigned,"
+  "SPINS INTEGER unsigned,"
+  "OBJECT_SCHEMA VARCHAR(64),"
+  "OBJECT_NAME VARCHAR(512),"
+  "INDEX_NAME VARCHAR(64),"
+  "OBJECT_TYPE VARCHAR(64),"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null,"
+  "NESTING_EVENT_ID BIGINT unsigned,"
+  "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT'),"
+  "OPERATION VARCHAR(32) not null,"
+  "NUMBER_OF_BYTES BIGINT,"
+  "FLAGS INTEGER unsigned"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -231,10 +243,27 @@ DROP PREPARE stmt;
 -- TABLE EVENTS_WAITS_HISTORY_LONG
 --
 
-SET @l1="CREATE TABLE performance_schema.events_waits_history_long(";
--- lines 2 to 18 are unchanged from EVENTS_WAITS_CURRENT
-
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5,@l6,@l7,@l8,@l9,@l10,@l11,@l12,@l13,@l14,@l15,@l16,@l17,@l18);
+SET @cmd="CREATE TABLE performance_schema.events_waits_history_long("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_ID BIGINT unsigned not null,"
+  "END_EVENT_ID BIGINT unsigned,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "SOURCE VARCHAR(64),"
+  "TIMER_START BIGINT unsigned,"
+  "TIMER_END BIGINT unsigned,"
+  "TIMER_WAIT BIGINT unsigned,"
+  "SPINS INTEGER unsigned,"
+  "OBJECT_SCHEMA VARCHAR(64),"
+  "OBJECT_NAME VARCHAR(512),"
+  "INDEX_NAME VARCHAR(64),"
+  "OBJECT_TYPE VARCHAR(64),"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null,"
+  "NESTING_EVENT_ID BIGINT unsigned,"
+  "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT'),"
+  "OPERATION VARCHAR(32) not null,"
+  "NUMBER_OF_BYTES BIGINT,"
+  "FLAGS INTEGER unsigned"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -245,17 +274,34 @@ DROP PREPARE stmt;
 -- TABLE EVENTS_WAITS_SUMMARY_BY_INSTANCE
 --
 
-SET @l1="CREATE TABLE performance_schema.events_waits_summary_by_instance(";
-SET @l2="EVENT_NAME VARCHAR(128) not null,";
-SET @l3="OBJECT_INSTANCE_BEGIN BIGINT not null,";
-SET @l4="COUNT_STAR BIGINT unsigned not null,";
-SET @l5="SUM_TIMER_WAIT BIGINT unsigned not null,";
-SET @l6="MIN_TIMER_WAIT BIGINT unsigned not null,";
-SET @l7="AVG_TIMER_WAIT BIGINT unsigned not null,";
-SET @l8="MAX_TIMER_WAIT BIGINT unsigned not null";
-SET @l9=")ENGINE=PERFORMANCE_SCHEMA;";
+SET @cmd="CREATE TABLE performance_schema.events_waits_summary_by_instance("
+  "EVENT_NAME VARCHAR(128) not null,"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5,@l6,@l7,@l8,@l9);
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_waits_summary_by_host_by_event_name("
+  "HOST CHAR(60) collate utf8_bin default null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -263,20 +309,57 @@ EXECUTE stmt;
 DROP PREPARE stmt;
 
 --
--- TABLE EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+-- TABLE EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_waits_summary_by_user_by_event_name("
+  "USER CHAR(16) collate utf8_bin default null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME
 --
 
-SET @l1="CREATE TABLE performance_schema.events_waits_summary_by_thread_by_event_name(";
-SET @l2="THREAD_ID INTEGER not null,";
-SET @l3="EVENT_NAME VARCHAR(128) not null,";
-SET @l4="COUNT_STAR BIGINT unsigned not null,";
-SET @l5="SUM_TIMER_WAIT BIGINT unsigned not null,";
-SET @l6="MIN_TIMER_WAIT BIGINT unsigned not null,";
-SET @l7="AVG_TIMER_WAIT BIGINT unsigned not null,";
-SET @l8="MAX_TIMER_WAIT BIGINT unsigned not null";
-SET @l9=")ENGINE=PERFORMANCE_SCHEMA;";
+SET @cmd="CREATE TABLE performance_schema.events_waits_summary_by_account_by_event_name("
+  "USER CHAR(16) collate utf8_bin default null,"
+  "HOST CHAR(60) collate utf8_bin default null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5,@l6,@l7,@l8,@l9);
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_waits_summary_by_thread_by_event_name("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -287,16 +370,14 @@ DROP PREPARE stmt;
 -- TABLE EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME
 --
 
-SET @l1="CREATE TABLE performance_schema.events_waits_summary_global_by_event_name(";
-SET @l2="EVENT_NAME VARCHAR(128) not null,";
-SET @l3="COUNT_STAR BIGINT unsigned not null,";
-SET @l4="SUM_TIMER_WAIT BIGINT unsigned not null,";
-SET @l5="MIN_TIMER_WAIT BIGINT unsigned not null,";
-SET @l6="AVG_TIMER_WAIT BIGINT unsigned not null,";
-SET @l7="MAX_TIMER_WAIT BIGINT unsigned not null";
-SET @l8=")ENGINE=PERFORMANCE_SCHEMA;";
-
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5,@l6,@l7,@l8);
+SET @cmd="CREATE TABLE performance_schema.events_waits_summary_global_by_event_name("
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -307,13 +388,11 @@ DROP PREPARE stmt;
 -- TABLE FILE_INSTANCES
 --
 
-SET @l1="CREATE TABLE performance_schema.file_instances(";
-SET @l2="FILE_NAME VARCHAR(512) not null,";
-SET @l3="EVENT_NAME VARCHAR(128) not null,";
-SET @l4="OPEN_COUNT INTEGER unsigned not null";
-SET @l5=")ENGINE=PERFORMANCE_SCHEMA;";
-
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5);
+SET @cmd="CREATE TABLE performance_schema.file_instances("
+  "FILE_NAME VARCHAR(512) not null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "OPEN_COUNT INTEGER unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -324,15 +403,31 @@ DROP PREPARE stmt;
 -- TABLE FILE_SUMMARY_BY_EVENT_NAME
 --
 
-SET @l1="CREATE TABLE performance_schema.file_summary_by_event_name(";
-SET @l2="EVENT_NAME VARCHAR(128) not null,";
-SET @l3="COUNT_READ BIGINT unsigned not null,";
-SET @l4="COUNT_WRITE BIGINT unsigned not null,";
-SET @l5="SUM_NUMBER_OF_BYTES_READ BIGINT unsigned not null,";
-SET @l6="SUM_NUMBER_OF_BYTES_WRITE BIGINT unsigned not null";
-SET @l7=")ENGINE=PERFORMANCE_SCHEMA;";
-
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5,@l6,@l7);
+SET @cmd="CREATE TABLE performance_schema.file_summary_by_event_name("
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "COUNT_READ BIGINT unsigned not null,"
+  "SUM_TIMER_READ BIGINT unsigned not null,"
+  "MIN_TIMER_READ BIGINT unsigned not null,"
+  "AVG_TIMER_READ BIGINT unsigned not null,"
+  "MAX_TIMER_READ BIGINT unsigned not null,"
+  "SUM_NUMBER_OF_BYTES_READ BIGINT not null,"
+  "COUNT_WRITE BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE BIGINT unsigned not null,"
+  "SUM_NUMBER_OF_BYTES_WRITE BIGINT not null,"
+  "COUNT_MISC BIGINT unsigned not null,"
+  "SUM_TIMER_MISC BIGINT unsigned not null,"
+  "MIN_TIMER_MISC BIGINT unsigned not null,"
+  "AVG_TIMER_MISC BIGINT unsigned not null,"
+  "MAX_TIMER_MISC BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -343,16 +438,53 @@ DROP PREPARE stmt;
 -- TABLE FILE_SUMMARY_BY_INSTANCE
 --
 
-SET @l1="CREATE TABLE performance_schema.file_summary_by_instance(";
-SET @l2="FILE_NAME VARCHAR(512) not null,";
-SET @l3="EVENT_NAME VARCHAR(128) not null,";
-SET @l4="COUNT_READ BIGINT unsigned not null,";
-SET @l5="COUNT_WRITE BIGINT unsigned not null,";
-SET @l6="SUM_NUMBER_OF_BYTES_READ BIGINT unsigned not null,";
-SET @l7="SUM_NUMBER_OF_BYTES_WRITE BIGINT unsigned not null";
-SET @l8=")ENGINE=PERFORMANCE_SCHEMA;";
+SET @cmd="CREATE TABLE performance_schema.file_summary_by_instance("
+  "FILE_NAME VARCHAR(512) not null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "COUNT_READ BIGINT unsigned not null,"
+  "SUM_TIMER_READ BIGINT unsigned not null,"
+  "MIN_TIMER_READ BIGINT unsigned not null,"
+  "AVG_TIMER_READ BIGINT unsigned not null,"
+  "MAX_TIMER_READ BIGINT unsigned not null,"
+  "SUM_NUMBER_OF_BYTES_READ BIGINT not null,"
+  "COUNT_WRITE BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE BIGINT unsigned not null,"
+  "SUM_NUMBER_OF_BYTES_WRITE BIGINT not null,"
+  "COUNT_MISC BIGINT unsigned not null,"
+  "SUM_TIMER_MISC BIGINT unsigned not null,"
+  "MIN_TIMER_MISC BIGINT unsigned not null,"
+  "AVG_TIMER_MISC BIGINT unsigned not null,"
+  "MAX_TIMER_MISC BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5,@l6,@l7,@l8);
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+
+--
+-- TABLE SOCKET_INSTANCES
+--
+
+SET @cmd="CREATE TABLE performance_schema.socket_instances("
+  "EVENT_NAME VARCHAR(128) not null,"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null,"
+  "THREAD_ID INTEGER,"
+  "SOCKET_ID INTEGER not null,"
+  "IP VARCHAR(64) not null,"
+  "PORT INTEGER not null,"
+  "STATE ENUM('IDLE','ACTIVE') not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -360,16 +492,146 @@ EXECUTE stmt;
 DROP PREPARE stmt;
 
 --
+-- TABLE SOCKET_SUMMARY_BY_INSTANCE
+--
+
+SET @cmd="CREATE TABLE performance_schema.socket_summary_by_instance("
+  "EVENT_NAME VARCHAR(128) not null,"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "COUNT_READ BIGINT unsigned not null,"
+  "SUM_TIMER_READ BIGINT unsigned not null,"
+  "MIN_TIMER_READ BIGINT unsigned not null,"
+  "AVG_TIMER_READ BIGINT unsigned not null,"
+  "MAX_TIMER_READ BIGINT unsigned not null,"
+  "SUM_NUMBER_OF_BYTES_READ BIGINT unsigned not null,"
+  "COUNT_WRITE BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE BIGINT unsigned not null,"
+  "SUM_NUMBER_OF_BYTES_WRITE BIGINT unsigned not null,"
+  "COUNT_MISC BIGINT unsigned not null,"
+  "SUM_TIMER_MISC BIGINT unsigned not null,"
+  "MIN_TIMER_MISC BIGINT unsigned not null,"
+  "AVG_TIMER_MISC BIGINT unsigned not null,"
+  "MAX_TIMER_MISC BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE SOCKET_SUMMARY_BY_INSTANCE
+--
+
+SET @cmd="CREATE TABLE performance_schema.socket_summary_by_event_name("
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "COUNT_READ BIGINT unsigned not null,"
+  "SUM_TIMER_READ BIGINT unsigned not null,"
+  "MIN_TIMER_READ BIGINT unsigned not null,"
+  "AVG_TIMER_READ BIGINT unsigned not null,"
+  "MAX_TIMER_READ BIGINT unsigned not null,"
+  "SUM_NUMBER_OF_BYTES_READ BIGINT unsigned not null,"
+  "COUNT_WRITE BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE BIGINT unsigned not null,"
+  "SUM_NUMBER_OF_BYTES_WRITE BIGINT unsigned not null,"
+  "COUNT_MISC BIGINT unsigned not null,"
+  "SUM_TIMER_MISC BIGINT unsigned not null,"
+  "MIN_TIMER_MISC BIGINT unsigned not null,"
+  "AVG_TIMER_MISC BIGINT unsigned not null,"
+  "MAX_TIMER_MISC BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE HOST_CACHE
+--
+
+SET @cmd="CREATE TABLE performance_schema.host_cache("
+  "IP VARCHAR(64) not null,"
+  "HOST VARCHAR(255) collate utf8_bin,"
+  "HOST_VALIDATED ENUM ('YES', 'NO') not null,"
+  "SUM_CONNECT_ERRORS BIGINT not null,"
+  "COUNT_HOST_BLOCKED_ERRORS BIGINT not null,"
+  "COUNT_NAMEINFO_TRANSIENT_ERRORS BIGINT not null,"
+  "COUNT_NAMEINFO_PERMANENT_ERRORS BIGINT not null,"
+  "COUNT_FORMAT_ERRORS BIGINT not null,"
+  "COUNT_ADDRINFO_TRANSIENT_ERRORS BIGINT not null,"
+  "COUNT_ADDRINFO_PERMANENT_ERRORS BIGINT not null,"
+  "COUNT_FCRDNS_ERRORS BIGINT not null,"
+  "COUNT_HOST_ACL_ERRORS BIGINT not null,"
+  "COUNT_NO_AUTH_PLUGIN_ERRORS BIGINT not null,"
+  "COUNT_AUTH_PLUGIN_ERRORS BIGINT not null,"
+  "COUNT_HANDSHAKE_ERRORS BIGINT not null,"
+  "COUNT_PROXY_USER_ERRORS BIGINT not null,"
+  "COUNT_PROXY_USER_ACL_ERRORS BIGINT not null,"
+  "COUNT_AUTHENTICATION_ERRORS BIGINT not null,"
+  "COUNT_SSL_ERRORS BIGINT not null,"
+  "COUNT_MAX_USER_CONNECTIONS_ERRORS BIGINT not null,"
+  "COUNT_MAX_USER_CONNECTIONS_PER_HOUR_ERRORS BIGINT not null,"
+  "COUNT_DEFAULT_DATABASE_ERRORS BIGINT not null,"
+  "COUNT_INIT_CONNECT_ERRORS BIGINT not null,"
+  "COUNT_LOCAL_ERRORS BIGINT not null,"
+  "COUNT_UNKNOWN_ERRORS BIGINT not null,"
+  "FIRST_SEEN TIMESTAMP(0) default 0,"
+  "LAST_SEEN TIMESTAMP(0) default 0,"
+  "FIRST_ERROR_SEEN TIMESTAMP(0) null default 0,"
+  "LAST_ERROR_SEEN TIMESTAMP(0) null default 0"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+#PREPARE stmt FROM @str;
+#EXECUTE stmt;
+#DROP PREPARE stmt;
+
+--
 -- TABLE MUTEX_INSTANCES
 --
 
-SET @l1="CREATE TABLE performance_schema.mutex_instances(";
-SET @l2="NAME VARCHAR(128) not null,";
-SET @l3="OBJECT_INSTANCE_BEGIN BIGINT not null,";
-SET @l4="LOCKED_BY_THREAD_ID INTEGER";
-SET @l5=")ENGINE=PERFORMANCE_SCHEMA;";
+SET @cmd="CREATE TABLE performance_schema.mutex_instances("
+  "NAME VARCHAR(128) not null,"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null,"
+  "LOCKED_BY_THREAD_ID INTEGER"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE OBJECTS_SUMMARY_GLOBAL_BY_TYPE
+--
 
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5);
+SET @cmd="CREATE TABLE performance_schema.objects_summary_global_by_type("
+  "OBJECT_TYPE VARCHAR(64),"
+  "OBJECT_SCHEMA VARCHAR(64),"
+  "OBJECT_NAME VARCHAR(64),"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -380,14 +642,12 @@ DROP PREPARE stmt;
 -- TABLE PERFORMANCE_TIMERS
 --
 
-SET @l1="CREATE TABLE performance_schema.performance_timers(";
-SET @l2="TIMER_NAME ENUM ('CYCLE', 'NANOSECOND', 'MICROSECOND', 'MILLISECOND', 'TICK') not null,";
-SET @l3="TIMER_FREQUENCY BIGINT,";
-SET @l4="TIMER_RESOLUTION BIGINT,";
-SET @l5="TIMER_OVERHEAD BIGINT";
-SET @l6=") ENGINE=PERFORMANCE_SCHEMA;";
-
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5,@l6);
+SET @cmd="CREATE TABLE performance_schema.performance_timers("
+  "TIMER_NAME ENUM ('CYCLE', 'NANOSECOND', 'MICROSECOND', 'MILLISECOND', 'TICK') not null,"
+  "TIMER_FREQUENCY BIGINT,"
+  "TIMER_RESOLUTION BIGINT,"
+  "TIMER_OVERHEAD BIGINT"
+  ") ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -398,14 +658,27 @@ DROP PREPARE stmt;
 -- TABLE RWLOCK_INSTANCES
 --
 
-SET @l1="CREATE TABLE performance_schema.rwlock_instances(";
-SET @l2="NAME VARCHAR(128) not null,";
-SET @l3="OBJECT_INSTANCE_BEGIN BIGINT not null,";
-SET @l4="WRITE_LOCKED_BY_THREAD_ID INTEGER,";
-SET @l5="READ_LOCKED_BY_COUNT INTEGER unsigned not null";
-SET @l6=")ENGINE=PERFORMANCE_SCHEMA;";
+SET @cmd="CREATE TABLE performance_schema.rwlock_instances("
+  "NAME VARCHAR(128) not null,"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned not null,"
+  "WRITE_LOCKED_BY_THREAD_ID INTEGER,"
+  "READ_LOCKED_BY_COUNT INTEGER unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE SETUP_ACTORS
+--
 
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5,@l6);
+SET @cmd="CREATE TABLE performance_schema.setup_actors("
+  "HOST CHAR(60) collate utf8_bin default '%' not null,"
+  "USER CHAR(16) collate utf8_bin default '%' not null,"
+  "ROLE CHAR(16) collate utf8_bin default '%' not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -416,12 +689,10 @@ DROP PREPARE stmt;
 -- TABLE SETUP_CONSUMERS
 --
 
-SET @l1="CREATE TABLE performance_schema.setup_consumers(";
-SET @l2="NAME VARCHAR(64) not null,";
-SET @l3="ENABLED ENUM ('YES', 'NO') not null";
-SET @l4=")ENGINE=PERFORMANCE_SCHEMA;";
-
-SET @cmd=concat(@l1,@l2,@l3,@l4);
+SET @cmd="CREATE TABLE performance_schema.setup_consumers("
+  "NAME VARCHAR(64) not null,"
+  "ENABLED ENUM ('YES', 'NO') not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -432,13 +703,28 @@ DROP PREPARE stmt;
 -- TABLE SETUP_INSTRUMENTS
 --
 
-SET @l1="CREATE TABLE performance_schema.setup_instruments(";
-SET @l2="NAME VARCHAR(128) not null,";
-SET @l3="ENABLED ENUM ('YES', 'NO') not null,";
-SET @l4="TIMED ENUM ('YES', 'NO') not null";
-SET @l5=")ENGINE=PERFORMANCE_SCHEMA;";
+SET @cmd="CREATE TABLE performance_schema.setup_instruments("
+  "NAME VARCHAR(128) not null,"
+  "ENABLED ENUM ('YES', 'NO') not null,"
+  "TIMED ENUM ('YES', 'NO') not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5);
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE SETUP_OBJECTS
+--
+
+SET @cmd="CREATE TABLE performance_schema.setup_objects("
+  "OBJECT_TYPE ENUM ('TABLE') not null default 'TABLE',"
+  "OBJECT_SCHEMA VARCHAR(64) default '%',"
+  "OBJECT_NAME VARCHAR(64) not null default '%',"
+  "ENABLED ENUM ('YES', 'NO') not null default 'YES',"
+  "TIMED ENUM ('YES', 'NO') not null default 'YES'"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -449,12 +735,196 @@ DROP PREPARE stmt;
 -- TABLE SETUP_TIMERS
 --
 
-SET @l1="CREATE TABLE performance_schema.setup_timers(";
-SET @l2="NAME VARCHAR(64) not null,";
-SET @l3="TIMER_NAME ENUM ('CYCLE', 'NANOSECOND', 'MICROSECOND', 'MILLISECOND', 'TICK') not null";
-SET @l4=")ENGINE=PERFORMANCE_SCHEMA;";
+SET @cmd="CREATE TABLE performance_schema.setup_timers("
+  "NAME VARCHAR(64) not null,"
+  "TIMER_NAME ENUM ('CYCLE', 'NANOSECOND', 'MICROSECOND', 'MILLISECOND', 'TICK') not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE TABLE_IO_WAITS_SUMMARY_BY_INDEX_USAGE
+--
+
+SET @cmd="CREATE TABLE performance_schema.table_io_waits_summary_by_index_usage("
+  "OBJECT_TYPE VARCHAR(64),"
+  "OBJECT_SCHEMA VARCHAR(64),"
+  "OBJECT_NAME VARCHAR(64),"
+  "INDEX_NAME VARCHAR(64),"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "COUNT_READ BIGINT unsigned not null,"
+  "SUM_TIMER_READ BIGINT unsigned not null,"
+  "MIN_TIMER_READ BIGINT unsigned not null,"
+  "AVG_TIMER_READ BIGINT unsigned not null,"
+  "MAX_TIMER_READ BIGINT unsigned not null,"
+  "COUNT_WRITE BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE BIGINT unsigned not null,"
+  "COUNT_FETCH BIGINT unsigned not null,"
+  "SUM_TIMER_FETCH BIGINT unsigned not null,"
+  "MIN_TIMER_FETCH BIGINT unsigned not null,"
+  "AVG_TIMER_FETCH BIGINT unsigned not null,"
+  "MAX_TIMER_FETCH BIGINT unsigned not null,"
+  "COUNT_INSERT BIGINT unsigned not null,"
+  "SUM_TIMER_INSERT BIGINT unsigned not null,"
+  "MIN_TIMER_INSERT BIGINT unsigned not null,"
+  "AVG_TIMER_INSERT BIGINT unsigned not null,"
+  "MAX_TIMER_INSERT BIGINT unsigned not null,"
+  "COUNT_UPDATE BIGINT unsigned not null,"
+  "SUM_TIMER_UPDATE BIGINT unsigned not null,"
+  "MIN_TIMER_UPDATE BIGINT unsigned not null,"
+  "AVG_TIMER_UPDATE BIGINT unsigned not null,"
+  "MAX_TIMER_UPDATE BIGINT unsigned not null,"
+  "COUNT_DELETE BIGINT unsigned not null,"
+  "SUM_TIMER_DELETE BIGINT unsigned not null,"
+  "MIN_TIMER_DELETE BIGINT unsigned not null,"
+  "AVG_TIMER_DELETE BIGINT unsigned not null,"
+  "MAX_TIMER_DELETE BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
-SET @cmd=concat(@l1,@l2,@l3,@l4);
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE TABLE_IO_WAITS_SUMMARY_BY_TABLE
+--
+
+SET @cmd="CREATE TABLE performance_schema.table_io_waits_summary_by_table("
+  "OBJECT_TYPE VARCHAR(64),"
+  "OBJECT_SCHEMA VARCHAR(64),"
+  "OBJECT_NAME VARCHAR(64),"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "COUNT_READ BIGINT unsigned not null,"
+  "SUM_TIMER_READ BIGINT unsigned not null,"
+  "MIN_TIMER_READ BIGINT unsigned not null,"
+  "AVG_TIMER_READ BIGINT unsigned not null,"
+  "MAX_TIMER_READ BIGINT unsigned not null,"
+  "COUNT_WRITE BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE BIGINT unsigned not null,"
+  "COUNT_FETCH BIGINT unsigned not null,"
+  "SUM_TIMER_FETCH BIGINT unsigned not null,"
+  "MIN_TIMER_FETCH BIGINT unsigned not null,"
+  "AVG_TIMER_FETCH BIGINT unsigned not null,"
+  "MAX_TIMER_FETCH BIGINT unsigned not null,"
+  "COUNT_INSERT BIGINT unsigned not null,"
+  "SUM_TIMER_INSERT BIGINT unsigned not null,"
+  "MIN_TIMER_INSERT BIGINT unsigned not null,"
+  "AVG_TIMER_INSERT BIGINT unsigned not null,"
+  "MAX_TIMER_INSERT BIGINT unsigned not null,"
+  "COUNT_UPDATE BIGINT unsigned not null,"
+  "SUM_TIMER_UPDATE BIGINT unsigned not null,"
+  "MIN_TIMER_UPDATE BIGINT unsigned not null,"
+  "AVG_TIMER_UPDATE BIGINT unsigned not null,"
+  "MAX_TIMER_UPDATE BIGINT unsigned not null,"
+  "COUNT_DELETE BIGINT unsigned not null,"
+  "SUM_TIMER_DELETE BIGINT unsigned not null,"
+  "MIN_TIMER_DELETE BIGINT unsigned not null,"
+  "AVG_TIMER_DELETE BIGINT unsigned not null,"
+  "MAX_TIMER_DELETE BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE TABLE_LOCK_WAITS_SUMMARY_BY_TABLE
+--
+
+SET @cmd="CREATE TABLE performance_schema.table_lock_waits_summary_by_table("
+  "OBJECT_TYPE VARCHAR(64),"
+  "OBJECT_SCHEMA VARCHAR(64),"
+  "OBJECT_NAME VARCHAR(64),"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "COUNT_READ BIGINT unsigned not null,"
+  "SUM_TIMER_READ BIGINT unsigned not null,"
+  "MIN_TIMER_READ BIGINT unsigned not null,"
+  "AVG_TIMER_READ BIGINT unsigned not null,"
+  "MAX_TIMER_READ BIGINT unsigned not null,"
+  "COUNT_WRITE BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE BIGINT unsigned not null,"
+  "COUNT_READ_NORMAL BIGINT unsigned not null,"
+  "SUM_TIMER_READ_NORMAL BIGINT unsigned not null,"
+  "MIN_TIMER_READ_NORMAL BIGINT unsigned not null,"
+  "AVG_TIMER_READ_NORMAL BIGINT unsigned not null,"
+  "MAX_TIMER_READ_NORMAL BIGINT unsigned not null,"
+  "COUNT_READ_WITH_SHARED_LOCKS BIGINT unsigned not null,"
+  "SUM_TIMER_READ_WITH_SHARED_LOCKS BIGINT unsigned not null,"
+  "MIN_TIMER_READ_WITH_SHARED_LOCKS BIGINT unsigned not null,"
+  "AVG_TIMER_READ_WITH_SHARED_LOCKS BIGINT unsigned not null,"
+  "MAX_TIMER_READ_WITH_SHARED_LOCKS BIGINT unsigned not null,"
+  "COUNT_READ_HIGH_PRIORITY BIGINT unsigned not null,"
+  "SUM_TIMER_READ_HIGH_PRIORITY BIGINT unsigned not null,"
+  "MIN_TIMER_READ_HIGH_PRIORITY BIGINT unsigned not null,"
+  "AVG_TIMER_READ_HIGH_PRIORITY BIGINT unsigned not null,"
+  "MAX_TIMER_READ_HIGH_PRIORITY BIGINT unsigned not null,"
+  "COUNT_READ_NO_INSERT BIGINT unsigned not null,"
+  "SUM_TIMER_READ_NO_INSERT BIGINT unsigned not null,"
+  "MIN_TIMER_READ_NO_INSERT BIGINT unsigned not null,"
+  "AVG_TIMER_READ_NO_INSERT BIGINT unsigned not null,"
+  "MAX_TIMER_READ_NO_INSERT BIGINT unsigned not null,"
+  "COUNT_READ_EXTERNAL BIGINT unsigned not null,"
+  "SUM_TIMER_READ_EXTERNAL BIGINT unsigned not null,"
+  "MIN_TIMER_READ_EXTERNAL BIGINT unsigned not null,"
+  "AVG_TIMER_READ_EXTERNAL BIGINT unsigned not null,"
+  "MAX_TIMER_READ_EXTERNAL BIGINT unsigned not null,"
+  "COUNT_WRITE_ALLOW_WRITE BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE_ALLOW_WRITE BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE_ALLOW_WRITE BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE_ALLOW_WRITE BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE_ALLOW_WRITE BIGINT unsigned not null,"
+  "COUNT_WRITE_CONCURRENT_INSERT BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE_CONCURRENT_INSERT BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE_CONCURRENT_INSERT BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE_CONCURRENT_INSERT BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE_CONCURRENT_INSERT BIGINT unsigned not null,"
+  "COUNT_WRITE_DELAYED BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE_DELAYED BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE_DELAYED BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE_DELAYED BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE_DELAYED BIGINT unsigned not null,"
+  "COUNT_WRITE_LOW_PRIORITY BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE_LOW_PRIORITY BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE_LOW_PRIORITY BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE_LOW_PRIORITY BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE_LOW_PRIORITY BIGINT unsigned not null,"
+  "COUNT_WRITE_NORMAL BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE_NORMAL BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE_NORMAL BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE_NORMAL BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE_NORMAL BIGINT unsigned not null,"
+  "COUNT_WRITE_EXTERNAL BIGINT unsigned not null,"
+  "SUM_TIMER_WRITE_EXTERNAL BIGINT unsigned not null,"
+  "MIN_TIMER_WRITE_EXTERNAL BIGINT unsigned not null,"
+  "AVG_TIMER_WRITE_EXTERNAL BIGINT unsigned not null,"
+  "MAX_TIMER_WRITE_EXTERNAL BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
@@ -465,13 +935,616 @@ DROP PREPARE stmt;
 -- TABLE THREADS
 --
 
-SET @l1="CREATE TABLE performance_schema.threads(";
-SET @l2="THREAD_ID INTEGER not null,";
-SET @l3="PROCESSLIST_ID INTEGER,";
-SET @l4="NAME VARCHAR(128) not null";
-SET @l5=")ENGINE=PERFORMANCE_SCHEMA;";
+SET @cmd="CREATE TABLE performance_schema.threads("
+  "THREAD_ID INTEGER not null,"
+  "NAME VARCHAR(128) not null,"
+  "TYPE VARCHAR(10) not null,"
+  "PROCESSLIST_ID INTEGER,"
+  "PROCESSLIST_USER VARCHAR(16),"
+  "PROCESSLIST_HOST VARCHAR(60),"
+  "PROCESSLIST_DB VARCHAR(64),"
+  "PROCESSLIST_COMMAND VARCHAR(16),"
+  "PROCESSLIST_TIME BIGINT,"
+  "PROCESSLIST_STATE VARCHAR(64),"
+  "PROCESSLIST_INFO LONGTEXT,"
+  "PARENT_THREAD_ID INTEGER,"
+  "ROLE VARCHAR(64),"
+  "INSTRUMENTED ENUM ('YES', 'NO') not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STAGES_CURRENT
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_stages_current("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_ID BIGINT unsigned not null,"
+  "END_EVENT_ID BIGINT unsigned,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "SOURCE VARCHAR(64),"
+  "TIMER_START BIGINT unsigned,"
+  "TIMER_END BIGINT unsigned,"
+  "TIMER_WAIT BIGINT unsigned,"
+  "NESTING_EVENT_ID BIGINT unsigned,"
+  "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT')"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STAGES_HISTORY
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_stages_history("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_ID BIGINT unsigned not null,"
+  "END_EVENT_ID BIGINT unsigned,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "SOURCE VARCHAR(64),"
+  "TIMER_START BIGINT unsigned,"
+  "TIMER_END BIGINT unsigned,"
+  "TIMER_WAIT BIGINT unsigned,"
+  "NESTING_EVENT_ID BIGINT unsigned,"
+  "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT')"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STAGES_HISTORY_LONG
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_stages_history_long("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_ID BIGINT unsigned not null,"
+  "END_EVENT_ID BIGINT unsigned,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "SOURCE VARCHAR(64),"
+  "TIMER_START BIGINT unsigned,"
+  "TIMER_END BIGINT unsigned,"
+  "TIMER_WAIT BIGINT unsigned,"
+  "NESTING_EVENT_ID BIGINT unsigned,"
+  "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT')"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_stages_summary_by_thread_by_event_name("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_stages_summary_by_host_by_event_name("
+  "HOST CHAR(60) collate utf8_bin default null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_stages_summary_by_user_by_event_name("
+  "USER CHAR(16) collate utf8_bin default null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_stages_summary_by_account_by_event_name("
+  "USER CHAR(16) collate utf8_bin default null,"
+  "HOST CHAR(60) collate utf8_bin default null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_stages_summary_global_by_event_name("
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STATEMENTS_CURRENT
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_statements_current("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_ID BIGINT unsigned not null,"
+  "END_EVENT_ID BIGINT unsigned,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "SOURCE VARCHAR(64),"
+  "TIMER_START BIGINT unsigned,"
+  "TIMER_END BIGINT unsigned,"
+  "TIMER_WAIT BIGINT unsigned,"
+  "LOCK_TIME bigint unsigned not null,"
+  "SQL_TEXT LONGTEXT,"
+  "DIGEST VARCHAR(32),"
+  "DIGEST_TEXT LONGTEXT,"
+  "CURRENT_SCHEMA VARCHAR(64),"
+  "OBJECT_TYPE VARCHAR(64),"
+  "OBJECT_SCHEMA VARCHAR(64),"
+  "OBJECT_NAME VARCHAR(64),"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned,"
+  "MYSQL_ERRNO INTEGER,"
+  "RETURNED_SQLSTATE VARCHAR(5),"
+  "MESSAGE_TEXT VARCHAR(128),"
+  "ERRORS BIGINT unsigned not null,"
+  "WARNINGS BIGINT unsigned not null,"
+  "ROWS_AFFECTED BIGINT unsigned not null,"
+  "ROWS_SENT BIGINT unsigned not null,"
+  "ROWS_EXAMINED BIGINT unsigned not null,"
+  "CREATED_TMP_DISK_TABLES BIGINT unsigned not null,"
+  "CREATED_TMP_TABLES BIGINT unsigned not null,"
+  "SELECT_FULL_JOIN BIGINT unsigned not null,"
+  "SELECT_FULL_RANGE_JOIN BIGINT unsigned not null,"
+  "SELECT_RANGE BIGINT unsigned not null,"
+  "SELECT_RANGE_CHECK BIGINT unsigned not null,"
+  "SELECT_SCAN BIGINT unsigned not null,"
+  "SORT_MERGE_PASSES BIGINT unsigned not null,"
+  "SORT_RANGE BIGINT unsigned not null,"
+  "SORT_ROWS BIGINT unsigned not null,"
+  "SORT_SCAN BIGINT unsigned not null,"
+  "NO_INDEX_USED BIGINT unsigned not null,"
+  "NO_GOOD_INDEX_USED BIGINT unsigned not null,"
+  "NESTING_EVENT_ID BIGINT unsigned,"
+  "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT')"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STATEMENTS_HISTORY
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_statements_history("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_ID BIGINT unsigned not null,"
+  "END_EVENT_ID BIGINT unsigned,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "SOURCE VARCHAR(64),"
+  "TIMER_START BIGINT unsigned,"
+  "TIMER_END BIGINT unsigned,"
+  "TIMER_WAIT BIGINT unsigned,"
+  "LOCK_TIME bigint unsigned not null,"
+  "SQL_TEXT LONGTEXT,"
+  "DIGEST VARCHAR(32),"
+  "DIGEST_TEXT LONGTEXT,"
+  "CURRENT_SCHEMA VARCHAR(64),"
+  "OBJECT_TYPE VARCHAR(64),"
+  "OBJECT_SCHEMA VARCHAR(64),"
+  "OBJECT_NAME VARCHAR(64),"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned,"
+  "MYSQL_ERRNO INTEGER,"
+  "RETURNED_SQLSTATE VARCHAR(5),"
+  "MESSAGE_TEXT VARCHAR(128),"
+  "ERRORS BIGINT unsigned not null,"
+  "WARNINGS BIGINT unsigned not null,"
+  "ROWS_AFFECTED BIGINT unsigned not null,"
+  "ROWS_SENT BIGINT unsigned not null,"
+  "ROWS_EXAMINED BIGINT unsigned not null,"
+  "CREATED_TMP_DISK_TABLES BIGINT unsigned not null,"
+  "CREATED_TMP_TABLES BIGINT unsigned not null,"
+  "SELECT_FULL_JOIN BIGINT unsigned not null,"
+  "SELECT_FULL_RANGE_JOIN BIGINT unsigned not null,"
+  "SELECT_RANGE BIGINT unsigned not null,"
+  "SELECT_RANGE_CHECK BIGINT unsigned not null,"
+  "SELECT_SCAN BIGINT unsigned not null,"
+  "SORT_MERGE_PASSES BIGINT unsigned not null,"
+  "SORT_RANGE BIGINT unsigned not null,"
+  "SORT_ROWS BIGINT unsigned not null,"
+  "SORT_SCAN BIGINT unsigned not null,"
+  "NO_INDEX_USED BIGINT unsigned not null,"
+  "NO_GOOD_INDEX_USED BIGINT unsigned not null,"
+  "NESTING_EVENT_ID BIGINT unsigned,"
+  "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT')"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STATEMENTS_HISTORY_LONG
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_statements_history_long("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_ID BIGINT unsigned not null,"
+  "END_EVENT_ID BIGINT unsigned,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "SOURCE VARCHAR(64),"
+  "TIMER_START BIGINT unsigned,"
+  "TIMER_END BIGINT unsigned,"
+  "TIMER_WAIT BIGINT unsigned,"
+  "LOCK_TIME bigint unsigned not null,"
+  "SQL_TEXT LONGTEXT,"
+  "DIGEST VARCHAR(32),"
+  "DIGEST_TEXT LONGTEXT,"
+  "CURRENT_SCHEMA VARCHAR(64),"
+  "OBJECT_TYPE VARCHAR(64),"
+  "OBJECT_SCHEMA VARCHAR(64),"
+  "OBJECT_NAME VARCHAR(64),"
+  "OBJECT_INSTANCE_BEGIN BIGINT unsigned,"
+  "MYSQL_ERRNO INTEGER,"
+  "RETURNED_SQLSTATE VARCHAR(5),"
+  "MESSAGE_TEXT VARCHAR(128),"
+  "ERRORS BIGINT unsigned not null,"
+  "WARNINGS BIGINT unsigned not null,"
+  "ROWS_AFFECTED BIGINT unsigned not null,"
+  "ROWS_SENT BIGINT unsigned not null,"
+  "ROWS_EXAMINED BIGINT unsigned not null,"
+  "CREATED_TMP_DISK_TABLES BIGINT unsigned not null,"
+  "CREATED_TMP_TABLES BIGINT unsigned not null,"
+  "SELECT_FULL_JOIN BIGINT unsigned not null,"
+  "SELECT_FULL_RANGE_JOIN BIGINT unsigned not null,"
+  "SELECT_RANGE BIGINT unsigned not null,"
+  "SELECT_RANGE_CHECK BIGINT unsigned not null,"
+  "SELECT_SCAN BIGINT unsigned not null,"
+  "SORT_MERGE_PASSES BIGINT unsigned not null,"
+  "SORT_RANGE BIGINT unsigned not null,"
+  "SORT_ROWS BIGINT unsigned not null,"
+  "SORT_SCAN BIGINT unsigned not null,"
+  "NO_INDEX_USED BIGINT unsigned not null,"
+  "NO_GOOD_INDEX_USED BIGINT unsigned not null,"
+  "NESTING_EVENT_ID BIGINT unsigned,"
+  "NESTING_EVENT_TYPE ENUM('STATEMENT', 'STAGE', 'WAIT')"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_statements_summary_by_thread_by_event_name("
+  "THREAD_ID INTEGER not null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "SUM_LOCK_TIME BIGINT unsigned not null,"
+  "SUM_ERRORS BIGINT unsigned not null,"
+  "SUM_WARNINGS BIGINT unsigned not null,"
+  "SUM_ROWS_AFFECTED BIGINT unsigned not null,"
+  "SUM_ROWS_SENT BIGINT unsigned not null,"
+  "SUM_ROWS_EXAMINED BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_DISK_TABLES BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_TABLES BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_RANGE_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE_CHECK BIGINT unsigned not null,"
+  "SUM_SELECT_SCAN BIGINT unsigned not null,"
+  "SUM_SORT_MERGE_PASSES BIGINT unsigned not null,"
+  "SUM_SORT_RANGE BIGINT unsigned not null,"
+  "SUM_SORT_ROWS BIGINT unsigned not null,"
+  "SUM_SORT_SCAN BIGINT unsigned not null,"
+  "SUM_NO_INDEX_USED BIGINT unsigned not null,"
+  "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_statements_summary_by_host_by_event_name("
+  "HOST CHAR(60) collate utf8_bin default null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "SUM_LOCK_TIME BIGINT unsigned not null,"
+  "SUM_ERRORS BIGINT unsigned not null,"
+  "SUM_WARNINGS BIGINT unsigned not null,"
+  "SUM_ROWS_AFFECTED BIGINT unsigned not null,"
+  "SUM_ROWS_SENT BIGINT unsigned not null,"
+  "SUM_ROWS_EXAMINED BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_DISK_TABLES BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_TABLES BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_RANGE_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE_CHECK BIGINT unsigned not null,"
+  "SUM_SELECT_SCAN BIGINT unsigned not null,"
+  "SUM_SORT_MERGE_PASSES BIGINT unsigned not null,"
+  "SUM_SORT_RANGE BIGINT unsigned not null,"
+  "SUM_SORT_ROWS BIGINT unsigned not null,"
+  "SUM_SORT_SCAN BIGINT unsigned not null,"
+  "SUM_NO_INDEX_USED BIGINT unsigned not null,"
+  "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_statements_summary_by_user_by_event_name("
+  "USER CHAR(16) collate utf8_bin default null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "SUM_LOCK_TIME BIGINT unsigned not null,"
+  "SUM_ERRORS BIGINT unsigned not null,"
+  "SUM_WARNINGS BIGINT unsigned not null,"
+  "SUM_ROWS_AFFECTED BIGINT unsigned not null,"
+  "SUM_ROWS_SENT BIGINT unsigned not null,"
+  "SUM_ROWS_EXAMINED BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_DISK_TABLES BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_TABLES BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_RANGE_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE_CHECK BIGINT unsigned not null,"
+  "SUM_SELECT_SCAN BIGINT unsigned not null,"
+  "SUM_SORT_MERGE_PASSES BIGINT unsigned not null,"
+  "SUM_SORT_RANGE BIGINT unsigned not null,"
+  "SUM_SORT_ROWS BIGINT unsigned not null,"
+  "SUM_SORT_SCAN BIGINT unsigned not null,"
+  "SUM_NO_INDEX_USED BIGINT unsigned not null,"
+  "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_statements_summary_by_account_by_event_name("
+  "USER CHAR(16) collate utf8_bin default null,"
+  "HOST CHAR(60) collate utf8_bin default null,"
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "SUM_LOCK_TIME BIGINT unsigned not null,"
+  "SUM_ERRORS BIGINT unsigned not null,"
+  "SUM_WARNINGS BIGINT unsigned not null,"
+  "SUM_ROWS_AFFECTED BIGINT unsigned not null,"
+  "SUM_ROWS_SENT BIGINT unsigned not null,"
+  "SUM_ROWS_EXAMINED BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_DISK_TABLES BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_TABLES BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_RANGE_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE_CHECK BIGINT unsigned not null,"
+  "SUM_SELECT_SCAN BIGINT unsigned not null,"
+  "SUM_SORT_MERGE_PASSES BIGINT unsigned not null,"
+  "SUM_SORT_RANGE BIGINT unsigned not null,"
+  "SUM_SORT_ROWS BIGINT unsigned not null,"
+  "SUM_SORT_SCAN BIGINT unsigned not null,"
+  "SUM_NO_INDEX_USED BIGINT unsigned not null,"
+  "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_statements_summary_global_by_event_name("
+  "EVENT_NAME VARCHAR(128) not null,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "SUM_LOCK_TIME BIGINT unsigned not null,"
+  "SUM_ERRORS BIGINT unsigned not null,"
+  "SUM_WARNINGS BIGINT unsigned not null,"
+  "SUM_ROWS_AFFECTED BIGINT unsigned not null,"
+  "SUM_ROWS_SENT BIGINT unsigned not null,"
+  "SUM_ROWS_EXAMINED BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_DISK_TABLES BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_TABLES BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_RANGE_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE_CHECK BIGINT unsigned not null,"
+  "SUM_SELECT_SCAN BIGINT unsigned not null,"
+  "SUM_SORT_MERGE_PASSES BIGINT unsigned not null,"
+  "SUM_SORT_RANGE BIGINT unsigned not null,"
+  "SUM_SORT_ROWS BIGINT unsigned not null,"
+  "SUM_SORT_SCAN BIGINT unsigned not null,"
+  "SUM_NO_INDEX_USED BIGINT unsigned not null,"
+  "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE HOSTS
+--
+
+SET @cmd="CREATE TABLE performance_schema.hosts("
+  "HOST CHAR(60) collate utf8_bin default null,"
+  "CURRENT_CONNECTIONS bigint not null,"
+  "TOTAL_CONNECTIONS bigint not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE USERS
+--
+
+SET @cmd="CREATE TABLE performance_schema.users("
+  "USER CHAR(16) collate utf8_bin default null,"
+  "CURRENT_CONNECTIONS bigint not null,"
+  "TOTAL_CONNECTIONS bigint not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE ACCOUNTS
+--
+
+SET @cmd="CREATE TABLE performance_schema.accounts("
+  "USER CHAR(16) collate utf8_bin default null,"
+  "HOST CHAR(60) collate utf8_bin default null,"
+  "CURRENT_CONNECTIONS bigint not null,"
+  "TOTAL_CONNECTIONS bigint not null"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
+
+SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
+PREPARE stmt FROM @str;
+EXECUTE stmt;
+DROP PREPARE stmt;
+
+--
+-- TABLE EVENTS_STATEMENTS_SUMMARY_BY_DIGEST
+--
+
+SET @cmd="CREATE TABLE performance_schema.events_statements_summary_by_digest("
+  "DIGEST VARCHAR(32),"
+  "DIGEST_TEXT LONGTEXT,"
+  "COUNT_STAR BIGINT unsigned not null,"
+  "SUM_TIMER_WAIT BIGINT unsigned not null,"
+  "MIN_TIMER_WAIT BIGINT unsigned not null,"
+  "AVG_TIMER_WAIT BIGINT unsigned not null,"
+  "MAX_TIMER_WAIT BIGINT unsigned not null,"
+  "SUM_LOCK_TIME BIGINT unsigned not null,"
+  "SUM_ERRORS BIGINT unsigned not null,"
+  "SUM_WARNINGS BIGINT unsigned not null,"
+  "SUM_ROWS_AFFECTED BIGINT unsigned not null,"
+  "SUM_ROWS_SENT BIGINT unsigned not null,"
+  "SUM_ROWS_EXAMINED BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_DISK_TABLES BIGINT unsigned not null,"
+  "SUM_CREATED_TMP_TABLES BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_FULL_RANGE_JOIN BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE BIGINT unsigned not null,"
+  "SUM_SELECT_RANGE_CHECK BIGINT unsigned not null,"
+  "SUM_SELECT_SCAN BIGINT unsigned not null,"
+  "SUM_SORT_MERGE_PASSES BIGINT unsigned not null,"
+  "SUM_SORT_RANGE BIGINT unsigned not null,"
+  "SUM_SORT_ROWS BIGINT unsigned not null,"
+  "SUM_SORT_SCAN BIGINT unsigned not null,"
+  "SUM_NO_INDEX_USED BIGINT unsigned not null,"
+  "SUM_NO_GOOD_INDEX_USED BIGINT unsigned not null,"
+  "FIRST_SEEN TIMESTAMP(0) default 0,"
+  "LAST_SEEN TIMESTAMP(0) default 0"
+  ")ENGINE=PERFORMANCE_SCHEMA;";
 
-SET @cmd=concat(@l1,@l2,@l3,@l4,@l5);
 
 SET @str = IF(@have_pfs = 1, @cmd, 'SET @dummy = 0');
 PREPARE stmt FROM @str;
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index ecf91fcf043..805f7b34e04 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -59,7 +59,8 @@ SET (SQL_SOURCE
                sql_cache.cc sql_class.cc sql_client.cc sql_crypt.cc sql_crypt.h 
                sql_cursor.cc sql_db.cc sql_delete.cc sql_derived.cc sql_do.cc 
                sql_error.cc sql_handler.cc sql_help.cc sql_insert.cc sql_lex.cc 
-               sql_list.cc sql_load.cc sql_manager.cc sql_parse.cc
+               sql_list.cc sql_load.cc sql_manager.cc
+               sql_parse.cc sql_bootstrap.cc sql_bootstrap.h
                sql_partition.cc sql_plugin.cc sql_prepare.cc sql_rename.cc 
                debug_sync.cc debug_sync.h
                sql_repl.cc sql_select.cc sql_show.cc sql_state.c sql_string.cc
diff --git a/sql/handler.h b/sql/handler.h
index ee1731af563..981bf9aec6a 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1069,7 +1069,22 @@ inline LEX_STRING *hton_name(const handlerton *hton)
 #define HTON_NOT_USER_SELECTABLE     (1 << 5)
 #define HTON_TEMPORARY_NOT_SUPPORTED (1 << 6) //Having temporary tables not supported
 #define HTON_SUPPORT_LOG_TABLES      (1 << 7) //Engine supports log tables
-#define HTON_NO_PARTITION            (1 << 8) //You can not partition these tables
+#define HTON_NO_PARTITION            (1 << 8) //Not partition of these tables
+
+/*
+  This flag should be set when deciding that the engine does not allow
+  row based binary logging (RBL) optimizations.
+
+  Currently, setting this flag, means that table's read/write_set will
+  be left untouched when logging changes to tables in this engine. In
+  practice this means that the server will not mess around with
+  table->write_set and/or table->read_set when using RBL and deciding
+  whether to log full or minimal rows.
+
+  It's valuable for instance for virtual tables, eg: Performance
+  Schema which have no meaning for replication.
+*/
+#define HTON_NO_BINLOG_ROW_OPT       (1 << 9)
 
 class Ha_trx_info;
 
@@ -1446,21 +1461,24 @@ typedef struct st_range_seq_if
 
 typedef bool (*SKIP_INDEX_TUPLE_FUNC) (range_seq_t seq, range_id_t range_info);
 
-class COST_VECT
+class Cost_estimate
 { 
 public:
   double io_count;     /* number of I/O                 */
   double avg_io_cost;  /* cost of an average I/O oper.  */
   double cpu_cost;     /* cost of operations in CPU     */
-  double mem_cost;     /* cost of used memory           */ 
   double import_cost;  /* cost of remote operations     */
+  double mem_cost;     /* cost of used memory           */ 
   
   enum { IO_COEFF=1 };
   enum { CPU_COEFF=1 };
   enum { MEM_COEFF=1 };
   enum { IMPORT_COEFF=1 };
 
-  COST_VECT() {}                              // keep gcc happy
+  Cost_estimate()
+  {
+    reset();
+  }
 
   double total_cost() 
   {
@@ -1468,7 +1486,17 @@ public:
            MEM_COEFF*mem_cost + IMPORT_COEFF*import_cost;
   }
 
-  void zero()
+  /**
+    Whether or not all costs in the object are zero
+    
+    @return true if all costs are zero, false otherwise
+  */
+  bool is_zero() const
+  { 
+    return !(io_count || cpu_cost || import_cost || mem_cost);
+  }
+
+  void reset()
   {
     avg_io_cost= 1.0;
     io_count= cpu_cost= mem_cost= import_cost= 0.0;
@@ -1482,13 +1510,14 @@ public:
     /* Don't multiply mem_cost */
   }
 
-  void add(const COST_VECT* cost)
+  void add(const Cost_estimate* cost)
   {
     double io_count_sum= io_count + cost->io_count;
     add_io(cost->io_count, cost->avg_io_cost);
     io_count= io_count_sum;
     cpu_cost += cost->cpu_cost;
   }
+
   void add_io(double add_io_cnt, double add_avg_cost)
   {
     /* In edge cases add_io_cnt may be zero */
@@ -1501,20 +1530,28 @@ public:
     }
   }
 
+  /// Add to CPU cost
+  void add_cpu(double add_cpu_cost) { cpu_cost+= add_cpu_cost; }
+
+  /// Add to import cost
+  void add_import(double add_import_cost) { import_cost+= add_import_cost; }
+
+  /// Add to memory cost
+  void add_mem(double add_mem_cost) { mem_cost+= add_mem_cost; }
+
   /*
     To be used when we go from old single value-based cost calculations to
-    the new COST_VECT-based.
+    the new Cost_estimate-based.
   */
   void convert_from_cost(double cost)
   {
-    zero();
-    avg_io_cost= 1.0;
+    reset();
     io_count= cost;
   }
 };
 
 void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, 
-                         COST_VECT *cost);
+                         Cost_estimate *cost);
 
 /*
   Indicates that all scanned ranges will be singlepoint (aka equality) ranges.
@@ -2156,10 +2193,11 @@ public:
   virtual ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                               void *seq_init_param, 
                                               uint n_ranges, uint *bufsz,
-                                              uint *mrr_mode, COST_VECT *cost);
+                                              uint *mrr_mode,
+                                              Cost_estimate *cost);
   virtual ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
                                         uint key_parts, uint *bufsz, 
-                                        uint *mrr_mode, COST_VECT *cost);
+                                        uint *mrr_mode, Cost_estimate *cost);
   virtual int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
                                     uint n_ranges, uint mrr_mode, 
                                     HANDLER_BUFFER *buf);
diff --git a/sql/hash_filo.h b/sql/hash_filo.h
index dab54928a55..b6068348d1d 100644
--- a/sql/hash_filo.h
+++ b/sql/hash_filo.h
@@ -32,9 +32,15 @@
 
 class hash_filo_element
 {
+private:
   hash_filo_element *next_used,*prev_used;
  public:
   hash_filo_element() {}
+  hash_filo_element *next()
+  { return next_used; }
+  hash_filo_element *prev()
+  { return prev_used; }
+
   friend class hash_filo;
 };
 
diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc
index c5d1edbe475..48d0b32d51c 100644
--- a/sql/item_strfunc.cc
+++ b/sql/item_strfunc.cc
@@ -50,7 +50,7 @@
 #include "password.h"           // my_make_scrambled_password,
                                 // my_make_scrambled_password_323
 #include <m_ctype.h>
-#include "my_md5.h"
+#include <my_md5.h>
 #include "sha1.h"
 #include "my_aes.h"
 #include <zlib.h>
@@ -180,7 +180,8 @@ String *Item_func_md5::val_str_ascii(String *str)
     uchar digest[16];
 
     null_value=0;
-    MY_MD5_HASH(digest,(uchar *) sptr->ptr(), sptr->length());
+    compute_md5_hash((char *) digest, (const char *) sptr->ptr(),
+                     sptr->length());
     if (str->alloc(32))				// Ensure that memory is free
     {
       null_value=1;
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 5b040c1fce3..ac44ba7288b 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -56,7 +56,7 @@
 ha_rows 
 handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                      void *seq_init_param, uint n_ranges_arg,
-                                     uint *bufsz, uint *flags, COST_VECT *cost)
+                                     uint *bufsz, uint *flags, Cost_estimate *cost)
 {
   KEY_MULTI_RANGE range;
   range_seq_t seq_it;
@@ -106,7 +106,7 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
   {
     /* The following calculation is the same as in multi_range_read_info(): */
     *flags |= HA_MRR_USE_DEFAULT_IMPL;
-    cost->zero();
+    cost->reset();
     cost->avg_io_cost= 1; /* assume random seeks */
     if ((*flags & HA_MRR_INDEX_ONLY) && total_rows > 2)
       cost->io_count= keyread_time(keyno, n_ranges, (uint)total_rows);
@@ -154,7 +154,7 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 
 ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
                                        uint key_parts, uint *bufsz, 
-                                       uint *flags, COST_VECT *cost)
+                                       uint *flags, Cost_estimate *cost)
 {
   /* 
     Currently we expect this function to be called only in preparation of scan
@@ -165,7 +165,7 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
   *bufsz= 0; /* Default implementation doesn't need a buffer */
   *flags |= HA_MRR_USE_DEFAULT_IMPL;
 
-  cost->zero();
+  cost->reset();
   cost->avg_io_cost= 1; /* assume random seeks */
 
   /* Produce the same cost as non-MRR code does */
@@ -1402,7 +1402,7 @@ int DsMrr_impl::dsmrr_next(range_id_t *range_info)
 */
 ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, 
                                uint key_parts,
-                               uint *bufsz, uint *flags, COST_VECT *cost)
+                               uint *bufsz, uint *flags, Cost_estimate *cost)
 {  
   ha_rows __attribute__((unused)) res;
   uint def_flags= *flags;
@@ -1437,7 +1437,7 @@ ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows,
 
 ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                  void *seq_init_param, uint n_ranges, 
-                                 uint *bufsz, uint *flags, COST_VECT *cost)
+                                 uint *bufsz, uint *flags, Cost_estimate *cost)
 {
   ha_rows rows;
   uint def_flags= *flags;
@@ -1551,9 +1551,9 @@ bool DsMrr_impl::check_cpk_scan(THD *thd, uint keyno, uint mrr_flags)
 
 
 bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
-                                 uint *bufsz, COST_VECT *cost)
+                                 uint *bufsz, Cost_estimate *cost)
 {
-  COST_VECT dsmrr_cost;
+  Cost_estimate dsmrr_cost;
   bool res;
   THD *thd= current_thd;
 
@@ -1655,7 +1655,7 @@ int DsMrr_impl::dsmrr_explain_info(uint mrr_mode, char *str, size_t size)
 }
 
 
-static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost);
+static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, Cost_estimate *cost);
 
 
 /**
@@ -1673,7 +1673,7 @@ static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost
 */
 
 bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
-                                         uint *buffer_size, COST_VECT *cost)
+                                         uint *buffer_size, Cost_estimate *cost)
 {
   ulong max_buff_entries, elem_size;
   ha_rows rows_in_full_step;
@@ -1707,13 +1707,13 @@ bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
   }
   else
   {
-    cost->zero();
+    cost->reset();
     *buffer_size= max(*buffer_size, 
                       (size_t)(1.2*rows_in_last_step) * elem_size + 
                       primary_file->ref_length + table->key_info[keynr].key_length);
   }
   
-  COST_VECT last_step_cost;
+  Cost_estimate last_step_cost;
   get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
   cost->add(&last_step_cost);
  
@@ -1742,7 +1742,7 @@ bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
 */
 
 static 
-void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost)
+void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, Cost_estimate *cost)
 {
   if (nrows)
   {
@@ -1754,7 +1754,7 @@ void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost)
     cost->cpu_cost += cmp_op * log2(cmp_op);
   }
   else
-    cost->zero();
+    cost->reset();
 }
 
 
@@ -1802,11 +1802,11 @@ void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost)
 */
 
 void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, 
-                         COST_VECT *cost)
+                         Cost_estimate *cost)
 {
   DBUG_ENTER("get_sweep_read_cost");
 
-  cost->zero();
+  cost->reset();
   if (table->file->primary_key_is_clustered())
   {
     cost->io_count= table->file->read_time(table->s->primary_key,
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index dcba92aab16..387ae9791bc 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -562,11 +562,11 @@ public:
   int dsmrr_next(range_id_t *range_info);
 
   ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, 
-                     uint *bufsz, uint *flags, COST_VECT *cost);
+                     uint *bufsz, uint *flags, Cost_estimate *cost);
 
   ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, 
                             void *seq_init_param, uint n_ranges, uint *bufsz,
-                            uint *flags, COST_VECT *cost);
+                            uint *flags, Cost_estimate *cost);
 
   int dsmrr_explain_info(uint mrr_mode, char *str, size_t size);
 private:
@@ -624,9 +624,9 @@ private:
   Forward_lifo_buffer rowid_buffer;
   
   bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
-                       COST_VECT *cost);
+                       Cost_estimate *cost);
   bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
-                               uint *buffer_size, COST_VECT *cost);
+                               uint *buffer_size, Cost_estimate *cost);
   bool check_cpk_scan(THD *thd, uint keyno, uint mrr_flags);
 
   bool setup_buffer_sharing(uint key_size_in_keybuf, key_part_map key_tuple_map);
diff --git a/sql/mysqld.h b/sql/mysqld.h
index f32b92633b7..6b073eac59b 100644
--- a/sql/mysqld.h
+++ b/sql/mysqld.h
@@ -207,6 +207,7 @@ extern int bootstrap_error;
 extern I_List<THD> threads;
 extern char err_shared_dir[];
 extern TYPELIB thread_handling_typelib;
+extern ulong log_warnings;
 
 /*
   THR_MALLOC is a key which will be used to set/get MEM_ROOT** for a thread,
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index 0390ac1101e..a66a6755757 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -890,7 +890,7 @@ static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts);
 static ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
                                   SEL_ARG *tree, bool update_tbl_stats, 
                                   uint *mrr_flags, uint *bufsize,
-                                  COST_VECT *cost);
+                                  Cost_estimate *cost);
 
 QUICK_RANGE_SELECT *get_quick_select(PARAM *param,uint index,
                                      SEL_ARG *key_tree, uint mrr_flags, 
@@ -6691,7 +6691,7 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
     if (*key)
     {
       ha_rows found_records;
-      COST_VECT cost;
+      Cost_estimate cost;
       double found_read_time;
       uint mrr_flags, buf_size;
       INDEX_SCAN_INFO *index_scan;
@@ -9941,7 +9941,7 @@ void SEL_ARG::test_use_count(SEL_ARG *root)
 static
 ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
                            SEL_ARG *tree, bool update_tbl_stats, 
-                           uint *mrr_flags, uint *bufsize, COST_VECT *cost)
+                           uint *mrr_flags, uint *bufsize, Cost_estimate *cost)
 {
   SEL_ARG_RANGE_SEQ seq;
   RANGE_SEQ_IF seq_if = {NULL, sel_arg_range_seq_init, sel_arg_range_seq_next, 0, 0};
@@ -10430,7 +10430,7 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
   QUICK_RANGE *range;
   uint part;
   bool create_err= FALSE;
-  COST_VECT cost;
+  Cost_estimate cost;
 
   old_root= thd->mem_root;
   /* The following call may change thd->mem_root */
@@ -12103,7 +12103,7 @@ get_best_group_min_max(PARAM *param, SEL_TREE *tree, double read_time)
       cur_index_tree= get_index_range_tree(cur_index, tree, param,
                                            &cur_param_idx);
       /* Check if this range tree can be used for prefix retrieval. */
-      COST_VECT dummy_cost;
+      Cost_estimate dummy_cost;
       uint mrr_flags= HA_MRR_USE_DEFAULT_IMPL;
       uint mrr_bufsize=0;
       cur_quick_prefix_records= check_quick_select(param, cur_param_idx,
diff --git a/sql/opt_subselect.cc b/sql/opt_subselect.cc
index 8d1cbeba5f4..9b09bb57358 100644
--- a/sql/opt_subselect.cc
+++ b/sql/opt_subselect.cc
@@ -2198,7 +2198,7 @@ bool optimize_semijoin_nests(JOIN *join, table_map all_table_map)
           Set the cost to do a full scan of the temptable (will need this to 
           consider doing sjm-scan):
         */ 
-        sjm->scan_cost.zero();
+        sjm->scan_cost.reset();
         sjm->scan_cost.add_io(sjm->rows, lookup_cost);
 
         sjm->lookup_cost.convert_from_cost(lookup_cost);
@@ -2633,12 +2633,12 @@ bool Sj_materialization_picker::check_qep(JOIN *join,
     else
     {
       /* This is SJ-Materialization with lookups */
-      COST_VECT prefix_cost; 
+      Cost_estimate prefix_cost; 
       signed int first_tab= (int)idx - mat_info->tables;
       double prefix_rec_count;
       if (first_tab < (int)join->const_tables)
       {
-        prefix_cost.zero();
+        prefix_cost.reset();
         prefix_rec_count= 1.0;
       }
       else
diff --git a/sql/share/errmsg-utf8.txt b/sql/share/errmsg-utf8.txt
index a762a06cfc1..907161224d5 100644
--- a/sql/share/errmsg-utf8.txt
+++ b/sql/share/errmsg-utf8.txt
@@ -6508,6 +6508,225 @@ ER_BINLOG_UNSAFE_AUTOINC_NOT_FIRST
 #  End of 5.5 error messages.
 #
 
+ER_COL_COUNT_DOESNT_MATCH_CORRUPTED_V2
+  eng "Column count of %s.%s is wrong. Expected %d, found %d. The table is probably corrupted"
+  ger "Spaltenanzahl von %s.%s falsch. %d erwartet, aber %d gefunden. Tabelle ist wahrscheinlich beschädigt"
+
+ER_CANNOT_LOAD_FROM_TABLE_V2
+  eng "Cannot load from %s.%s. The table is probably corrupted"
+  ger "Kann %s.%s nicht einlesen. Tabelle ist wahrscheinlich beschädigt"
+
+ER_MASTER_DELAY_VALUE_OUT_OF_RANGE
+  eng "The requested value %u for the master delay exceeds the maximum %u"
+ER_ONLY_FD_AND_RBR_EVENTS_ALLOWED_IN_BINLOG_STATEMENT
+  eng "Only Format_description_log_event and row events are allowed in BINLOG statements (but %s was provided)"
+
+ER_PARTITION_EXCHANGE_DIFFERENT_OPTION
+  eng "Non matching attribute '%-.64s' between partition and table"
+  swe "Attributet '%-.64s' är olika mellan partition och tabell"
+ER_PARTITION_EXCHANGE_PART_TABLE
+  eng "Table to exchange with partition is partitioned: '%-.64s'"
+  swe "Tabellen att byta ut mot partition är partitionerad: '%-.64s'"
+ER_PARTITION_EXCHANGE_TEMP_TABLE
+  eng "Table to exchange with partition is temporary: '%-.64s'"
+  swe "Tabellen att byta ut mot partition är temporär: '%-.64s'"
+ER_PARTITION_INSTEAD_OF_SUBPARTITION
+  eng "Subpartitioned table, use subpartition instead of partition"
+  swe "Subpartitionerad tabell, använd subpartition istället för partition"
+ER_UNKNOWN_PARTITION
+  eng "Unknown partition '%-.64s' in table '%-.64s'"
+  swe "Okänd partition '%-.64s' i tabell '%-.64s'"
+ER_TABLES_DIFFERENT_METADATA
+  eng "Tables have different definitions"
+  swe "Tabellerna har olika definitioner"
+ER_ROW_DOES_NOT_MATCH_PARTITION
+  eng "Found a row that does not match the partition"
+  swe "Hittade en rad som inte passar i partitionen"
+ER_BINLOG_CACHE_SIZE_GREATER_THAN_MAX
+  eng "Option binlog_cache_size (%lu) is greater than max_binlog_cache_size (%lu); setting binlog_cache_size equal to max_binlog_cache_size."
+ER_WARN_INDEX_NOT_APPLICABLE
+  eng "Cannot use %-.64s access on index '%-.64s' due to type or collation conversion on field '%-.64s'"
+
+ER_PARTITION_EXCHANGE_FOREIGN_KEY
+  eng "Table to exchange with partition has foreign key references: '%-.64s'"
+  swe "Tabellen att byta ut mot partition har foreign key referenser: '%-.64s'"
+ER_NO_SUCH_KEY_VALUE
+  eng "Key value '%-.192s' was not found in table '%-.192s.%-.192s'"
+ER_RPL_INFO_DATA_TOO_LONG
+  eng "Data for column '%s' too long"
+ER_NETWORK_READ_EVENT_CHECKSUM_FAILURE
+  eng "Replication event checksum verification failed while reading from network."
+ER_BINLOG_READ_EVENT_CHECKSUM_FAILURE
+  eng "Replication event checksum verification failed while reading from a log file."
+
+ER_BINLOG_STMT_CACHE_SIZE_GREATER_THAN_MAX
+  eng "Option binlog_stmt_cache_size (%lu) is greater than max_binlog_stmt_cache_size (%lu); setting binlog_stmt_cache_size equal to max_binlog_stmt_cache_size."
+ER_CANT_UPDATE_TABLE_IN_CREATE_TABLE_SELECT
+  eng "Can't update table '%-.192s' while '%-.192s' is being created."
+
+ER_PARTITION_CLAUSE_ON_NONPARTITIONED
+  eng "PARTITION () clause on non partitioned table"
+  swe "PARTITION () klausul för en icke partitionerad tabell"
+ER_ROW_DOES_NOT_MATCH_GIVEN_PARTITION_SET
+  eng "Found a row not matching the given partition set"
+  swe "Hittade en rad som inte passar i någon given partition"
+ER_NO_SUCH_PARTITION 
+  cze "partion '%-.64s' neexistuje"
+  dan "partition '%-.64s' eksisterer ikke"
+  nla "partition '%-.64s' bestaat niet"
+  eng "partition '%-.64s' doesn't exist"
+  est "partition '%-.64s' ei eksisteeri"
+  fre "La partition '%-.64s' n'existe pas"
+  ger "Die partition '%-.64s' existiert nicht"
+  hun "A '%-.64s' partition nem letezik"
+  ita "La tabella particione '%-.64s' non esiste"
+  nor "Partition '%-.64s' doesn't exist"
+  norwegian-ny "Partition '%-.64s' doesn't exist"
+  pol "Partition '%-.64s' doesn't exist"
+  por "Particion '%-.64s' n�o existe"
+  rum "Partition '%-.64s' nu exista"
+  serbian "Partition '%-.64s' ne postoji"
+  slo "Partition '%-.64s' doesn't exist"
+  spa "Particion '%-.64s' no existe"
+  swe "Det finns ingen partition som heter '%-.64s'"
+
+ER_CHANGE_RPL_INFO_REPOSITORY_FAILURE
+  eng "Failure while changing the type of replication repository: %s."
+
+ER_WARNING_NOT_COMPLETE_ROLLBACK_WITH_CREATED_TEMP_TABLE
+  eng "The creation of some temporary tables could not be rolled back."
+ER_WARNING_NOT_COMPLETE_ROLLBACK_WITH_DROPPED_TEMP_TABLE
+  eng "Some temporary tables were dropped, but these operations could not be rolled back."
+
+ER_MTS_FEATURE_IS_NOT_SUPPORTED
+  eng "%s is not supported in multi-threaded slave mode. %s"
+ER_MTS_UPDATED_DBS_GREATER_MAX
+  eng "The number of modified databases exceeds the maximum %d; the database names will not be included in the replication event metadata."
+ER_MTS_CANT_PARALLEL
+  eng "Cannot execute the current event group in the parallel mode. Encountered event %s, relay-log name %s, position %s which prevents execution of this event group in parallel mode. Reason: %s."
+ER_MTS_INCONSISTENT_DATA
+  eng "%s"
+
+ER_FULLTEXT_NOT_SUPPORTED_WITH_PARTITIONING
+  eng "FULLTEXT index is not supported for partitioned tables."
+  swe "FULLTEXT index stöds ej för partitionerade tabeller."
+
+ER_DA_INVALID_CONDITION_NUMBER 35000
+  eng "Invalid condition number"
+  por "Número de condição inválido"
+
+ER_INSECURE_PLAIN_TEXT
+  eng "Sending passwords in plain text without SSL/TLS is extremely insecure."
+
+ER_INSECURE_CHANGE_MASTER
+  eng "Storing MySQL user name or password information in the master.info repository is not secure and is therefore not recommended. Please see the MySQL Manual for more about this issue and possible alternatives."
+
+ER_FOREIGN_DUPLICATE_KEY_WITH_CHILD_INFO 23000 S1009
+        eng "Foreign key constraint for table '%.192s', record '%-.192s' would lead to a duplicate entry in table '%.192s', key '%.192s'"
+        ger "Fremdschlüssel-Beschränkung für Tabelle '%.192s', Datensatz '%-.192s' würde zu einem doppelten Eintrag in Tabelle '%.192s', Schlüssel '%.192s' führen"
+        swe "FOREIGN KEY constraint för tabell '%.192s', posten '%-.192s' kan inte uppdatera barntabell '%.192s' på grund av nyckel '%.192s'"
+
+ER_FOREIGN_DUPLICATE_KEY_WITHOUT_CHILD_INFO 23000 S1009
+        eng "Foreign key constraint for table '%.192s', record '%-.192s' would lead to a duplicate entry in a child table"
+        ger "Fremdschlüssel-Beschränkung für Tabelle '%.192s', Datensatz '%-.192s' würde zu einem doppelten Eintrag in einer Kind-Tabelle führen"
+        swe "FOREIGN KEY constraint för tabell '%.192s', posten '%-.192s' kan inte uppdatera en barntabell på grund av UNIQUE-test"
+ER_SQLTHREAD_WITH_SECURE_SLAVE
+  eng "Setting authentication options is not possible when only the Slave SQL Thread is being started."
+
+ER_TABLE_HAS_NO_FT
+  eng "The table does not have FULLTEXT index to support this query"
+
+ER_INNODB_FT_LIMIT
+  eng "InnoDB presently supports one FULLTEXT index per table"
+
+ER_INNODB_NO_FT_TEMP_TABLE
+  eng "Cannot create FULLTEXT index on temporary InnoDB table"
+
+ER_VARIABLE_NOT_SETTABLE_IN_SF_OR_TRIGGER
+  eng "The system variable %.200s cannot be set in stored functions or triggers."
+
+ER_VARIABLE_NOT_SETTABLE_IN_TRANSACTION
+  eng "The system variable %.200s cannot be set when there is an ongoing transaction."
+
+ER_GTID_NEXT_IS_NOT_IN_GTID_NEXT_LIST
+  eng "The system variable @@SESSION.GTID_NEXT has the value %.200s, which is not listed in @@SESSION.GTID_NEXT_LIST."
+
+ER_CANT_CHANGE_GTID_NEXT_IN_TRANSACTION_WHEN_GTID_NEXT_LIST_IS_NULL
+  eng "When @@SESSION.GTID_NEXT_LIST == NULL, the system variable @@SESSION.GTID_NEXT cannot change inside a transaction."
+
+ER_SET_STATEMENT_CANNOT_INVOKE_FUNCTION
+  eng "The statement 'SET %.200s' cannot invoke a stored function."
+
+ER_GTID_NEXT_CANT_BE_AUTOMATIC_IF_GTID_NEXT_LIST_IS_NON_NULL
+  eng "The system variable @@SESSION.GTID_NEXT cannot be 'AUTOMATIC' when @@SESSION.GTID_NEXT_LIST is non-NULL."
+
+ER_SKIPPING_LOGGED_TRANSACTION
+  eng "Skipping transaction %.200s because it has already been executed and logged."
+
+ER_MALFORMED_GTID_SET_SPECIFICATION
+  eng "Malformed GTID set specification '%.200s'."
+
+ER_MALFORMED_GTID_SET_ENCODING
+  eng "Malformed GTID set encoding."
+
+ER_MALFORMED_GTID_SPECIFICATION
+  eng "Malformed GTID specification '%.200s'."
+
+ER_GNO_EXHAUSTED
+  eng "Impossible to generate Global Transaction Identifier: the integer component reached the maximal value. Restart the server with a new server_uuid."
+
+ER_BAD_SLAVE_AUTO_POSITION
+  eng "Parameters MASTER_LOG_FILE, MASTER_LOG_POS, RELAY_LOG_FILE and RELAY_LOG_POS cannot be set when MASTER_AUTO_POSITION is active."
+
+ER_AUTO_POSITION_REQUIRES_GTID_MODE_ON
+  eng "CHANGE MASTER TO AUTO_POSITION = 1 can only be executed when GTID_MODE = ON."
+
+ER_CANT_DO_IMPLICIT_COMMIT_IN_TRX_WHEN_GTID_NEXT_IS_SET
+  eng "Cannot execute statements with implicit commit inside a transaction when GTID_NEXT != AUTOMATIC or GTID_NEXT_LIST != NULL."
+
+ER_GTID_MODE_2_OR_3_REQUIRES_DISABLE_GTID_UNSAFE_STATEMENTS_ON
+  eng "GTID_MODE = ON or GTID_MODE = UPGRADE_STEP_2 requires DISABLE_GTID_UNSAFE_STATEMENTS = 1."
+
+ER_GTID_MODE_REQUIRES_BINLOG
+  eng "GTID_MODE = ON or UPGRADE_STEP_1 or UPGRADE_STEP_2 requires --log-bin and --log-slave-updates."
+
+ER_CANT_SET_GTID_NEXT_TO_GTID_WHEN_GTID_MODE_IS_OFF
+  eng "GTID_NEXT cannot be set to UUID:NUMBER when GTID_MODE = OFF."
+
+ER_CANT_SET_GTID_NEXT_TO_ANONYMOUS_WHEN_GTID_MODE_IS_ON
+  eng "GTID_NEXT cannot be set to ANONYMOUS when GTID_MODE = ON."
+
+ER_CANT_SET_GTID_NEXT_LIST_TO_NON_NULL_WHEN_GTID_MODE_IS_OFF
+  eng "GTID_NEXT_LIST cannot be set to a non-NULL value when GTID_MODE = OFF."
+
+ER_FOUND_GTID_EVENT_WHEN_GTID_MODE_IS_OFF
+  eng "Found a Gtid_log_event or Previous_gtids_log_event when GTID_MODE = OFF."
+
+ER_GTID_UNSAFE_NON_TRANSACTIONAL_TABLE
+  eng "Updates to non-transactional tables are forbidden when DISABLE_GTID_UNSAFE_STATEMENTS = 1."
+
+ER_GTID_UNSAFE_CREATE_SELECT
+  eng "CREATE TABLE ... SELECT is forbidden when DISABLE_GTID_UNSAFE_STATEMENTS = 1."
+
+ER_GTID_UNSAFE_CREATE_DROP_TEMPORARY_TABLE_IN_TRANSACTION
+  eng "When DISABLE_GTID_UNSAFE_STATEMENTS = 1, the statements CREATE TEMPORARY TABLE and DROP TEMPORARY TABLE can be executed in a non-transactional context only, and require that AUTOCOMMIT = 1."
+
+ER_GTID_MODE_CAN_ONLY_CHANGE_ONE_STEP_AT_A_TIME
+  eng "The value of GTID_MODE can only change one step at a time: OFF <-> UPGRADE_STEP_1 <-> UPGRADE_STEP_2 <-> ON. Also note that this value must be stepped up or down simultaneously on all servers; see the Manual for instructions." 
+
+ER_MASTER_HAS_PURGED_REQUIRED_GTIDS
+  eng "The slave is connecting using CHANGE MASTER TO MASTER_AUTO_POSITION = 1, but the master has purged binary logs containing GTIDs that the slave requires."
+
+ER_CANT_SET_GTID_NEXT_WHEN_OWNING_GTID
+  eng "GTID_NEXT cannot be changed by a client that owns a GTID. The client owns %s. Ownership is released on COMMIT or ROLLBACK."
+
+ER_UNKNOWN_EXPLAIN_FORMAT
+  eng "Unknown EXPLAIN format name: '%s'"
+  rus "Неизвестное имя формата команды EXPLAIN: '%s'"
+
+ER_CANT_EXECUTE_IN_READ_ONLY_TRANSACTION 25006
+  eng "Cannot execute statement in a READ ONLY transaction."
+
 #
 # MariaDB error messages section starts here
 #
@@ -6547,10 +6766,10 @@ ER_UNKNOWN_OPTION
   eng "Unknown option '%-.64s'"
 ER_BAD_OPTION_VALUE
   eng "Incorrect value '%-.64s' for option '%-.64s'"
-ER_NETWORK_READ_EVENT_CHECKSUM_FAILURE
-  eng "Replication event checksum verification failed while reading from network."
-ER_BINLOG_READ_EVENT_CHECKSUM_FAILURE
-  eng "Replication event checksum verification failed while reading from a log file."
+ER_NOT_USED_ERROR_MESSAGE
+  eng ""
+ER_NOT_USED_ERROR_MESSAGE2
+  eng ""
 ER_CANT_DO_ONLINE
         eng "Can't execute the given '%s' command as online"
 ER_DATA_OVERFLOW 22003
diff --git a/sql/sql_bootstrap.cc b/sql/sql_bootstrap.cc
new file mode 100644
index 00000000000..c5e88739df7
--- /dev/null
+++ b/sql/sql_bootstrap.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+
+#include <ctype.h>
+#include <string.h>
+#include "sql_bootstrap.h"
+
+int read_bootstrap_query(char *query, int *query_length,
+                         fgets_input_t input, fgets_fn_t fgets_fn)
+{
+  char line_buffer[MAX_BOOTSTRAP_LINE_SIZE];
+  const char *line;
+  int len;
+  int query_len= 0;
+
+  for ( ; ; )
+  {
+    line= (*fgets_fn)(line_buffer, sizeof(line_buffer), input);
+
+    if (line == NULL)
+      return (query_len ? READ_BOOTSTRAP_ERROR : READ_BOOTSTRAP_EOF);
+
+    len= strlen(line);
+
+    /*
+      Remove trailing whitespace characters.
+      This assumes:
+      - no multibyte encoded character can be found at the very end of a line,
+      - whitespace characters from the "C" locale only.
+     which is sufficient for the kind of queries found
+     in the bootstrap scripts.
+    */
+    while (len && (isspace(line[len - 1])))
+      len--;
+    /*
+      Cleanly end the string, so we don't have to test len > x
+      all the time before reading line[x], in the code below.
+    */
+    line_buffer[len]= '\0';
+
+    /* Skip blank lines */
+    if (len == 0)
+      continue;
+
+    /* Skip # comments */
+    if (line[0] == '#')
+      continue;
+    
+    /* Skip -- comments */
+    if ((line[0] == '-') && (line[1] == '-'))
+      continue;
+
+    /* Skip delimiter, ignored. */
+    if (strncmp(line, "delimiter", 9) == 0)
+      continue;
+
+    /* Append the current line to a multi line query. */
+
+    if (query_len + len + 1 >= MAX_BOOTSTRAP_QUERY_SIZE)
+      return READ_BOOTSTRAP_ERROR;
+
+    if (query_len != 0)
+    {
+      /*
+        Append a \n to the current line, if any,
+        to preserve the intended presentation.
+       */
+      query[query_len]= '\n';
+      query_len++;
+    }
+    memcpy(query + query_len, line, len);
+    query_len+= len;
+
+    if (line[len - 1] == ';')
+    {
+      /*
+        The last line is terminated by ';'.
+        Return the query found.
+      */
+      query[query_len]= '\0';
+      *query_length= query_len;
+      return 0;
+    }
+  }
+}
+
diff --git a/sql/sql_bootstrap.h b/sql/sql_bootstrap.h
new file mode 100644
index 00000000000..f80daf4a2f4
--- /dev/null
+++ b/sql/sql_bootstrap.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+
+#ifndef SQL_BOOTSTRAP_H
+#define SQL_BOOTSTRAP_H
+
+/**
+  The maximum size of a bootstrap query.
+  Increase this size if parsing a longer query during bootstrap is necessary.
+  The longest query in use depends on the documentation content,
+  see the file fill_help_tables.sql
+*/
+#define MAX_BOOTSTRAP_QUERY_SIZE 20000
+/**
+  The maximum size of a bootstrap query, expressed in a single line.
+  Do not increase this size, use the multiline syntax instead.
+*/
+#define MAX_BOOTSTRAP_LINE_SIZE 20000
+
+#define READ_BOOTSTRAP_EOF 1
+#define READ_BOOTSTRAP_ERROR 2
+
+typedef void *fgets_input_t;
+typedef char * (*fgets_fn_t)(char *, size_t, fgets_input_t);
+
+int read_bootstrap_query(char *query, int *query_length,
+                         fgets_input_t input, fgets_fn_t fgets_fn);
+
+#endif
+
+
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 5b53f806ddb..de5c0583213 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -3695,13 +3695,13 @@ public:
   /* 
     Cost to materialize - execute the sub-join and write rows into temp.table
   */
-  COST_VECT materialization_cost;
+  Cost_estimate materialization_cost;
 
   /* Cost to make one lookup in the temptable */
-  COST_VECT lookup_cost;
+  Cost_estimate lookup_cost;
   
   /* Cost of scanning the materialized table */
-  COST_VECT scan_cost;
+  Cost_estimate scan_cost;
 
   /* --- Execution structures ---------- */
   
diff --git a/sql/sql_error.h b/sql/sql_error.h
index 00ade934226..79633ae5df8 100644
--- a/sql/sql_error.h
+++ b/sql/sql_error.h
@@ -20,6 +20,8 @@
 #include "m_string.h" /* LEX_STRING */
 #include "sql_string.h"                        /* String */
 #include "mysql_com.h" /* MYSQL_ERRMSG_SIZE */
+#include "my_time.h"   /* MYSQL_TIME */
+#include "decimal.h"
 
 class THD;
 
@@ -319,6 +321,14 @@ private:
   MEM_ROOT *m_mem_root;
 };
 
+class Sql_condition : public MYSQL_ERROR
+{
+  /*
+    Wrapper class to allow one to use Sql_condition in handlers instead of
+    MYSQL_ERROR
+   */
+};
+
 ///////////////////////////////////////////////////////////////////////////
 
 /**
diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
index e77ae68a5de..17a6cf0f379 100644
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -95,6 +95,7 @@
 #include "probes_mysql.h"
 #include "set_var.h"
 #include "log_slow.h"
+#include "sql_bootstrap.h"
 
 #define FLAGSTR(V,F) ((V)&(F)?#F" ":"")
 
@@ -481,11 +482,16 @@ void execute_init_command(THD *thd, LEX_STRING *init_command,
 }
 
 
+static char *fgets_fn(char *buffer, size_t size, fgets_input_t input)
+{
+  MYSQL_FILE *in= static_cast<MYSQL_FILE*> (input);
+  return mysql_file_fgets(buffer, size, in);
+}
+
+
 static void handle_bootstrap_impl(THD *thd)
 {
   MYSQL_FILE *file= bootstrap_file;
-  char *buff, *res;
-
   DBUG_ENTER("handle_bootstrap");
 
 #ifndef EMBEDDED_LIBRARY
@@ -503,50 +509,30 @@ static void handle_bootstrap_impl(THD *thd)
   */
   thd->client_capabilities|= CLIENT_MULTI_RESULTS;
 
-  buff= (char*) thd->net.buff;
   thd->init_for_queries();
-  while (mysql_file_fgets(buff, thd->net.max_packet, file))
+
+  for ( ; ; )
   {
+    char buffer[MAX_BOOTSTRAP_QUERY_SIZE];
+    int rc, length;
     char *query;
-    /* strlen() can't be deleted because mysql_file_fgets() doesn't return length */
-    ulong length= (ulong) strlen(buff);
-    while (buff[length-1] != '\n' && !mysql_file_feof(file))
+
+    rc= read_bootstrap_query(buffer, &length, file, fgets_fn);
+
+    if (rc == READ_BOOTSTRAP_ERROR)
     {
-      /*
-        We got only a part of the current string. Will try to increase
-        net buffer then read the rest of the current string.
-      */
-      /* purecov: begin tested */
-      if (net_realloc(&(thd->net), 2 * thd->net.max_packet))
-      {
-        thd->protocol->end_statement();
-        bootstrap_error= 1;
-        break;
-      }
-      buff= (char*) thd->net.buff;
-      res= mysql_file_fgets(buff + length, thd->net.max_packet - length, file);
-      if (!res && !mysql_file_feof(file))
-      {
-        thd->protocol->end_statement();
-        bootstrap_error= 1;
-        break;
-      }
-      length+= (ulong) strlen(buff + length);
-      /* purecov: end */
+      thd->raise_error(ER_SYNTAX_ERROR);
+      thd->protocol->end_statement();
+      bootstrap_error= 1;
+      break;
     }
-    if (bootstrap_error)
-      break;                                    /* purecov: inspected */
 
-    while (length && (my_isspace(thd->charset(), buff[length-1]) ||
-                      buff[length-1] == ';'))
-      length--;
-    buff[length]=0;
+    if (rc == READ_BOOTSTRAP_EOF)
+      break;
 
-    /* Skip lines starting with delimiter */
-    if (strncmp(buff, STRING_WITH_LEN("delimiter")) == 0)
-      continue;
+    DBUG_ASSERT(rc == 0);
 
-    query= (char *) thd->memdup_w_gap(buff, length + 1,
+    query= (char *) thd->memdup_w_gap(buffer, length + 1,
                                       thd->db_length + 1 +
                                       QUERY_CACHE_DB_LENGTH_SIZE +
                                       QUERY_CACHE_FLAGS_SIZE);
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index 0c1fb07d761..7032fdae939 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -9588,7 +9588,7 @@ uint check_join_cache_usage(JOIN_TAB *tab,
                             uint table_index,
                             JOIN_TAB *prev_tab)
 {
-  COST_VECT cost;
+  Cost_estimate cost;
   uint flags= 0;
   ha_rows rows= 0;
   uint bufsz= 4096;
diff --git a/sql/sql_select.h b/sql/sql_select.h
index 0ed976ac36a..289914df5c5 100644
--- a/sql/sql_select.h
+++ b/sql/sql_select.h
@@ -763,7 +763,7 @@ typedef struct st_position :public Sql_alloc
   double read_time;
 
   /* Cumulative cost and record count for the join prefix */
-  COST_VECT prefix_cost;
+  Cost_estimate prefix_cost;
   double    prefix_record_count;
 
   /*
diff --git a/sql/table.cc b/sql/table.cc
index 2a11098caed..cf9b3906cbd 100644
--- a/sql/table.cc
+++ b/sql/table.cc
@@ -4018,7 +4018,8 @@ void TABLE::reset_item_list(List<Item> *item_list) const
 void  TABLE_LIST::calc_md5(char *buffer)
 {
   uchar digest[16];
-  MY_MD5_HASH(digest, (uchar *) select_stmt.str, select_stmt.length);
+  compute_md5_hash((char*) digest, select_stmt.str,
+                   select_stmt.length);
   sprintf((char *) buffer,
 	    "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x",
 	    digest[0], digest[1], digest[2], digest[3],
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index e4455630bc8..318b45e43ae 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -1,14 +1,14 @@
 # Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved.
-# 
+#
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; version 2 of the License.
-# 
+#
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
@@ -30,19 +30,29 @@ IF(UNIX)
       LINK_LIBRARIES(aio)
     ENDIF()
   ELSEIF(CMAKE_SYSTEM_NAME MATCHES "HP*")
-    ADD_DEFINITIONS("-DUNIV_HPUX -DUNIV_MUST_NOT_INLINE")
+    ADD_DEFINITIONS("-DUNIV_HPUX")
   ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "AIX")
-    ADD_DEFINITIONS("-DUNIV_AIX -DUNIX_MUST_NOT_INLINE")
+    ADD_DEFINITIONS("-DUNIV_AIX")
   ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
     ADD_DEFINITIONS("-DUNIV_SOLARIS")
-  ELSE()
-   ADD_DEFINITIONS("-DUNIV_MUST_NOT_INLINE")
   ENDIF()
 ENDIF()
 
-# Enable InnoDB's UNIV_DEBUG for debug builds
-SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DUNIV_DEBUG")
-SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG")
+IF(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+# After: WL#5825 Using C++ Standard Library with MySQL code
+#       we no longer use -fno-exceptions
+#	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
+ENDIF()
+
+# Enable InnoDB's UNIV_DEBUG and UNIV_SYNC_DEBUG in debug builds
+SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG -DUNIV_SYNC_DEBUG")
+
+# Add -Wconversion if compiling with GCC
+## As of Mar 15 2011 this flag causes 3573+ warnings. If you are reading this
+## please fix them and enable the following code:
+#IF(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+#SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion")
+#ENDIF()
 
 IF(NOT MSVC)
 # either define HAVE_IB_GCC_ATOMIC_BUILTINS or not
@@ -119,6 +129,8 @@ ENDIF()
 
 ENDIF(NOT MSVC)
 
+SET(LINKER_SCRIPT)
+
 # Solaris atomics
 IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
   CHECK_FUNCTION_EXISTS(atomic_cas_ulong  HAVE_ATOMIC_CAS_ULONG)
@@ -133,11 +145,15 @@ IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
      HAVE_ATOMIC_SWAP_UCHAR)
     SET(HAVE_IB_SOLARIS_ATOMICS 1)
   ENDIF()
-  
+
   IF(HAVE_IB_SOLARIS_ATOMICS)
     ADD_DEFINITIONS(-DHAVE_IB_SOLARIS_ATOMICS=1)
   ENDIF()
 
+  IF(CMAKE_COMPILER_IS_GNUCC AND NOT HAVE_VISIBILITY_HIDDEN)
+    SET(LINKER_SCRIPT "-Wl,-M${CMAKE_CURRENT_SOURCE_DIR}/plugin_exports")
+  ENDIF()
+
   IF(NOT CMAKE_CROSSCOMPILING)
   # either define HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS or not
   CHECK_C_SOURCE_COMPILES(
@@ -154,15 +170,15 @@ IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
         memset(&x3, 0x0, sizeof(x3));
 
         if (sizeof(pthread_t) == 4) {
-        
+
           atomic_cas_32(&x1, x2, x3);
-        
+
         } else if (sizeof(pthread_t) == 8) {
-        
+
           atomic_cas_64(&x1, x2, x3);
-        
+
         } else {
-        
+
           return(1);
         }
 
@@ -198,81 +214,148 @@ INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include
 		    ${CMAKE_SOURCE_DIR}/storage/innobase/handler)
 
 # Sun Studio bug with -xO2
-IF(CMAKE_C_COMPILER_ID MATCHES "SunPro" 
-	AND CMAKE_C_FLAGS_RELEASE MATCHES "O2" 
+IF(CMAKE_CXX_COMPILER_ID MATCHES "SunPro"
+	AND CMAKE_CXX_FLAGS_RELEASE MATCHES "O2"
 	AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
 	# Sun Studio 12 crashes with -xO2 flag, but not with higher optimization
 	# -xO3
-	SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.c 
+	SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.cc
     PROPERTIES COMPILE_FLAGS -xO3)
 ENDIF()
 
 # Removing compiler optimizations for innodb/mem/* files on 64-bit Windows
 # due to 64-bit compiler error, See MySQL Bug #19424, #36366, #34297
 IF (MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 8)
-	SET_SOURCE_FILES_PROPERTIES(mem/mem0mem.c mem/mem0pool.c
+	SET_SOURCE_FILES_PROPERTIES(mem/mem0mem.cc mem/mem0pool.cc
 				    PROPERTIES COMPILE_FLAGS -Od)
 ENDIF()
 
 IF(MSVC)
   # Avoid "unreferenced label" warning in generated file
   GET_FILENAME_COMPONENT(_SRC_DIR ${CMAKE_CURRENT_LIST_FILE} PATH)
-  SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/pars0grm.c
+  SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/pars0grm.cc
           PROPERTIES COMPILE_FLAGS "/wd4102")
-  SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/lexyy.c
+  SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/lexyy.cc
           PROPERTIES COMPILE_FLAGS "/wd4003")
 ENDIF()
-
-SET(INNOBASE_SOURCES	btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
-			buf/buf0buddy.c buf/buf0buf.c buf/buf0flu.c buf/buf0lru.c buf/buf0rea.c
-			data/data0data.c data/data0type.c
-			dict/dict0boot.c dict/dict0crea.c dict/dict0dict.c dict/dict0load.c dict/dict0mem.c
-			dyn/dyn0dyn.c
-			eval/eval0eval.c eval/eval0proc.c
-			fil/fil0fil.c
-			fsp/fsp0fsp.c
-			fut/fut0fut.c fut/fut0lst.c
-			ha/ha0ha.c ha/hash0hash.c ha/ha0storage.c
-			ibuf/ibuf0ibuf.c
-			pars/lexyy.c pars/pars0grm.c pars/pars0opt.c pars/pars0pars.c pars/pars0sym.c
-			lock/lock0lock.c lock/lock0iter.c
-			log/log0log.c log/log0recv.c
-			mach/mach0data.c
-			mem/mem0mem.c mem/mem0pool.c
-			mtr/mtr0log.c mtr/mtr0mtr.c
-			os/os0file.c os/os0proc.c os/os0sync.c os/os0thread.c
-			page/page0cur.c page/page0page.c page/page0zip.c
-			que/que0que.c
-			handler/ha_innodb.cc handler/handler0alter.cc handler/i_s.cc
-			read/read0read.c
-			rem/rem0cmp.c rem/rem0rec.c
-			row/row0ext.c row/row0ins.c row/row0merge.c row/row0mysql.c row/row0purge.c row/row0row.c
-			row/row0sel.c row/row0uins.c row/row0umod.c row/row0undo.c row/row0upd.c row/row0vers.c
-			srv/srv0srv.c srv/srv0start.c
-			sync/sync0arr.c sync/sync0rw.c sync/sync0sync.c
-			trx/trx0i_s.c trx/trx0purge.c trx/trx0rec.c trx/trx0roll.c trx/trx0rseg.c
-			trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c
-			usr/usr0sess.c
-			ut/ut0byte.c ut/ut0dbg.c ut/ut0list.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c
-			ut/ut0ut.c ut/ut0vec.c ut/ut0wqueue.c ut/ut0bh.c)
+ 
+SET(INNOBASE_SOURCES
+	btr/btr0btr.cc
+	btr/btr0cur.cc
+	btr/btr0pcur.cc
+	btr/btr0sea.cc
+	buf/buf0buddy.cc
+	buf/buf0buf.cc
+	buf/buf0dblwr.cc
+	buf/buf0checksum.cc
+	buf/buf0dump.cc
+	buf/buf0flu.cc
+	buf/buf0lru.cc
+	buf/buf0rea.cc
+	data/data0data.cc
+	data/data0type.cc
+	dict/dict0boot.cc
+	dict/dict0crea.cc
+	dict/dict0dict.cc
+	dict/dict0load.cc
+	dict/dict0mem.cc
+	dict/dict0stats.cc
+	dyn/dyn0dyn.cc
+	eval/eval0eval.cc
+	eval/eval0proc.cc
+	fil/fil0fil.cc
+	fsp/fsp0fsp.cc
+	fut/fut0fut.cc
+	fut/fut0lst.cc
+	ha/ha0ha.cc
+	ha/ha0storage.cc
+	ha/hash0hash.cc
+	fts/fts0fts.cc
+	fts/fts0ast.cc
+	fts/fts0blex.cc
+	fts/fts0config.cc
+	fts/fts0opt.cc
+	fts/fts0pars.cc
+	fts/fts0que.cc
+	fts/fts0sql.cc
+	fts/fts0tlex.cc
+	handler/ha_innodb.cc
+	handler/handler0alter.cc
+	handler/i_s.cc
+	ibuf/ibuf0ibuf.cc
+	lock/lock0iter.cc
+	lock/lock0lock.cc
+	lock/lock0wait.cc
+	log/log0log.cc
+	log/log0recv.cc
+	mach/mach0data.cc
+	mem/mem0mem.cc
+	mem/mem0pool.cc
+	mtr/mtr0log.cc
+	mtr/mtr0mtr.cc
+	os/os0file.cc
+	os/os0proc.cc
+	os/os0sync.cc
+	os/os0thread.cc
+	page/page0cur.cc
+	page/page0page.cc
+	page/page0zip.cc
+	pars/lexyy.cc
+	pars/pars0grm.cc
+	pars/pars0opt.cc
+	pars/pars0pars.cc
+	pars/pars0sym.cc
+	que/que0que.cc
+	read/read0read.cc
+	rem/rem0cmp.cc
+	rem/rem0rec.cc
+	row/row0ext.cc
+	row/row0ftsort.cc
+	row/row0ins.cc
+	row/row0merge.cc
+	row/row0mysql.cc
+	row/row0purge.cc
+	row/row0row.cc
+	row/row0sel.cc
+	row/row0uins.cc
+	row/row0umod.cc
+	row/row0undo.cc
+	row/row0upd.cc
+	row/row0vers.cc
+	srv/srv0conc.cc
+	srv/srv0mon.cc
+	srv/srv0srv.cc
+	srv/srv0start.cc
+	sync/sync0arr.cc
+	sync/sync0rw.cc
+	sync/sync0sync.cc
+	trx/trx0i_s.cc
+	trx/trx0purge.cc
+	trx/trx0rec.cc
+	trx/trx0roll.cc
+	trx/trx0rseg.cc
+	trx/trx0sys.cc
+	trx/trx0trx.cc
+	trx/trx0undo.cc
+	usr/usr0sess.cc
+	ut/ut0bh.cc
+	ut/ut0byte.cc
+	ut/ut0crc32.cc
+	ut/ut0dbg.cc
+	ut/ut0list.cc
+	ut/ut0mem.cc
+	ut/ut0rbt.cc
+	ut/ut0rnd.cc
+	ut/ut0ut.cc
+	ut/ut0vec.cc
+	ut/ut0wqueue.cc)
 
 IF(WITH_INNODB)
   # Legacy option
   SET(WITH_INNOBASE_STORAGE_ENGINE TRUE)
 ENDIF()
 
-
-# On solaris, reduce symbol visibility, so loader does not mix
-# the same symbols from builtin innodb and from shared one.
-# Only required for old GCC (3.4.3) that does not support hidden visibility
-IF(CMAKE_SYSTEM_NAME MATCHES "SunOS" AND CMAKE_COMPILER_IS_GNUCC 
-  AND NOT HAVE_VISIBILITY_HIDDEN)
-  SET(LINKER_SCRIPT "-Wl,-M${CMAKE_CURRENT_SOURCE_DIR}/plugin_exports")
-ELSE()
-  SET(LINKER_SCRIPT)
-ENDIF()
-
-MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE 
-  MODULE_ONLY
+MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE
+  DEFAULT
   MODULE_OUTPUT_NAME ha_innodb
-  LINK_LIBRARIES ${ZLIB_LIBRARY} ${LINKER_SCRIPT})
+  LINK_LIBRARIES ${ZLIB_LIBRARY})
diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.cc
index fc3cdaf3cf1..60babcb3b74 100644
--- a/storage/innobase/btr/btr0btr.c
+++ b/storage/innobase/btr/btr0btr.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file btr/btr0btr.c
+@file btr/btr0btr.cc
 The B-tree
 
 Created 6/2/1994 Heikki Tuuri
@@ -41,6 +41,7 @@ Created 6/2/1994 Heikki Tuuri
 #include "lock0lock.h"
 #include "ibuf0ibuf.h"
 #include "trx0trx.h"
+#include "srv0mon.h"
 
 /**************************************************************//**
 Report that an index page is corrupted. */
@@ -132,8 +133,8 @@ btr_blob_dbg_cmp(
 	const void*	a,	/*!< in: first btr_blob_dbg_t to compare */
 	const void*	b)	/*!< in: second btr_blob_dbg_t to compare */
 {
-	const btr_blob_dbg_t*	aa	= a;
-	const btr_blob_dbg_t*	bb	= b;
+	const btr_blob_dbg_t*	aa = static_cast<const btr_blob_dbg_t*>(a);
+	const btr_blob_dbg_t*	bb = static_cast<const btr_blob_dbg_t*>(b);
 
 	ut_ad(aa != NULL);
 	ut_ad(bb != NULL);
@@ -418,7 +419,7 @@ btr_blob_dbg_op(
 		rec = page_rec_get_next_const(rec);
 	} while (!page_rec_is_supremum(rec));
 
-	if (UNIV_LIKELY_NULL(heap)) {
+	if (heap) {
 		mem_heap_free(heap);
 	}
 
@@ -874,7 +875,7 @@ btr_page_create(
 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 	btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_create_zip(block, index, level, mtr);
 	} else {
 		page_create(block, mtr, dict_table_is_comp(index->table));
@@ -1123,6 +1124,15 @@ btr_page_free_low(
 	fseg_free_page(seg_header,
 		       buf_block_get_space(block),
 		       buf_block_get_page_no(block), mtr);
+
+	/* The page was marked free in the allocation bitmap, but it
+	should remain buffer-fixed until mtr_commit(mtr) or until it
+	is explicitly freed from the mini-transaction. */
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	/* TODO: Discard any operations on the page from the redo log
+	and remove the block from the flush list and the buffer pool.
+	This would free up buffer pool earlier and reduce writes to
+	both the tablespace and the redo log. */
 }
 
 /**************************************************************//**
@@ -1169,7 +1179,7 @@ btr_node_ptr_set_child_page_no(
 
 	ut_ad(len == REC_NODE_PTR_SIZE);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_write_node_ptr(page_zip, rec,
 					rec_offs_data_size(offsets),
 					page_no, mtr);
@@ -1249,8 +1259,7 @@ btr_page_get_father_node_ptr_func(
 	offsets = rec_get_offsets(node_ptr, index, offsets,
 				  ULINT_UNDEFINED, &heap);
 
-	if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets)
-			  != page_no)) {
+	if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
 		rec_t*	print_rec;
 		fputs("InnoDB: Dump of the child page:\n", stderr);
 		buf_page_print(page_align(user_rec), 0,
@@ -1437,7 +1446,7 @@ btr_create(
 	/* Create a new index page on the allocated segment page */
 	page_zip = buf_block_get_page_zip(block);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page = page_create_zip(block, index, 0, mtr);
 	} else {
 		page = page_create(block, mtr,
@@ -1556,7 +1565,9 @@ btr_free_root(
 	ut_a(btr_root_fseg_validate(header, space));
 #endif /* UNIV_BTR_DEBUG */
 
-	while (!fseg_free_step(header, mtr));
+	while (!fseg_free_step(header, mtr)) {
+		/* Free the entire segment in small steps. */
+	}
 }
 #endif /* !UNIV_HOTBACKUP */
 
@@ -1575,7 +1586,9 @@ btr_page_reorganize_low(
 	dict_index_t*	index,	/*!< in: record descriptor */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
+#ifndef UNIV_HOTBACKUP
 	buf_pool_t*	buf_pool	= buf_pool_from_bpage(&block->page);
+#endif /* !UNIV_HOTBACKUP */
 	page_t*		page		= buf_block_get_frame(block);
 	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
 	buf_block_t*	temp_block;
@@ -1617,7 +1630,7 @@ btr_page_reorganize_low(
 	buf_frame_copy(temp_page, page);
 
 #ifndef UNIV_HOTBACKUP
-	if (UNIV_LIKELY(!recovery)) {
+	if (!recovery) {
 		btr_search_drop_page_hash_index(block);
 	}
 
@@ -1647,9 +1660,7 @@ btr_page_reorganize_low(
 		ut_ad(max_trx_id != 0 || recovery);
 	}
 
-	if (UNIV_LIKELY_NULL(page_zip)
-	    && UNIV_UNLIKELY
-	    (!page_zip_compress(page_zip, page, index, NULL))) {
+	if (page_zip && !page_zip_compress(page_zip, page, index, NULL)) {
 
 		/* Restore the old page and exit. */
 		btr_blob_dbg_restore(page, temp_page, index,
@@ -1679,7 +1690,7 @@ btr_page_reorganize_low(
 	}
 
 #ifndef UNIV_HOTBACKUP
-	if (UNIV_LIKELY(!recovery)) {
+	if (!recovery) {
 		/* Update the record lock bitmaps */
 		lock_move_reorganize_page(block, temp_block);
 	}
@@ -1688,10 +1699,10 @@ btr_page_reorganize_low(
 	data_size2 = page_get_data_size(page);
 	max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);
 
-	if (UNIV_UNLIKELY(data_size1 != data_size2)
-	    || UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) {
+	if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) {
 		buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
 		buf_page_print(temp_page, 0, BUF_PAGE_PRINT_NO_CRASH);
+
 		fprintf(stderr,
 			"InnoDB: Error: page old data size %lu"
 			" new data size %lu\n"
@@ -1759,7 +1770,7 @@ btr_parse_page_reorganize(
 
 	/* The record is empty, except for the record initial part */
 
-	if (UNIV_LIKELY(block != NULL)) {
+	if (block != NULL) {
 		btr_page_reorganize_low(TRUE, block, index, mtr);
 	}
 
@@ -1793,7 +1804,7 @@ btr_page_empty(
 	/* Recreate the page: note that global data on page (possible
 	segment headers, next page-field, etc.) is preserved intact */
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_create_zip(block, index, level, mtr);
 	} else {
 		page_create(block, mtr, dict_table_is_comp(index->table));
@@ -1886,10 +1897,9 @@ btr_root_raise_and_insert(
 #ifdef UNIV_ZIP_COPY
 	    || new_page_zip
 #endif /* UNIV_ZIP_COPY */
-	    || UNIV_UNLIKELY
-	    (!page_copy_rec_list_end(new_block, root_block,
+	    || !page_copy_rec_list_end(new_block, root_block,
 				     page_get_infimum_rec(root),
-				     index, mtr))) {
+				     index, mtr)) {
 		ut_a(new_page_zip);
 
 		/* Copy the page byte for byte. */
@@ -2044,8 +2054,7 @@ btr_page_get_split_rec_to_right(
 	the previous insert on the same page, we assume that there is a
 	pattern of sequential inserts here. */
 
-	if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT)
-			== insert_point)) {
+	if (page_header_get_ptr(page, PAGE_LAST_INSERT) == insert_point) {
 
 		rec_t*	next_rec;
 
@@ -2113,13 +2122,13 @@ btr_page_get_split_rec(
 	free_space  = page_get_free_space_of_empty(page_is_comp(page));
 
 	page_zip = btr_cur_get_page_zip(cursor);
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		/* Estimate the free space of an empty compressed page. */
 		ulint	free_space_zip = page_zip_empty_size(
 			cursor->index->n_fields,
 			page_zip_get_size(page_zip));
 
-		if (UNIV_LIKELY(free_space > (ulint) free_space_zip)) {
+		if (free_space > (ulint) free_space_zip) {
 			free_space = (ulint) free_space_zip;
 		}
 	}
@@ -2192,7 +2201,7 @@ btr_page_get_split_rec(
 	}
 
 func_exit:
-	if (UNIV_LIKELY_NULL(heap)) {
+	if (heap) {
 		mem_heap_free(heap);
 	}
 	return(rec);
@@ -2337,7 +2346,7 @@ btr_attach_half_pages(
 /*==================*/
 	dict_index_t*	index,		/*!< in: the index tree */
 	buf_block_t*	block,		/*!< in/out: page to be split */
-	rec_t*		split_rec,	/*!< in: first record on upper
+	const rec_t*	split_rec,	/*!< in: first record on upper
 					half page */
 	buf_block_t*	new_block,	/*!< in/out: the new half page */
 	ulint		direction,	/*!< in: FSP_UP or FSP_DOWN */
@@ -2568,7 +2577,7 @@ func_start:
 		hint_page_no = page_no + 1;
 		split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
 
-		if (UNIV_UNLIKELY(split_rec == NULL)) {
+		if (split_rec == NULL) {
 			insert_left = btr_page_tuple_smaller(
 				cursor, tuple, offsets, n_uniq, &heap);
 		}
@@ -2620,15 +2629,14 @@ func_start:
 
 		insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0;
 
-		if (UNIV_UNLIKELY(!insert_left && new_page_zip
-				  && n_iterations > 0)) {
+		if (!insert_left && new_page_zip && n_iterations > 0) {
 			/* If a compressed page has already been split,
 			avoid further splits by inserting the record
 			to an empty page. */
 			split_rec = NULL;
 			goto insert_empty;
 		}
-	} else if (UNIV_UNLIKELY(insert_left)) {
+	} else if (insert_left) {
 		ut_a(n_iterations > 0);
 		first_rec = page_rec_get_next(page_get_infimum_rec(page));
 		move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
@@ -2636,8 +2644,8 @@ func_start:
 insert_empty:
 		ut_ad(!split_rec);
 		ut_ad(!insert_left);
-		buf = mem_alloc(rec_get_converted_size(cursor->index,
-						       tuple, n_ext));
+		buf = (byte*) mem_alloc(rec_get_converted_size(cursor->index,
+							       tuple, n_ext));
 
 		first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
 						      tuple, n_ext);
@@ -2683,9 +2691,8 @@ insert_empty:
 #ifdef UNIV_ZIP_COPY
 		    || page_zip
 #endif /* UNIV_ZIP_COPY */
-		    || UNIV_UNLIKELY
-		    (!page_move_rec_list_start(new_block, block, move_limit,
-					       cursor->index, mtr))) {
+		    || !page_move_rec_list_start(new_block, block, move_limit,
+					       cursor->index, mtr)) {
 			/* For some reason, compressing new_page failed,
 			even though it should contain fewer records than
 			the original page.  Copy the page byte for byte
@@ -2726,9 +2733,8 @@ insert_empty:
 #ifdef UNIV_ZIP_COPY
 		    || page_zip
 #endif /* UNIV_ZIP_COPY */
-		    || UNIV_UNLIKELY
-		    (!page_move_rec_list_end(new_block, block, move_limit,
-					     cursor->index, mtr))) {
+		    || !page_move_rec_list_end(new_block, block, move_limit,
+					     cursor->index, mtr)) {
 			/* For some reason, compressing new_page failed,
 			even though it should contain fewer records than
 			the original page.  Copy the page byte for byte
@@ -2764,7 +2770,7 @@ insert_empty:
 	}
 
 #ifdef UNIV_ZIP_DEBUG
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		ut_a(page_zip_validate(page_zip, page));
 		ut_a(page_zip_validate(new_page_zip, new_page));
 	}
@@ -2804,15 +2810,14 @@ insert_empty:
 	}
 #endif /* UNIV_ZIP_DEBUG */
 
-	if (UNIV_LIKELY(rec != NULL)) {
+	if (rec != NULL) {
 
 		goto func_exit;
 	}
 
 	/* 8. If insert did not fit, try page reorganization */
 
-	if (UNIV_UNLIKELY
-	    (!btr_page_reorganize(insert_block, cursor->index, mtr))) {
+	if (!btr_page_reorganize(insert_block, cursor->index, mtr)) {
 
 		goto insert_failed;
 	}
@@ -2822,7 +2827,7 @@ insert_empty:
 	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
 				    n_ext, mtr);
 
-	if (UNIV_UNLIKELY(rec == NULL)) {
+	if (rec == NULL) {
 		/* The insert did not fit on the page: loop back to the
 		start of the function for a new split */
 insert_failed:
@@ -2856,6 +2861,7 @@ func_exit:
 		buf_block_get_page_no(left_block),
 		buf_block_get_page_no(right_block));
 #endif
+	MONITOR_INC(MONITOR_INDEX_SPLIT);
 
 	ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
 	ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
@@ -3012,7 +3018,7 @@ btr_set_min_rec_mark(
 {
 	ulint	info_bits;
 
-	if (UNIV_LIKELY(page_rec_is_comp(rec))) {
+	if (page_rec_is_comp(rec)) {
 		info_bits = rec_get_info_bits(rec, TRUE);
 
 		rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
@@ -3131,10 +3137,9 @@ btr_lift_page_up(
 #ifdef UNIV_ZIP_COPY
 	    || father_page_zip
 #endif /* UNIV_ZIP_COPY */
-	    || UNIV_UNLIKELY
-	    (!page_copy_rec_list_end(father_block, block,
-				     page_get_infimum_rec(page),
-				     index, mtr))) {
+	    || !page_copy_rec_list_end(father_block, block,
+				       page_get_infimum_rec(page),
+				       index, mtr)) {
 		const page_zip_des_t*	page_zip
 			= buf_block_get_page_zip(block);
 		ut_a(father_page_zip);
@@ -3310,12 +3315,11 @@ err_exit:
 
 	max_ins_size = page_get_max_insert_size(merge_page, n_recs);
 
-	if (UNIV_UNLIKELY(data_size > max_ins_size)) {
+	if (data_size > max_ins_size) {
 
 		/* We have to reorganize merge_page */
 
-		if (UNIV_UNLIKELY(!btr_page_reorganize(merge_block,
-						       index, mtr))) {
+		if (!btr_page_reorganize(merge_block, index, mtr)) {
 
 			goto err_exit;
 		}
@@ -3325,7 +3329,7 @@ err_exit:
 		ut_ad(page_validate(merge_page, index));
 		ut_ad(max_ins_size == max_ins_size_reorg);
 
-		if (UNIV_UNLIKELY(data_size > max_ins_size)) {
+		if (data_size > max_ins_size) {
 
 			/* Add fault tolerance, though this should
 			never happen */
@@ -3336,7 +3340,7 @@ err_exit:
 
 	merge_page_zip = buf_block_get_page_zip(merge_block);
 #ifdef UNIV_ZIP_DEBUG
-	if (UNIV_LIKELY_NULL(merge_page_zip)) {
+	if (merge_page_zip) {
 		const page_zip_des_t*	page_zip
 			= buf_block_get_page_zip(block);
 		ut_a(page_zip);
@@ -3351,7 +3355,7 @@ err_exit:
 			merge_block, block, page_get_supremum_rec(page),
 			index, mtr);
 
-		if (UNIV_UNLIKELY(!orig_pred)) {
+		if (!orig_pred) {
 			goto err_exit;
 		}
 
@@ -3372,7 +3376,7 @@ err_exit:
 		byte		fil_page_prev[4];
 #endif /* UNIV_BTR_DEBUG */
 
-		if (UNIV_LIKELY_NULL(merge_page_zip)) {
+		if (merge_page_zip) {
 			/* The function page_zip_compress(), which will be
 			invoked by page_copy_rec_list_end() below,
 			requires that FIL_PAGE_PREV be FIL_NULL.
@@ -3390,7 +3394,7 @@ err_exit:
 						   page_get_infimum_rec(page),
 						   cursor->index, mtr);
 
-		if (UNIV_UNLIKELY(!orig_succ)) {
+		if (!orig_succ) {
 			ut_a(merge_page_zip);
 #ifdef UNIV_BTR_DEBUG
 			/* FIL_PAGE_PREV was restored from merge_page_zip. */
@@ -3403,7 +3407,7 @@ err_exit:
 		btr_search_drop_page_hash_index(block);
 
 #ifdef UNIV_BTR_DEBUG
-		if (UNIV_LIKELY_NULL(merge_page_zip)) {
+		if (merge_page_zip) {
 			/* Restore FIL_PAGE_PREV in order to avoid an assertion
 			failure in btr_level_list_remove(), which will set
 			the field again to FIL_NULL.  Even though this makes
@@ -3706,7 +3710,7 @@ btr_print_size(
 	fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
 	fseg_print(seg, &mtr);
 
-	if (!(index->type & DICT_UNIVERSAL)) {
+	if (!dict_index_is_univ(index)) {
 
 		seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
 
@@ -3804,7 +3808,7 @@ btr_print_index(
 	root = btr_root_block_get(index, &mtr);
 
 	btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
-	if (UNIV_LIKELY_NULL(heap)) {
+	if (heap) {
 		mem_heap_free(heap);
 	}
 
@@ -3900,7 +3904,7 @@ btr_index_rec_validate(
 
 	page = page_align(rec);
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		/* The insert buffer index tree can contain records from any
 		other index: we cannot check the number of fields or
 		their length */
@@ -3908,8 +3912,7 @@ btr_index_rec_validate(
 		return(TRUE);
 	}
 
-	if (UNIV_UNLIKELY((ibool)!!page_is_comp(page)
-			  != dict_table_is_comp(index->table))) {
+	if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) {
 		btr_index_rec_validate_report(page, rec, index);
 		fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
 			(ulong) !!page_is_comp(page),
@@ -3920,8 +3923,7 @@ btr_index_rec_validate(
 
 	n = dict_index_get_n_fields(index);
 
-	if (!page_is_comp(page)
-	    && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) {
+	if (!page_is_comp(page) && rec_get_n_fields_old(rec) != n) {
 		btr_index_rec_validate_report(page, rec, index);
 		fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
 			(ulong) rec_get_n_fields_old(rec), (ulong) n);
@@ -3972,14 +3974,14 @@ btr_index_rec_validate(
 				rec_print_new(stderr, rec, offsets);
 				putc('\n', stderr);
 			}
-			if (UNIV_LIKELY_NULL(heap)) {
+			if (heap) {
 				mem_heap_free(heap);
 			}
 			return(FALSE);
 		}
 	}
 
-	if (UNIV_LIKELY_NULL(heap)) {
+	if (heap) {
 		mem_heap_free(heap);
 	}
 	return(TRUE);
@@ -4171,8 +4173,9 @@ loop:
 		right_block = btr_block_get(space, zip_size, right_page_no,
 					    RW_X_LATCH, index, &mtr);
 		right_page = buf_block_get_frame(right_block);
-		if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
-				  != page_get_page_no(page))) {
+		if (btr_page_get_prev(right_page, &mtr)
+		    != page_get_page_no(page)) {
+
 			btr_validate_report2(index, level, block, right_block);
 			fputs("InnoDB: broken FIL_PAGE_NEXT"
 			      " or FIL_PAGE_PREV links\n", stderr);
@@ -4182,8 +4185,7 @@ loop:
 			ret = FALSE;
 		}
 
-		if (UNIV_UNLIKELY(page_is_comp(right_page)
-				  != page_is_comp(page))) {
+		if (page_is_comp(right_page) != page_is_comp(page)) {
 			btr_validate_report2(index, level, block, right_block);
 			fputs("InnoDB: 'compact' flag mismatch\n", stderr);
 			buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
@@ -4201,9 +4203,8 @@ loop:
 					  offsets, ULINT_UNDEFINED, &heap);
 		offsets2 = rec_get_offsets(right_rec, index,
 					   offsets2, ULINT_UNDEFINED, &heap);
-		if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec,
-					      offsets, offsets2,
-					      index) >= 0)) {
+		if (cmp_rec_rec(rec, right_rec, offsets, offsets2,
+			        index) >= 0) {
 
 			btr_validate_report2(index, level, block, right_block);
 
@@ -4250,10 +4251,9 @@ loop:
 		offsets = btr_page_get_father_node_ptr(offsets, heap,
 						       &node_cur, &mtr);
 
-		if (UNIV_UNLIKELY(node_ptr != btr_cur_get_rec(&node_cur))
-		    || UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr,
-								    offsets)
-				     != buf_block_get_page_no(block))) {
+		if (node_ptr != btr_cur_get_rec(&node_cur)
+		    || btr_node_ptr_get_child_page_no(node_ptr, offsets)
+				     != buf_block_get_page_no(block)) {
 
 			btr_validate_report1(index, level, block);
 
@@ -4444,6 +4444,12 @@ btr_validate_index(
 	ulint	i;
 	ulint	n;
 
+	/* Full Text index are implemented by auxiliary tables,
+	not the B-tree */
+	if (index->type & DICT_FTS) {
+		return(TRUE);
+	}
+
 	mtr_start(&mtr);
 	mtr_x_lock(dict_index_get_lock(index), &mtr);
 
diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.cc
index 9c61e0cf763..5d1a19c3d30 100644
--- a/storage/innobase/btr/btr0cur.c
+++ b/storage/innobase/btr/btr0cur.cc
@@ -24,7 +24,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file btr/btr0cur.c
+@file btr/btr0cur.cc
 The index tree cursor
 
 All changes that row operations make to a B-tree or the records
@@ -129,7 +129,12 @@ can be released by page reorganize, then it is reorganized */
 /** A BLOB field reference full of zero, for use in assertions and tests.
 Initially, BLOB field references are set to zero, in
 dtuple_convert_big_rec(). */
-UNIV_INTERN const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE];
+const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE] = {
+	0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0,
+};
 
 #ifndef UNIV_HOTBACKUP
 /*******************************************************************//**
@@ -408,6 +413,7 @@ btr_cur_search_to_nth_level(
 	ut_ad(dict_index_check_search_tuple(index, tuple));
 	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
 	ut_ad(dtuple_check_typed(tuple));
+	ut_ad(!(index->type & DICT_FTS));
 	ut_ad(index->page != FIL_NULL);
 
 	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
@@ -930,7 +936,7 @@ btr_cur_open_at_index_side_func(
 		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
 	}
 
-	if (UNIV_LIKELY_NULL(heap)) {
+	if (heap) {
 		mem_heap_free(heap);
 	}
 }
@@ -1101,29 +1107,27 @@ btr_cur_ins_lock_and_undo(
 					     btr_cur_get_block(cursor),
 					     index, thr, mtr, inherit);
 
-	if (err != DB_SUCCESS) {
+	if (err != DB_SUCCESS
+	    || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
 
 		return(err);
 	}
 
-	if (dict_index_is_clust(index) && !dict_index_is_ibuf(index)) {
-
-		err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
-						    thr, index, entry,
-						    NULL, 0, NULL,
-						    &roll_ptr);
-		if (err != DB_SUCCESS) {
+	err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
+					    thr, index, entry,
+					    NULL, 0, NULL,
+					    &roll_ptr);
+	if (err != DB_SUCCESS) {
 
-			return(err);
-		}
+		return(err);
+	}
 
-		/* Now we can fill in the roll ptr field in entry */
+	/* Now we can fill in the roll ptr field in entry */
 
-		if (!(flags & BTR_KEEP_SYS_FLAG)) {
+	if (!(flags & BTR_KEEP_SYS_FLAG)) {
 
-			row_upd_index_entry_sys_field(entry, index,
-						      DATA_ROLL_PTR, roll_ptr);
-		}
+		row_upd_index_entry_sys_field(entry, index,
+					      DATA_ROLL_PTR, roll_ptr);
 	}
 
 	return(DB_SUCCESS);
@@ -1140,8 +1144,7 @@ btr_cur_trx_report(
 	const dict_index_t*	index,	/*!< in: index */
 	const char*		op)	/*!< in: operation */
 {
-	fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ",
-		(ullint) trx->id);
+	fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ", trx->id);
 	fputs(op, stderr);
 	dict_index_name_print(stderr, trx, index);
 	putc('\n', stderr);
@@ -1238,7 +1241,7 @@ btr_cur_optimistic_insert(
 		rec_size = rec_get_converted_size(index, entry, n_ext);
 	}
 
-	if (UNIV_UNLIKELY(zip_size)) {
+	if (zip_size) {
 		/* Estimate the free space of an empty compressed page.
 		Subtract one byte for the encoded heap_no in the
 		modification log. */
@@ -1346,7 +1349,7 @@ fail_err:
 					     n_ext, mtr);
 
 		if (UNIV_UNLIKELY(!*rec)) {
-			if (UNIV_LIKELY(zip_size != 0)) {
+			if (zip_size != 0) {
 
 				goto fail;
 			}
@@ -1498,7 +1501,7 @@ btr_cur_pessimistic_insert(
 
 	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
 				   dict_table_is_comp(index->table),
-				   dict_index_get_n_fields(index),
+				   dtuple_get_n_fields(entry),
 				   zip_size)) {
 		/* The record is so big that we have to store some fields
 		externally on separate database pages */
@@ -1923,8 +1926,8 @@ btr_cur_update_in_place(
 				    trx, roll_ptr, mtr);
 
 	if (was_delete_marked
-	    && !rec_get_deleted_flag(rec, page_is_comp(
-					     buf_block_get_frame(block)))) {
+	    && !rec_get_deleted_flag(
+		    rec, page_is_comp(buf_block_get_frame(block)))) {
 		/* The new updated record owns its possible externally
 		stored fields */
 
@@ -1975,7 +1978,6 @@ btr_cur_optimistic_update(
 	ulint		old_rec_size;
 	dtuple_t*	new_entry;
 	roll_ptr_t	roll_ptr;
-	trx_t*		trx;
 	mem_heap_t*	heap;
 	ulint		i;
 	ulint		n_ext;
@@ -1992,9 +1994,10 @@ btr_cur_optimistic_update(
 
 	heap = mem_heap_create(1024);
 	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
-#ifdef UNIV_BLOB_NULL_DEBUG
-	ut_a(!rec_offs_any_null_extern(rec, offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+	ut_a(!rec_offs_any_null_extern(rec, offsets)
+	     || trx_is_recv(thr_get_trx(thr)));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 #ifdef UNIV_DEBUG
 	if (btr_cur_print_record_ops && thr) {
@@ -2117,13 +2120,11 @@ any_extern:
 
 	page_cur_move_to_prev(page_cursor);
 
-	trx = thr_get_trx(thr);
-
 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
 		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
 					      roll_ptr);
 		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
-					      trx->id);
+					      thr_get_trx(thr)->id);
 	}
 
 	/* There are no externally stored columns in new_entry */
@@ -3463,8 +3464,6 @@ btr_estimate_n_rows_in_range(
 				n_rows = n_rows * 2;
 			}
 
-			DBUG_EXECUTE_IF("bug14007649", return(n_rows););
-
 			/* Do not estimate the number of rows in the range
 			to over 1 / 2 of the estimated rows in the whole
 			table */
@@ -3574,7 +3573,8 @@ btr_record_not_null_field_in_rec(
 /*******************************************************************//**
 Estimates the number of different key values in a given index, for
 each n-column prefix of the index where n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals.
+The estimates are stored in the array index->stat_n_diff_key_vals[] and
+the number of pages that were sampled is saved in index->stat_n_sample_sizes[].
 If innodb_stats_method is "nulls_ignored", we also record the number of
 non-null values for each prefix and store the estimates in
 array index->stat_n_non_null_key_vals. */
@@ -3612,7 +3612,8 @@ btr_estimate_number_of_different_key_vals(
 			       * (sizeof *offsets_rec
 				  + sizeof *offsets_next_rec));
 
-	n_diff = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));
+	n_diff = (ib_int64_t*) mem_heap_zalloc(heap, (n_cols + 1)
+					       * sizeof(ib_int64_t));
 
 	n_not_null = NULL;
 
@@ -3621,7 +3622,7 @@ btr_estimate_number_of_different_key_vals(
 	considered equal (by setting stats_null_not_equal value) */
 	switch (srv_innodb_stats_method) {
 	case SRV_STATS_NULLS_IGNORED:
-		n_not_null = mem_heap_zalloc(heap, (n_cols + 1)
+		n_not_null = (ib_int64_t*) mem_heap_zalloc(heap, (n_cols + 1)
 					     * sizeof *n_not_null);
 		/* fall through */
 
@@ -3641,14 +3642,14 @@ btr_estimate_number_of_different_key_vals(
 
 	/* It makes no sense to test more pages than are contained
 	in the index, thus we lower the number if it is too high */
-	if (srv_stats_sample_pages > index->stat_index_size) {
+	if (srv_stats_transient_sample_pages > index->stat_index_size) {
 		if (index->stat_index_size > 0) {
 			n_sample_pages = index->stat_index_size;
 		} else {
 			n_sample_pages = 1;
 		}
 	} else {
-		n_sample_pages = srv_stats_sample_pages;
+		n_sample_pages = srv_stats_transient_sample_pages;
 	}
 
 	/* We sample some pages in the index to get an estimate */
@@ -3762,7 +3763,7 @@ btr_estimate_number_of_different_key_vals(
 		index->stat_n_diff_key_vals[j]
 			= BTR_TABLE_STATS_FROM_SAMPLE(
 				n_diff[j], index, n_sample_pages,
-				total_external_size, not_empty_flag); 
+				total_external_size, not_empty_flag);
 
 		/* If the tree is small, smaller than
 		10 * n_sample_pages + total_external_size, then
@@ -3782,6 +3783,8 @@ btr_estimate_number_of_different_key_vals(
 
 		index->stat_n_diff_key_vals[j] += add_on;
 
+		index->stat_n_sample_sizes[j] = n_sample_pages;
+
 		/* Update the stat_n_non_null_key_vals[] with our
 		sampled result. stat_n_non_null_key_vals[] is created
 		and initialized to zero in dict_index_add_to_cache(),
@@ -4040,10 +4043,11 @@ btr_push_update_extern_fields(
 				will have to be copied. */
 				ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
 
-				data = dfield_get_data(field);
+				data = (byte*) dfield_get_data(field);
 				len = dfield_get_len(field);
 
-				buf = mem_heap_alloc(heap, uf->orig_len);
+				buf = (byte*) mem_heap_alloc(heap,
+							     uf->orig_len);
 				/* Copy the locally stored prefix. */
 				memcpy(buf, data,
 				       uf->orig_len
@@ -4107,7 +4111,6 @@ btr_blob_free(
 	mtr_commit(mtr);
 
 	buf_pool_mutex_enter(buf_pool);
-	mutex_enter(&block->mutex);
 
 	/* Only free the block if it is still allocated to
 	the same file page. */
@@ -4127,7 +4130,6 @@ btr_blob_free(
 	}
 
 	buf_pool_mutex_exit(buf_pool);
-	mutex_exit(&block->mutex);
 }
 
 /*******************************************************************//**
@@ -4217,10 +4219,11 @@ btr_store_big_rec_extern_fields(
 					* sizeof *freed_pages);
 			}
 
-			freed_pages = mem_heap_alloc(
-				heap,
-				btr_mtr->n_freed_pages
-				* sizeof *freed_pages);
+			freed_pages = static_cast<buf_block_t**>(
+				mem_heap_alloc(
+					heap,
+					btr_mtr->n_freed_pages
+					* sizeof *freed_pages));
 			n_freed_pages = 0;
 		}
 
@@ -4286,7 +4289,8 @@ btr_store_big_rec_extern_fields(
 			int	err = deflateReset(&c_stream);
 			ut_a(err == Z_OK);
 
-			c_stream.next_in = (void*) big_rec_vec->fields[i].data;
+			c_stream.next_in = (Bytef*)
+				big_rec_vec->fields[i].data;
 			c_stream.avail_in = extern_len;
 		}
 
@@ -4636,8 +4640,7 @@ btr_check_blob_fil_page_type(
 		ulint	flags = fil_space_get_flags(space_id);
 
 #ifndef UNIV_DEBUG /* Improve debug test coverage */
-		if (UNIV_LIKELY
-		    ((flags & DICT_TF_FORMAT_MASK) == DICT_TF_FORMAT_51)) {
+		if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
 			/* Old versions of InnoDB did not initialize
 			FIL_PAGE_TYPE on BLOB pages.  Do not print
 			anything about the type mismatch when reading
@@ -4816,7 +4819,7 @@ btr_free_externally_stored_field(
 
 			btr_page_free_low(index, ext_block, 0, &mtr);
 
-			if (page_zip) {
+			if (page_zip != NULL) {
 				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
 						next_page_no);
 				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
@@ -5037,8 +5040,8 @@ btr_copy_zblob_prefix(
 	page_zip_set_alloc(&d_stream, heap);
 
 	ut_ad(ut_is_2pow(zip_size));
-	ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(zip_size >= UNIV_ZIP_SIZE_MIN);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 	ut_ad(space_id);
 
 	err = inflateInit(&d_stream);
@@ -5179,7 +5182,7 @@ btr_copy_externally_stored_field_prefix_low(
 		return(0);
 	}
 
-	if (UNIV_UNLIKELY(zip_size)) {
+	if (zip_size) {
 		return(btr_copy_zblob_prefix(buf, len, zip_size,
 					     space_id, page_no, offset));
 	} else {
@@ -5251,7 +5254,7 @@ btr_copy_externally_stored_field_prefix(
 Copies an externally stored field of a record to mem heap.  The
 clustered index record must be protected by a lock or a page latch.
 @return	the whole field copied to heap */
-static
+UNIV_INTERN
 byte*
 btr_copy_externally_stored_field(
 /*=============================*/
@@ -5286,7 +5289,7 @@ btr_copy_externally_stored_field(
 
 	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
 
-	buf = mem_heap_alloc(heap, local_len + extern_len);
+	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
 
 	memcpy(buf, data, local_len);
 	*len = local_len
diff --git a/storage/innobase/btr/btr0pcur.c b/storage/innobase/btr/btr0pcur.cc
index 7ac96c1a1b1..5a67afc7e69 100644
--- a/storage/innobase/btr/btr0pcur.c
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file btr/btr0pcur.c
+@file btr/btr0pcur.cc
 The index tree persistent cursor
 
 Created 2/23/1996 Heikki Tuuri
@@ -43,7 +43,7 @@ btr_pcur_create_for_mysql(void)
 {
 	btr_pcur_t*	pcur;
 
-	pcur = mem_alloc(sizeof(btr_pcur_t));
+	pcur = (btr_pcur_t*) mem_alloc(sizeof(btr_pcur_t));
 
 	pcur->btr_cur.index = NULL;
 	btr_pcur_init(pcur);
@@ -133,8 +133,6 @@ btr_pcur_store_position(
 
 		ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
 		ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
-		ut_ad(page_is_leaf(page));
-		ut_ad(page_get_page_no(page) == index->page);
 
 		cursor->old_stored = BTR_PCUR_OLD_STORED;
 
@@ -191,7 +189,8 @@ btr_pcur_copy_stored_position(
 
 	if (pcur_donate->old_rec_buf) {
 
-		pcur_receive->old_rec_buf = mem_alloc(pcur_donate->buf_size);
+		pcur_receive->old_rec_buf = (byte*)
+			mem_alloc(pcur_donate->buf_size);
 
 		ut_memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf,
 			  pcur_donate->buf_size);
@@ -327,19 +326,13 @@ btr_pcur_restore_position_func(
 	/* Save the old search mode of the cursor */
 	old_mode = cursor->search_mode;
 
-	switch (cursor->rel_pos) {
-	case BTR_PCUR_ON:
+	if (UNIV_LIKELY(cursor->rel_pos == BTR_PCUR_ON)) {
 		mode = PAGE_CUR_LE;
-		break;
-	case BTR_PCUR_AFTER:
+	} else if (cursor->rel_pos == BTR_PCUR_AFTER) {
 		mode = PAGE_CUR_G;
-		break;
-	case BTR_PCUR_BEFORE:
+	} else {
+		ut_ad(cursor->rel_pos == BTR_PCUR_BEFORE);
 		mode = PAGE_CUR_L;
-		break;
-	default:
-		ut_error;
-		mode = 0;
 	}
 
 	btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode,
@@ -348,44 +341,25 @@ btr_pcur_restore_position_func(
 	/* Restore the old search mode */
 	cursor->search_mode = old_mode;
 
-	if (btr_pcur_is_on_user_rec(cursor)) {
-		switch (cursor->rel_pos) {
-		case BTR_PCUR_ON:
-			if (!cmp_dtuple_rec(
-				    tuple, btr_pcur_get_rec(cursor),
-				    rec_get_offsets(btr_pcur_get_rec(cursor),
-						    index, NULL,
-						    ULINT_UNDEFINED, &heap))) {
-
-				/* We have to store the NEW value for
-				the modify clock, since the cursor can
-				now be on a different page! But we can
-				retain the value of old_rec */
-
-				cursor->block_when_stored =
-					btr_pcur_get_block(cursor);
-				cursor->modify_clock =
-					buf_block_get_modify_clock(
-						cursor->block_when_stored);
-				cursor->old_stored = BTR_PCUR_OLD_STORED;
+	if (cursor->rel_pos == BTR_PCUR_ON
+	    && btr_pcur_is_on_user_rec(cursor)
+	    && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor),
+				   rec_get_offsets(
+					   btr_pcur_get_rec(cursor), index,
+					   NULL, ULINT_UNDEFINED, &heap))) {
 
-				mem_heap_free(heap);
+		/* We have to store the NEW value for the modify clock, since
+		the cursor can now be on a different page! But we can retain
+		the value of old_rec */
 
-				return(TRUE);
-			}
+		cursor->block_when_stored = btr_pcur_get_block(cursor);
+		cursor->modify_clock = buf_block_get_modify_clock(
+			cursor->block_when_stored);
+		cursor->old_stored = BTR_PCUR_OLD_STORED;
 
-			break;
-		case BTR_PCUR_BEFORE:
-			page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
-			break;
-		case BTR_PCUR_AFTER:
-			page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
-			break;
-#ifdef UNIV_DEBUG
-		default:
-			ut_error;
-#endif /* UNIV_DEBUG */
-		}
+		mem_heap_free(heap);
+
+		return(TRUE);
 	}
 
 	mem_heap_free(heap);
@@ -587,8 +561,8 @@ btr_pcur_open_on_user_rec_func(
 	ulint		line,		/*!< in: line where called */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	btr_pcur_open_func(index, tuple, mode, latch_mode, cursor,
-			   file, line, mtr);
+	btr_pcur_open_low(index, 0, tuple, mode, latch_mode, cursor,
+			  file, line, mtr);
 
 	if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) {
 
diff --git a/storage/innobase/btr/btr0sea.c b/storage/innobase/btr/btr0sea.cc
index 53a0c0eb5d9..7e6e2ef1cb1 100644
--- a/storage/innobase/btr/btr0sea.c
+++ b/storage/innobase/btr/btr0sea.cc
@@ -18,13 +18,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file btr/btr0sea.c
+@file btr/btr0sea.cc
 The index tree adaptive search
 
 Created 2/17/1996 Heikki Tuuri
@@ -42,16 +42,12 @@ Created 2/17/1996 Heikki Tuuri
 #include "btr0pcur.h"
 #include "btr0btr.h"
 #include "ha0ha.h"
+#include "srv0mon.h"
 
 /** Flag: has the search system been enabled?
 Protected by btr_search_latch. */
 UNIV_INTERN char		btr_search_enabled	= TRUE;
 
-#ifdef UNIV_PFS_MUTEX
-/* Key to register btr_search_enabled_mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	btr_search_enabled_mutex_key;
-#endif /* UNIV_PFS_MUTEX */
-
 /** A dummy variable to fool the compiler */
 UNIV_INTERN ulint		btr_search_this_is_zero = 0;
 
@@ -172,14 +168,20 @@ btr_search_sys_create(
 	/* We allocate the search latch from dynamic memory:
 	see above at the global variable definition */
 
-	btr_search_latch_temp = mem_alloc(sizeof(rw_lock_t));
+	btr_search_latch_temp = (rw_lock_t*) mem_alloc(sizeof(rw_lock_t));
 
 	rw_lock_create(btr_search_latch_key, &btr_search_latch,
 		       SYNC_SEARCH_SYS);
 
-	btr_search_sys = mem_alloc(sizeof(btr_search_sys_t));
+	btr_search_sys = (btr_search_sys_t*)
+		mem_alloc(sizeof(btr_search_sys_t));
+
+	btr_search_sys->hash_index = ha_create(hash_size, 0,
+					MEM_HEAP_FOR_BTR_SEARCH, 0);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	btr_search_sys->hash_index->adaptive = TRUE;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 
-	btr_search_sys->hash_index = ha_create(hash_size, 0, 0);
 }
 
 /*****************************************************************//**
@@ -199,6 +201,28 @@ btr_search_sys_free(void)
 }
 
 /********************************************************************//**
+Set index->ref_count = 0 on all indexes of a table. */
+static
+void
+btr_search_disable_ref_count(
+/*=========================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	dict_index_t*	index;
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	for (index = dict_table_get_first_index(table); index;
+	     index = dict_table_get_next_index(index)) {
+
+		index->search_info->ref_count = 0;
+	}
+}
+
+/********************************************************************//**
 Disable the adaptive hash search system and empty the index. */
 UNIV_INTERN
 void
@@ -217,13 +241,13 @@ btr_search_disable(void)
 	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); table;
 	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-		dict_index_t*	index;
+		btr_search_disable_ref_count(table);
+	}
 
-		for (index = dict_table_get_first_index(table); index;
-		     index = dict_table_get_next_index(index)) {
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); table;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-			index->search_info->ref_count = 0;
-		}
+		btr_search_disable_ref_count(table);
 	}
 
 	mutex_exit(&dict_sys->mutex);
@@ -263,7 +287,7 @@ btr_search_info_create(
 {
 	btr_search_t*	info;
 
-	info = mem_heap_alloc(heap, sizeof(btr_search_t));
+	info = (btr_search_t*) mem_heap_alloc(heap, sizeof(btr_search_t));
 
 #ifdef UNIV_DEBUG
 	info->magic_n = BTR_SEARCH_MAGIC_N;
@@ -585,6 +609,8 @@ btr_search_update_hash_ref(
 
 		ha_insert_for_fold(btr_search_sys->hash_index, fold,
 				   block, rec);
+
+		MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
 	}
 }
 
@@ -646,7 +672,7 @@ btr_search_info_update_slow(
 		inside the called function. It might be that the compiler
 		would optimize the call just to pass pointers to block. */
 
-		params = mem_alloc(3 * sizeof(ulint));
+		params = (ulint*) mem_alloc(3 * sizeof(ulint));
 		params[0] = block->n_fields;
 		params[1] = block->n_bytes;
 		params[2] = block->left_side;
@@ -889,7 +915,7 @@ btr_search_guess_on_hash(
 	ut_ad(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_EX);
 	ut_ad(rw_lock_get_reader_count(&btr_search_latch) > 0);
 
-	rec = ha_search_and_get_data(btr_search_sys->hash_index, fold);
+	rec = (rec_t*) ha_search_and_get_data(btr_search_sys->hash_index, fold);
 
 	if (UNIV_UNLIKELY(!rec)) {
 		goto failure_unlock;
@@ -1030,7 +1056,11 @@ btr_search_drop_page_hash_index(
 	buf_block_t*	block)	/*!< in: block containing index page,
 				s- or x-latched, or an index page
 				for which we know that
-				block->buf_fix_count == 0 */
+				block->buf_fix_count == 0 or it is an
+				index page which has already been
+				removed from the buf_pool->page_hash
+				i.e.: it is in state
+				BUF_BLOCK_REMOVE_HASH */
 {
 	hash_table_t*		table;
 	ulint			n_fields;
@@ -1053,6 +1083,13 @@ btr_search_drop_page_hash_index(
 	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
+	/* Do a dirty check on block->index, return if the block is
+	not in the adaptive hash index. This is to avoid acquiring
+	shared btr_search_latch for performance consideration. */
+	if (!block->index) {
+		return;
+	}
+
 retry:
 	rw_lock_s_lock(&btr_search_latch);
 	index = block->index;
@@ -1070,7 +1107,8 @@ retry:
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
 	      || rw_lock_own(&(block->lock), RW_LOCK_EX)
-	      || (block->page.buf_fix_count == 0));
+	      || block->page.buf_fix_count == 0
+	      || buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH);
 #endif /* UNIV_SYNC_DEBUG */
 
 	n_fields = block->curr_n_fields;
@@ -1090,7 +1128,7 @@ retry:
 	/* Calculate and cache fold values into an array for fast deletion
 	from the hash index */
 
-	folds = mem_alloc(n_recs * sizeof(ulint));
+	folds = (ulint*) mem_alloc(n_recs * sizeof(ulint));
 
 	n_cached = 0;
 
@@ -1163,6 +1201,9 @@ next_rec:
 
 	block->index = NULL;
 
+	MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED);
+	MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_cached);
+
 cleanup:
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 	if (UNIV_UNLIKELY(block->n_pointers)) {
@@ -1309,8 +1350,8 @@ btr_search_build_page_hash_index(
 	/* Calculate and cache fold values and corresponding records into
 	an array for fast insertion to the hash index */
 
-	folds = mem_alloc(n_recs * sizeof(ulint));
-	recs = mem_alloc(n_recs * sizeof(rec_t*));
+	folds = (ulint*) mem_alloc(n_recs * sizeof(ulint));
+	recs = (rec_t**) mem_alloc(n_recs * sizeof(rec_t*));
 
 	n_cached = 0;
 
@@ -1412,6 +1453,8 @@ btr_search_build_page_hash_index(
 		ha_insert_for_fold(table, folds[i], block, recs[i]);
 	}
 
+	MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED);
+	MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached);
 exit_func:
 	rw_lock_x_unlock(&btr_search_latch);
 
@@ -1541,7 +1584,12 @@ btr_search_update_hash_on_delete(
 	if (block->index) {
 		ut_a(block->index == index);
 
-		ha_search_and_delete_if_found(table, fold, rec);
+		if (ha_search_and_delete_if_found(table, fold, rec)) {
+			MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED);
+		} else {
+			MONITOR_INC(
+				MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND);
+		}
 	}
 
 	rw_lock_x_unlock(&btr_search_latch);
@@ -1597,8 +1645,11 @@ btr_search_update_hash_node_on_insert(
 
 		table = btr_search_sys->hash_index;
 
-		ha_search_and_update_if_found(table, cursor->fold, rec,
-					      block, page_rec_get_next(rec));
+		if (ha_search_and_update_if_found(
+			table, cursor->fold, rec, block,
+			page_rec_get_next(rec))) {
+			MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED);
+		}
 
 func_exit:
 		rw_lock_x_unlock(&btr_search_latch);
@@ -1623,9 +1674,9 @@ btr_search_update_hash_on_insert(
 	hash_table_t*	table;
 	buf_block_t*	block;
 	dict_index_t*	index;
-	rec_t*		rec;
-	rec_t*		ins_rec;
-	rec_t*		next_rec;
+	const rec_t*	rec;
+	const rec_t*	ins_rec;
+	const rec_t*	next_rec;
 	ulint		fold;
 	ulint		ins_fold;
 	ulint		next_fold = 0; /* remove warning (??? bug ???) */
@@ -1638,12 +1689,6 @@ btr_search_update_hash_on_insert(
 	ulint*		offsets		= offsets_;
 	rec_offs_init(offsets_);
 
-	table = btr_search_sys->hash_index;
-
-	btr_search_check_free_space_in_heap();
-
-	rec = btr_cur_get_rec(cursor);
-
 	block = btr_cur_get_block(cursor);
 
 #ifdef UNIV_SYNC_DEBUG
@@ -1657,6 +1702,12 @@ btr_search_update_hash_on_insert(
 		return;
 	}
 
+	btr_search_check_free_space_in_heap();
+
+	table = btr_search_sys->hash_index;
+
+	rec = btr_cur_get_rec(cursor);
+
 	ut_a(index == cursor->index);
 	ut_a(!dict_index_is_ibuf(index));
 
@@ -1664,8 +1715,8 @@ btr_search_update_hash_on_insert(
 	n_bytes = block->curr_n_bytes;
 	left_side = block->curr_left_side;
 
-	ins_rec = page_rec_get_next(rec);
-	next_rec = page_rec_get_next(ins_rec);
+	ins_rec = page_rec_get_next_const(rec);
+	next_rec = page_rec_get_next_const(ins_rec);
 
 	offsets = rec_get_offsets(ins_rec, index, offsets,
 				  ULINT_UNDEFINED, &heap);
@@ -1815,11 +1866,12 @@ btr_search_validate(void)
 			buf_pool_mutex_enter_all();
 		}
 
-		node = hash_get_nth_cell(btr_search_sys->hash_index, i)->node;
+		node = (ha_node_t*)
+			hash_get_nth_cell(btr_search_sys->hash_index, i)->node;
 
 		for (; node != NULL; node = node->next) {
 			const buf_block_t*	block
-				= buf_block_align(node->data);
+				= buf_block_align((byte*) node->data);
 			const buf_block_t*	hash_block;
 			buf_pool_t*		buf_pool;
 			index_id_t		page_index_id;
diff --git a/storage/innobase/buf/buf0buddy.c b/storage/innobase/buf/buf0buddy.cc
index 30c31dd71a0..b6774aede8e 100644
--- a/storage/innobase/buf/buf0buddy.c
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file buf/buf0buddy.c
+@file buf/buf0buddy.cc
 Binary buddy allocator for compressed pages
 
 Created December 2006 by Marko Makela
@@ -46,7 +46,9 @@ buf_buddy_get(
 {
 	ut_ad(ut_is_2pow(size));
 	ut_ad(size >= BUF_BUDDY_LOW);
+	ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN);
 	ut_ad(size < BUF_BUDDY_HIGH);
+	ut_ad(BUF_BUDDY_HIGH == UNIV_PAGE_SIZE);
 	ut_ad(!ut_align_offset(page, size));
 
 	if (((ulint) page) & size) {
@@ -57,12 +59,15 @@ buf_buddy_get(
 }
 
 /** Validate a given zip_free list. */
-#define BUF_BUDDY_LIST_VALIDATE(b, i)				\
-	UT_LIST_VALIDATE(list, buf_page_t,			\
-			 b->zip_free[i],			\
-			 ut_ad(buf_page_get_state(		\
-				       ut_list_node_313)	\
-			       == BUF_BLOCK_ZIP_FREE))
+struct	CheckZipFree {
+	void	operator()(const buf_page_t* elem) const
+	{
+		ut_a(buf_page_get_state(elem) == BUF_BLOCK_ZIP_FREE);
+	}
+};
+
+#define BUF_BUDDY_LIST_VALIDATE(bp, i)				\
+	UT_LIST_VALIDATE(list, buf_page_t, bp->zip_free[i], CheckZipFree())
 
 /**********************************************************************//**
 Add a block to the head of the appropriate buddy free list. */
@@ -119,7 +124,7 @@ buf_buddy_alloc_zip(
 
 	ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_a(i < BUF_BUDDY_SIZES);
-	ut_a(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
+	ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
 
 	ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
 
@@ -131,7 +136,7 @@ buf_buddy_alloc_zip(
 		buf_buddy_remove_from_free(buf_pool, bpage, i);
 	} else if (i + 1 < BUF_BUDDY_SIZES) {
 		/* Attempt to split. */
-		bpage = buf_buddy_alloc_zip(buf_pool, i + 1);
+		bpage = (buf_page_t*) buf_buddy_alloc_zip(buf_pool, i + 1);
 
 		if (bpage) {
 			buf_page_t*	buddy = (buf_page_t*)
@@ -235,7 +240,7 @@ buf_buddy_alloc_from(
 {
 	ulint	offs	= BUF_BUDDY_LOW << j;
 	ut_ad(j <= BUF_BUDDY_SIZES);
-	ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
 	ut_ad(j >= i);
 	ut_ad(!ut_align_offset(buf, offs));
 
@@ -279,11 +284,11 @@ buf_buddy_alloc_low(
 	ut_ad(lru);
 	ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(!mutex_own(&buf_pool->zip_mutex));
-	ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
 
 	if (i < BUF_BUDDY_SIZES) {
 		/* Try to allocate from the buddy system. */
-		block = buf_buddy_alloc_zip(buf_pool, i);
+		block = (buf_block_t*) buf_buddy_alloc_zip(buf_pool, i);
 
 		if (block) {
 			goto func_exit;
@@ -307,7 +312,7 @@ buf_buddy_alloc_low(
 alloc_big:
 	buf_buddy_block_register(block);
 
-	block = buf_buddy_alloc_from(
+	block = (buf_block_t*) buf_buddy_alloc_from(
 		buf_pool, block->frame, i, BUF_BUDDY_SIZES);
 
 func_exit:
@@ -338,7 +343,7 @@ buf_buddy_relocate(
 	ut_ad(!mutex_own(&buf_pool->zip_mutex));
 	ut_ad(!ut_align_offset(src, size));
 	ut_ad(!ut_align_offset(dst, size));
-	ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
 	UNIV_MEM_ASSERT_W(dst, size);
 
 	/* We assume that all memory from buf_buddy_alloc()
@@ -358,9 +363,9 @@ buf_buddy_relocate(
 	pool), so there is nothing wrong about this.  The
 	mach_read_from_4() calls here will only trigger bogus
 	Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */
-	space	= mach_read_from_4((const byte *) src
+	space	= mach_read_from_4((const byte*) src
 				   + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
-	page_no	= mach_read_from_4((const byte *) src
+	page_no	= mach_read_from_4((const byte*) src
 				   + FIL_PAGE_OFFSET);
 	/* Suppress Valgrind warnings about conditional jump
 	on uninitialized value. */
@@ -399,7 +404,7 @@ buf_buddy_relocate(
 		ullint	usec	= ut_time_us(NULL);
 		ut_a(bpage->zip.data == src);
 		memcpy(dst, src, size);
-		bpage->zip.data = dst;
+		bpage->zip.data = (page_zip_t*) dst;
 		mutex_exit(mutex);
 		UNIV_MEM_INVALID(src, size);
 		{
@@ -434,7 +439,7 @@ buf_buddy_free_low(
 	ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(!mutex_own(&buf_pool->zip_mutex));
 	ut_ad(i <= BUF_BUDDY_SIZES);
-	ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE));
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
 	ut_ad(buf_pool->buddy_stat[i].used > 0);
 
 	buf_pool->buddy_stat[i].used--;
@@ -521,7 +526,7 @@ buddy_nonfree:
 
 func_exit:
 	/* Free the block to the buddy list. */
-	bpage = buf;
+	bpage = (buf_page_t*) buf;
 
 	/* Fill large blocks with a constant pattern. */
 	ut_d(memset(bpage, i, BUF_BUDDY_LOW << i));
diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.cc
index 1672057d552..2ae668ace50 100644
--- a/storage/innobase/buf/buf0buf.c
+++ b/storage/innobase/buf/buf0buf.cc
@@ -18,13 +18,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file buf/buf0buf.c
+@file buf/buf0buf.cc
 The database buffer buf_pool
 
 Created 11/5/1995 Heikki Tuuri
@@ -51,6 +51,9 @@ Created 11/5/1995 Heikki Tuuri
 #include "dict0dict.h"
 #include "log0recv.h"
 #include "page0zip.h"
+#include "srv0mon.h"
+#include "buf0checksum.h"
+#include "buf0dblwr.h"
 
 /*
 		IMPLEMENTATION OF THE BUFFER POOL
@@ -182,7 +185,7 @@ uncompressed pages are accessible via buf_block_t objects that are
 reachable via buf_pool->chunks[].
 
 The chains of free memory blocks (buf_pool->zip_free[]) are used by
-the buddy allocator (buf0buddy.c) to keep track of currently unused
+the buddy allocator (buf0buddy.cc) to keep track of currently unused
 memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2.  These
 blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
 BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
@@ -294,29 +297,26 @@ be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
 # endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
 #endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
 
-/** A chunk of buffers.  The buffer pool is allocated in chunks. */
-struct buf_chunk_struct{
-	ulint		mem_size;	/*!< allocated size of the chunk */
-	ulint		size;		/*!< size of frames[] and blocks[] */
-	void*		mem;		/*!< pointer to the memory area which
-					was allocated for the frames */
-	buf_block_t*	blocks;		/*!< array of buffer control blocks */
-};
-#endif /* !UNIV_HOTBACKUP */
+/** Macro to determine whether the read of write counter is used depending
+on the io_type */
+#define MONITOR_RW_COUNTER(io_type, counter)		\
+	((io_type == BUF_IO_READ)			\
+	 ? (counter##_READ)				\
+	 : (counter##_WRITTEN))
 
 /********************************************************************//**
 Gets the smallest oldest_modification lsn for any page in the pool. Returns
 zero if all modified pages have been flushed to disk.
 @return oldest modification in pool, zero if none */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 buf_pool_get_oldest_modification(void)
 /*==================================*/
 {
 	ulint		i;
 	buf_page_t*	bpage;
-	ib_uint64_t	lsn = 0;
-	ib_uint64_t	oldest_lsn = 0;
+	lsn_t		lsn = 0;
+	lsn_t		oldest_lsn = 0;
 
 	/* When we traverse all the flush lists we don't want another
 	thread to add a dirty page to any flush list. */
@@ -371,6 +371,11 @@ buf_get_total_list_len(
 		buf_pool_t*	buf_pool;
 
 		buf_pool = buf_pool_from_array(i);
+
+		if (!buf_pool) {
+			continue;
+		}
+
 		*LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
 		*free_len += UT_LIST_GET_LEN(buf_pool->free);
 		*flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
@@ -395,6 +400,10 @@ buf_get_total_stat(
 
 		buf_pool = buf_pool_from_array(i);
 
+		if (!buf_pool) {
+			continue;
+		}
+
 		buf_stat = &buf_pool->stat;
 		tot_stat->n_page_gets += buf_stat->n_page_gets;
 		tot_stat->n_pages_read += buf_stat->n_pages_read;
@@ -438,60 +447,7 @@ buf_block_alloc(
 
 	return(block);
 }
-
-/********************************************************************//**
-Calculates a page checksum which is stored to the page when it is written
-to a file. Note that we must be careful to calculate the same value on
-32-bit and 64-bit architectures.
-@return	checksum */
-UNIV_INTERN
-ulint
-buf_calc_page_new_checksum(
-/*=======================*/
-	const byte*	page)	/*!< in: buffer page */
-{
-	ulint checksum;
-
-	/* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
-	..._ARCH_LOG_NO, are written outside the buffer pool to the first
-	pages of data files, we have to skip them in the page checksum
-	calculation.
-	We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
-	checksum is stored, and also the last 8 bytes of page because
-	there we store the old formula checksum. */
-
-	checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
-				  FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
-		+ ut_fold_binary(page + FIL_PAGE_DATA,
-				 UNIV_PAGE_SIZE - FIL_PAGE_DATA
-				 - FIL_PAGE_END_LSN_OLD_CHKSUM);
-	checksum = checksum & 0xFFFFFFFFUL;
-
-	return(checksum);
-}
-
-/********************************************************************//**
-In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
-looked at the first few bytes of the page. This calculates that old
-checksum.
-NOTE: we must first store the new formula checksum to
-FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
-because this takes that field as an input!
-@return	checksum */
-UNIV_INTERN
-ulint
-buf_calc_page_old_checksum(
-/*=======================*/
-	const byte*	page)	/*!< in: buffer page */
-{
-	ulint checksum;
-
-	checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
-
-	checksum = checksum & 0xFFFFFFFFUL;
-
-	return(checksum);
-}
+#endif /* !UNIV_HOTBACKUP */
 
 /********************************************************************//**
 Checks if a page is corrupt.
@@ -504,10 +460,12 @@ buf_page_is_corrupted(
 	ulint		zip_size)	/*!< in: size of compressed page;
 					0 for uncompressed pages */
 {
-	ulint		checksum_field;
-	ulint		old_checksum_field;
+	ulint		checksum_field1;
+	ulint		checksum_field2;
+	ibool		crc32_inited = FALSE;
+	ib_uint32_t	crc32 = ULINT32_UNDEFINED;
 
-	if (UNIV_LIKELY(!zip_size)
+	if (!zip_size
 	    && memcmp(read_buf + FIL_PAGE_LSN + 4,
 		      read_buf + UNIV_PAGE_SIZE
 		      - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
@@ -520,7 +478,7 @@ buf_page_is_corrupted(
 
 #ifndef UNIV_HOTBACKUP
 	if (recv_lsn_checks_on) {
-		ib_uint64_t	current_lsn;
+		lsn_t	current_lsn;
 
 		if (log_peek_lsn(&current_lsn)
 		    && UNIV_UNLIKELY
@@ -530,69 +488,189 @@ buf_page_is_corrupted(
 
 			fprintf(stderr,
 				"  InnoDB: Error: page %lu log sequence number"
-				" %llu\n"
+				" " LSN_PF "\n"
 				"InnoDB: is in the future! Current system "
-				"log sequence number %llu.\n"
+				"log sequence number " LSN_PF ".\n"
 				"InnoDB: Your database may be corrupt or "
 				"you may have copied the InnoDB\n"
 				"InnoDB: tablespace but not the InnoDB "
 				"log files. See\n"
-				"InnoDB: " REFMAN "forcing-innodb-recovery.html\n"
+				"InnoDB: " REFMAN
+				"forcing-innodb-recovery.html\n"
 				"InnoDB: for more information.\n",
-				(ulong) mach_read_from_4(read_buf
-							 + FIL_PAGE_OFFSET),
-				mach_read_from_8(read_buf + FIL_PAGE_LSN),
+				(ulong) mach_read_from_4(
+					read_buf + FIL_PAGE_OFFSET),
+				(lsn_t) mach_read_from_8(
+					read_buf + FIL_PAGE_LSN),
 				current_lsn);
 		}
 	}
 #endif
 
-	/* If we use checksums validation, make additional check before
-	returning TRUE to ensure that the checksum is not equal to
-	BUF_NO_CHECKSUM_MAGIC which might be stored by InnoDB with checksums
-	disabled. Otherwise, skip checksum calculation and return FALSE */
+	/* Check whether the checksum fields have correct values */
+
+	if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_NONE) {
+		return(FALSE);
+	}
+
+	if (zip_size) {
+		return(!page_zip_verify_checksum(read_buf, zip_size));
+	}
+
+	checksum_field1 = mach_read_from_4(
+		read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
 
-	if (UNIV_LIKELY(srv_use_checksums)) {
-		checksum_field = mach_read_from_4(read_buf
-						  + FIL_PAGE_SPACE_OR_CHKSUM);
+	checksum_field2 = mach_read_from_4(
+		read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM);
 
-		if (UNIV_UNLIKELY(zip_size)) {
-			return(checksum_field != BUF_NO_CHECKSUM_MAGIC
-			       && checksum_field
-			       != page_zip_calc_checksum(read_buf, zip_size));
-		}
+	/* declare empty pages non-corrupted */
+	if (checksum_field1 == 0 && checksum_field2 == 0
+	    && mach_read_from_4(read_buf + FIL_PAGE_LSN) == 0) {
+		/* make sure that the page is really empty */
+		ut_d(for (ulint i = 0; i < UNIV_PAGE_SIZE; i++) {
+		     ut_a(read_buf[i] == 0); });
 
-		old_checksum_field = mach_read_from_4(
-			read_buf + UNIV_PAGE_SIZE
-			- FIL_PAGE_END_LSN_OLD_CHKSUM);
+		return(FALSE);
+	}
+
+	switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+
+		crc32 = buf_calc_page_crc32(read_buf);
+
+		return(checksum_field1 != crc32 || checksum_field2 != crc32);
+
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
 
-		/* There are 2 valid formulas for old_checksum_field:
+		return(checksum_field1
+		       != buf_calc_page_new_checksum(read_buf)
+		       || checksum_field2
+		       != buf_calc_page_old_checksum(read_buf));
+
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+
+		return(checksum_field1 != BUF_NO_CHECKSUM_MAGIC
+		       || checksum_field2 != BUF_NO_CHECKSUM_MAGIC);
+
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+		/* There are 3 valid formulas for
+		checksum_field2 (old checksum field):
 
 		1. Very old versions of InnoDB only stored 8 byte lsn to the
 		start and the end of the page.
 
-		2. Newer InnoDB versions store the old formula checksum
-		there. */
+		2. InnoDB versions before MySQL 5.6.3 store the old formula
+		checksum (buf_calc_page_old_checksum()).
 
-		if (old_checksum_field != mach_read_from_4(read_buf
-							   + FIL_PAGE_LSN)
-		    && old_checksum_field != BUF_NO_CHECKSUM_MAGIC
-		    && old_checksum_field
-		    != buf_calc_page_old_checksum(read_buf)) {
+		3. InnoDB versions 5.6.3 and newer with
+		innodb_checksum_algorithm=strict_crc32|crc32 store CRC32. */
 
-			return(TRUE);
+		/* since innodb_checksum_algorithm is not strict_* allow
+		any of the algos to match for the old field */
+
+		if (checksum_field2
+		    != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+		    && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {
+
+			/* The checksum does not match any of the
+			fast to check. First check the selected algorithm
+			for writing checksums because we assume that the
+			chance of it matching is higher. */
+
+			if (srv_checksum_algorithm
+			    == SRV_CHECKSUM_ALGORITHM_CRC32) {
+
+				crc32 = buf_calc_page_crc32(read_buf);
+				crc32_inited = TRUE;
+
+				if (checksum_field2 != crc32
+				    && checksum_field2
+				    != buf_calc_page_old_checksum(read_buf)) {
+
+					return(TRUE);
+				}
+			} else {
+				ut_ad(srv_checksum_algorithm
+				     == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+				if (checksum_field2
+				    != buf_calc_page_old_checksum(read_buf)) {
+
+					crc32 = buf_calc_page_crc32(read_buf);
+					crc32_inited = TRUE;
+
+					if (checksum_field2 != crc32) {
+						return(TRUE);
+					}
+				}
+			}
 		}
 
+		/* old field is fine, check the new field */
+
 		/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
 		(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
 
-		if (checksum_field != 0
-		    && checksum_field != BUF_NO_CHECKSUM_MAGIC
-		    && checksum_field
-		    != buf_calc_page_new_checksum(read_buf)) {
+		if (checksum_field1 != 0
+		    && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) {
+
+			/* The checksum does not match any of the
+			fast to check. First check the selected algorithm
+			for writing checksums because we assume that the
+			chance of it matching is higher. */
+
+			if (srv_checksum_algorithm
+			    == SRV_CHECKSUM_ALGORITHM_CRC32) {
+
+				if (!crc32_inited) {
+					crc32 = buf_calc_page_crc32(read_buf);
+					crc32_inited = TRUE;
+				}
+
+				if (checksum_field1 != crc32
+				    && checksum_field1
+				    != buf_calc_page_new_checksum(read_buf)) {
+
+					return(TRUE);
+				}
+			} else {
+				ut_ad(srv_checksum_algorithm
+				     == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+				if (checksum_field1
+				    != buf_calc_page_new_checksum(read_buf)) {
+
+					if (!crc32_inited) {
+						crc32 = buf_calc_page_crc32(
+							read_buf);
+						crc32_inited = TRUE;
+					}
+
+					if (checksum_field1 != crc32) {
+						return(TRUE);
+					}
+				}
+			}
+		}
+
+		/* If CRC32 is stored in at least one of the fields, then the
+		other field must also be CRC32 */
+		if (crc32_inited
+		    && ((checksum_field1 == crc32
+			 && checksum_field2 != crc32)
+			|| (checksum_field1 != crc32
+			    && checksum_field2 == crc32))) {
 
 			return(TRUE);
 		}
+
+		break;
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+		/* should have returned FALSE earlier */
+		ut_error;
+	/* no default so the compiler will emit a warning if new enum
+	is added and not handled here */
 	}
 
 	return(FALSE);
@@ -615,9 +693,7 @@ buf_page_print(
 #ifndef UNIV_HOTBACKUP
 	dict_index_t*	index;
 #endif /* !UNIV_HOTBACKUP */
-	ulint		checksum;
-	ulint		old_checksum;
-	ulint		size	= zip_size;
+	ulint		size = zip_size;
 
 	if (!size) {
 		size = UNIV_PAGE_SIZE;
@@ -626,7 +702,7 @@ buf_page_print(
 	if (!(flags & BUF_PAGE_PRINT_NO_FULL)) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: Page dump in ascii and hex (%lu bytes):\n",
+			" InnoDB: Page dump in ascii and hex (%lu bytes):\n",
 			(ulong) size);
 		ut_print_buf(stderr, read_buf, size);
 		fputs("\nInnoDB: End of page dump\n", stderr);
@@ -634,102 +710,80 @@ buf_page_print(
 
 	if (zip_size) {
 		/* Print compressed page. */
-
-		switch (fil_page_get_type(read_buf)) {
-		case FIL_PAGE_TYPE_ZBLOB:
-		case FIL_PAGE_TYPE_ZBLOB2:
-			checksum = srv_use_checksums
-				? page_zip_calc_checksum(read_buf, zip_size)
-				: BUF_NO_CHECKSUM_MAGIC;
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Compressed BLOB page"
-				" checksum %lu, stored %lu\n"
-				"InnoDB: Page lsn %lu %lu\n"
-				"InnoDB: Page number (if stored"
-				" to page already) %lu,\n"
-				"InnoDB: space id (if stored"
-				" to page already) %lu\n",
-				(ulong) checksum,
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_LSN),
-				(ulong) mach_read_from_4(
-					read_buf + (FIL_PAGE_LSN + 4)),
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_OFFSET),
-				(ulong) mach_read_from_4(
-					read_buf
-					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
-			return;
-		default:
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: unknown page type %lu,"
-				" assuming FIL_PAGE_INDEX\n",
-				fil_page_get_type(read_buf));
-			/* fall through */
-		case FIL_PAGE_INDEX:
-			checksum = srv_use_checksums
-				? page_zip_calc_checksum(read_buf, zip_size)
-				: BUF_NO_CHECKSUM_MAGIC;
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Compressed page checksum %lu,"
-				" stored %lu\n"
-				"InnoDB: Page lsn %lu %lu\n"
-				"InnoDB: Page number (if stored"
-				" to page already) %lu,\n"
-				"InnoDB: space id (if stored"
-				" to page already) %lu\n",
-				(ulong) checksum,
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_LSN),
-				(ulong) mach_read_from_4(
-					read_buf + (FIL_PAGE_LSN + 4)),
-				(ulong) mach_read_from_4(
-					read_buf + FIL_PAGE_OFFSET),
-				(ulong) mach_read_from_4(
-					read_buf
-					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
-			return;
-		case FIL_PAGE_TYPE_XDES:
-			/* This is an uncompressed page. */
-			break;
-		}
-	}
-
-	checksum = srv_use_checksums
-		? buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
-	old_checksum = srv_use_checksums
-		? buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
-
-	ut_print_timestamp(stderr);
-	fprintf(stderr,
-		"  InnoDB: Page checksum %lu, prior-to-4.0.14-form"
-		" checksum %lu\n"
-		"InnoDB: stored checksum %lu, prior-to-4.0.14-form"
-		" stored checksum %lu\n"
-		"InnoDB: Page lsn %lu %lu, low 4 bytes of lsn"
-		" at page end %lu\n"
-		"InnoDB: Page number (if stored to page already) %lu,\n"
-		"InnoDB: space id (if created with >= MySQL-4.1.1"
-		" and stored already) %lu\n",
-		(ulong) checksum, (ulong) old_checksum,
-		(ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
-		(ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Compressed page type (" ULINTPF "); "
+			"stored checksum in field1 " ULINTPF "; "
+			"calculated checksums for field1: "
+			"%s " ULINTPF ", "
+			"%s " ULINTPF ", "
+			"%s " ULINTPF "; "
+			"page LSN " LSN_PF "; "
+			"page number (if stored to page already) " ULINTPF "; "
+			"space id (if stored to page already) " ULINTPF "\n",
+			fil_page_get_type(read_buf),
+			mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+			buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_CRC32),
+			page_zip_calc_checksum(read_buf, zip_size,
+				SRV_CHECKSUM_ALGORITHM_CRC32),
+			buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_INNODB),
+			page_zip_calc_checksum(read_buf, zip_size,
+				SRV_CHECKSUM_ALGORITHM_INNODB),
+			buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_NONE),
+			page_zip_calc_checksum(read_buf, zip_size,
+				SRV_CHECKSUM_ALGORITHM_NONE),
+			mach_read_from_8(read_buf + FIL_PAGE_LSN),
+			mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+			mach_read_from_4(read_buf
+					 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+	} else {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: uncompressed page, "
+			"stored checksum in field1 " ULINTPF ", "
+			"calculated checksums for field1: "
+			"%s " UINT32PF ", "
+			"%s " ULINTPF ", "
+			"%s " ULINTPF ", "
+
+			"stored checksum in field2 " ULINTPF ", "
+			"calculated checksums for field2: "
+			"%s " UINT32PF ", "
+			"%s " ULINTPF ", "
+			"%s " ULINTPF ", "
+
+			"page LSN " ULINTPF " " ULINTPF ", "
+			"low 4 bytes of LSN at page end " ULINTPF ", "
+			"page number (if stored to page already) " ULINTPF ", "
+			"space id (if created with >= MySQL-4.1.1 "
+			"and stored already) %lu\n",
+			mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
+			buf_calc_page_crc32(read_buf),
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
+			buf_calc_page_new_checksum(read_buf),
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
+			BUF_NO_CHECKSUM_MAGIC,
+
+			mach_read_from_4(read_buf + UNIV_PAGE_SIZE
 					 - FIL_PAGE_END_LSN_OLD_CHKSUM),
-		(ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN),
-		(ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
-		(ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
+			buf_calc_page_crc32(read_buf),
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
+			buf_calc_page_old_checksum(read_buf),
+			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
+			BUF_NO_CHECKSUM_MAGIC,
+
+			mach_read_from_4(read_buf + FIL_PAGE_LSN),
+			mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
+			mach_read_from_4(read_buf + UNIV_PAGE_SIZE
 					 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
-		(ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
-		(ulong) mach_read_from_4(read_buf
+			mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+			mach_read_from_4(read_buf
 					 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+	}
 
 #ifndef UNIV_HOTBACKUP
 	if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
@@ -874,7 +928,7 @@ buf_block_init(
 	buf_block_t*	block,		/*!< in: pointer to control block */
 	byte*		frame)		/*!< in: pointer to buffer frame */
 {
-	UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE, block);
+	UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE);
 
 	block->frame = frame;
 
@@ -931,7 +985,6 @@ buf_block_init(
 #endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
 
 	ut_ad(rw_lock_validate(&(block->lock)));
-
 }
 
 /********************************************************************//**
@@ -966,14 +1019,14 @@ buf_chunk_init(
 
 	/* Allocate the block descriptors from
 	the start of the memory block. */
-	chunk->blocks = chunk->mem;
+	chunk->blocks = (buf_block_t*) chunk->mem;
 
 	/* Align a pointer to the first frame.  Note that when
 	os_large_page_size is smaller than UNIV_PAGE_SIZE,
 	we may allocate one fewer block than requested.  When
 	it is bigger, we may allocate more blocks than requested. */
 
-	frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
+	frame = (byte*) ut_align(chunk->mem, UNIV_PAGE_SIZE);
 	chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
 		- (frame != chunk->mem);
 
@@ -1171,7 +1224,9 @@ buf_pool_init_instance(
 
 	if (buf_pool_size > 0) {
 		buf_pool->n_chunks = 1;
-		buf_pool->chunks = chunk = mem_zalloc(sizeof *chunk);
+
+		buf_pool->chunks = chunk =
+			(buf_chunk_t*) mem_zalloc(sizeof *chunk);
 
 		UT_LIST_INIT(buf_pool->free);
 
@@ -1189,7 +1244,18 @@ buf_pool_init_instance(
 		buf_pool->curr_size = chunk->size;
 		buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
 
-		buf_pool->page_hash = hash_create(2 * buf_pool->curr_size);
+		/* Number of locks protecting page_hash must be a
+		power of two */
+		srv_n_page_hash_locks =
+				 ut_2_power_up(srv_n_page_hash_locks);
+		ut_a(srv_n_page_hash_locks != 0);
+		ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
+
+		buf_pool->page_hash = ha_create(2 * buf_pool->curr_size,
+						srv_n_page_hash_locks,
+						MEM_HEAP_FOR_PAGE_HASH,
+						SYNC_BUF_PAGE_HASH);
+
 		buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
 
 		buf_pool->last_printout_time = ut_time();
@@ -1204,11 +1270,13 @@ buf_pool_init_instance(
 		buf_pool->no_flush[i] = os_event_create(NULL);
 	}
 
-	/* 3. Initialize LRU fields
-	--------------------------- */
+	buf_pool->watch = (buf_page_t*) mem_zalloc(
+		sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE);
 
 	/* All fields are initialized by mem_zalloc(). */
 
+	buf_pool->try_LRU_scan = TRUE;
+
 	buf_pool_mutex_exit(buf_pool);
 
 	return(DB_SUCCESS);
@@ -1246,6 +1314,9 @@ buf_pool_free_instance(
 		bpage = prev_bpage;
 	}
 
+	mem_free(buf_pool->watch);
+	buf_pool->watch = NULL;
+
 	chunks = buf_pool->chunks;
 	chunk = chunks + buf_pool->n_chunks;
 
@@ -1254,6 +1325,7 @@ buf_pool_free_instance(
 	}
 
 	mem_free(buf_pool->chunks);
+	ha_clear(buf_pool->page_hash);
 	hash_table_free(buf_pool->page_hash);
 	hash_table_free(buf_pool->zip_hash);
 }
@@ -1275,10 +1347,8 @@ buf_pool_init(
 	ut_ad(n_instances <= MAX_BUFFER_POOLS);
 	ut_ad(n_instances == srv_buf_pool_instances);
 
-	/* We create an extra buffer pool instance, this instance is used
-	for flushing the flush lists, to keep track of n_flush for all
-	the buffer pools and also used as a waiting object during flushing. */
-	buf_pool_ptr = mem_zalloc(n_instances * sizeof *buf_pool_ptr);
+	buf_pool_ptr = (buf_pool_t*) mem_zalloc(
+		n_instances * sizeof *buf_pool_ptr);
 
 	for (i = 0; i < n_instances; i++) {
 		buf_pool_t*	ptr	= &buf_pool_ptr[i];
@@ -1380,15 +1450,21 @@ buf_relocate(
 	ulint		fold;
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
 
+	fold = buf_page_address_fold(bpage->space, bpage->offset);
+
 	ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage));
 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
 	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
 	ut_a(bpage->buf_fix_count == 0);
 	ut_ad(bpage->in_LRU_list);
 	ut_ad(!bpage->in_zip_hash);
 	ut_ad(bpage->in_page_hash);
-	ut_ad(bpage == buf_page_hash_get(buf_pool,
-			       		 bpage->space, bpage->offset));
+	ut_ad(bpage == buf_page_hash_get_low(buf_pool,
+					     bpage->space,
+					     bpage->offset,
+					     fold));
+
 	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
 #ifdef UNIV_DEBUG
 	switch (buf_page_get_state(bpage)) {
@@ -1437,12 +1513,10 @@ buf_relocate(
 #endif /* UNIV_LRU_DEBUG */
 	}
 
-	ut_d(UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU,
-			      ut_ad(ut_list_node_313->in_LRU_list)));
+        ut_d(UT_LIST_VALIDATE(
+		LRU, buf_page_t, buf_pool->LRU, CheckInLRUList()));
 
 	/* relocate buf_pool->page_hash */
-	fold = buf_page_address_fold(bpage->space, bpage->offset);
-
 	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
 	HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
 }
@@ -1457,6 +1531,8 @@ buf_pool_watch_is_sentinel(
 	buf_pool_t*		buf_pool,	/*!< buffer pool instance */
 	const buf_page_t*	bpage)		/*!< in: block */
 {
+	/* We must also own the appropriate hash lock. */
+	ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage));
 	ut_ad(buf_page_in_file(bpage));
 
 	if (bpage < &buf_pool->watch[0]
@@ -1477,8 +1553,9 @@ buf_pool_watch_is_sentinel(
 }
 
 /****************************************************************//**
-Add watch for the given page to be read in. Caller must have the buffer pool
-mutex reserved.
+Add watch for the given page to be read in. Caller must have
+appropriate hash_lock for the bpage. This function may release the
+hash_lock and reacquire it.
 @return NULL if watch set, block if the page is in the buffer pool */
 UNIV_INTERN
 buf_page_t*
@@ -1491,12 +1568,18 @@ buf_pool_watch_set(
 	buf_page_t*	bpage;
 	ulint		i;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+	rw_lock_t*	hash_lock;
 
-	ut_ad(buf_pool_mutex_own(buf_pool));
+	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
 
 	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
 
 	if (UNIV_LIKELY_NULL(bpage)) {
+page_found:
 		if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
 			/* The page was loaded meanwhile. */
 			return(bpage);
@@ -1506,6 +1589,34 @@ buf_pool_watch_set(
 		return(NULL);
 	}
 
+	/* From this point this function becomes fairly heavy in terms
+	of latching. We acquire the buf_pool mutex as well as all the
+	hash_locks. buf_pool mutex is needed because any changes to
+	the page_hash must be covered by it and hash_locks are needed
+	because we don't want to read any stale information in
+	buf_pool->watch[]. However, it is not in the critical code path
+	as this function will be called only by the purge thread. */
+
+
+	/* To obey latching order first release the hash_lock. */
+	rw_lock_x_unlock(hash_lock);
+
+	buf_pool_mutex_enter(buf_pool);
+	hash_lock_x_all(buf_pool->page_hash);
+
+	/* We have to recheck that the page
+	was not loaded or a watch set by some other
+	purge thread. This is because of the small
+	time window between when we release the
+	hash_lock to acquire buf_pool mutex above. */
+
+	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+	if (UNIV_LIKELY_NULL(bpage)) {
+		buf_pool_mutex_exit(buf_pool);
+		hash_unlock_x_all_but(buf_pool->page_hash, hash_lock);
+		goto page_found;
+	}
+
 	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
 		bpage = &buf_pool->watch[i];
 
@@ -1533,6 +1644,14 @@ buf_pool_watch_set(
 			ut_d(bpage->in_page_hash = TRUE);
 			HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
 				    fold, bpage);
+
+			buf_pool_mutex_exit(buf_pool);
+			/* Once the sentinel is in the page_hash we can
+			safely release all locks except just the
+			relevant hash_lock */
+			hash_unlock_x_all_but(buf_pool->page_hash,
+						hash_lock);
+
 			return(NULL);
 		case BUF_BLOCK_ZIP_PAGE:
 			ut_ad(bpage->in_page_hash);
@@ -1567,6 +1686,12 @@ buf_pool_watch_remove(
 					space, offset) */
 	buf_page_t*	watch)		/*!< in/out: sentinel for watch */
 {
+#ifdef UNIV_SYNC_DEBUG
+	/* We must also own the appropriate hash_bucket mutex. */
+	rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+	ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
 	ut_ad(buf_pool_mutex_own(buf_pool));
 
 	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, watch);
@@ -1588,8 +1713,18 @@ buf_pool_watch_unset(
 	buf_page_t*	bpage;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
 	ulint		fold = buf_page_address_fold(space, offset);
-
+	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool,
+							     fold);
+
+	/* We only need to have buf_pool mutex in case where we end
+	up calling buf_pool_watch_remove but to obey latching order
+	we acquire it here before acquiring hash_lock. This should
+	not cause too much grief as this function is only ever
+	called from the purge thread. */
 	buf_pool_mutex_enter(buf_pool);
+
+	rw_lock_x_lock(hash_lock);
+
 	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
 	/* The page must exist because buf_pool_watch_set()
 	increments buf_fix_count. */
@@ -1611,6 +1746,7 @@ buf_pool_watch_unset(
 	}
 
 	buf_pool_mutex_exit(buf_pool);
+	rw_lock_x_unlock(hash_lock);
 }
 
 /****************************************************************//**
@@ -1629,15 +1765,17 @@ buf_pool_watch_occurred(
 	buf_page_t*	bpage;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
 	ulint		fold	= buf_page_address_fold(space, offset);
+	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool,
+							     fold);
 
-	buf_pool_mutex_enter(buf_pool);
+	rw_lock_s_lock(hash_lock);
 
 	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
 	/* The page must exist because buf_pool_watch_set()
 	increments buf_fix_count. */
 	ut_a(bpage);
 	ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
-	buf_pool_mutex_exit(buf_pool);
+	rw_lock_s_unlock(hash_lock);
 
 	return(ret);
 }
@@ -1736,20 +1874,22 @@ buf_page_set_file_page_was_freed(
 {
 	buf_page_t*	bpage;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+	rw_lock_t*	hash_lock;
 
-	buf_pool_mutex_enter(buf_pool);
-
-	bpage = buf_page_hash_get(buf_pool, space, offset);
+	bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+					   &hash_lock);
 
 	if (bpage) {
+		mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+		mutex_enter(block_mutex);
+		rw_lock_s_unlock(hash_lock);
 		/* bpage->file_page_was_freed can already hold
 		when this code is invoked from dict_drop_index_tree() */
 		bpage->file_page_was_freed = TRUE;
+		mutex_exit(block_mutex);
 	}
 
-	buf_pool_mutex_exit(buf_pool);
-
 	return(bpage);
 }
 
@@ -1768,21 +1908,53 @@ buf_page_reset_file_page_was_freed(
 {
 	buf_page_t*	bpage;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+	rw_lock_t*	hash_lock;
+
+	bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+					   &hash_lock);
+	if (bpage) {
+		mutex_t*	block_mutex = buf_page_get_mutex(bpage);
+		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+		mutex_enter(block_mutex);
+		rw_lock_s_unlock(hash_lock);
+		bpage->file_page_was_freed = FALSE;
+		mutex_exit(block_mutex);
+	}
+
+	return(bpage);
+}
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+
+/********************************************************************//**
+Attempts to discard the uncompressed frame of a compressed page. The
+caller should not be holding any mutexes when this function is called.
+@return	TRUE if successful, FALSE otherwise. */
+static
+void
+buf_block_try_discard_uncompressed(
+/*===============================*/
+	ulint		space,	/*!< in: space id */
+	ulint		offset)	/*!< in: page number */
+{
+	buf_page_t*	bpage;
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
 
+	/* Since we need to acquire buf_pool mutex to discard
+	the uncompressed frame and because page_hash mutex resides
+	below buf_pool mutex in sync ordering therefore we must
+	first release the page_hash mutex. This means that the
+	block in question can move out of page_hash. Therefore
+	we need to check again if the block is still in page_hash. */
 	buf_pool_mutex_enter(buf_pool);
 
 	bpage = buf_page_hash_get(buf_pool, space, offset);
 
 	if (bpage) {
-		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
-		bpage->file_page_was_freed = FALSE;
+		buf_LRU_free_block(bpage, FALSE);
 	}
 
 	buf_pool_mutex_exit(buf_pool);
-
-	return(bpage);
 }
-#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
 
 /********************************************************************//**
 Get read access to a compressed page (usually of type
@@ -1803,6 +1975,8 @@ buf_page_get_zip(
 {
 	buf_page_t*	bpage;
 	mutex_t*	block_mutex;
+	rw_lock_t*	hash_lock;
+	ibool		discard_attempted = FALSE;
 	ibool		must_read;
 	unsigned	access_time;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
@@ -1810,9 +1984,12 @@ buf_page_get_zip(
 	buf_pool->stat.n_page_gets++;
 
 	for (;;) {
-		buf_pool_mutex_enter(buf_pool);
 lookup:
-		bpage = buf_page_hash_get(buf_pool, space, offset);
+
+		/* The following call will also grab the page_hash
+		mutex if the page is found. */
+		bpage = buf_page_hash_get_s_locked(buf_pool, space,
+						offset, &hash_lock);
 		if (bpage) {
 			ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
 			break;
@@ -1820,8 +1997,7 @@ lookup:
 
 		/* Page not in buf_pool: needs to be read from file */
 
-		buf_pool_mutex_exit(buf_pool);
-
+		ut_ad(!hash_lock);
 		buf_read_page(space, zip_size, offset);
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
@@ -1829,10 +2005,12 @@ lookup:
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 	}
 
-	if (UNIV_UNLIKELY(!bpage->zip.data)) {
+	ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage));
+
+	if (!bpage->zip.data) {
 		/* There is no compressed page. */
 err_exit:
-		buf_pool_mutex_exit(buf_pool);
+		rw_lock_s_unlock(hash_lock);
 		return(NULL);
 	}
 
@@ -1852,16 +2030,17 @@ err_exit:
 		bpage->buf_fix_count++;
 		goto got_block;
 	case BUF_BLOCK_FILE_PAGE:
-		block_mutex = &((buf_block_t*) bpage)->mutex;
-		mutex_enter(block_mutex);
-
 		/* Discard the uncompressed page frame if possible. */
-		if (buf_LRU_free_block(bpage, FALSE)) {
-
-			mutex_exit(block_mutex);
+		if (!discard_attempted) {
+			rw_lock_s_unlock(hash_lock);
+			buf_block_try_discard_uncompressed(space,
+							   offset);
+			discard_attempted = TRUE;
 			goto lookup;
 		}
 
+		block_mutex = &((buf_block_t*) bpage)->mutex;
+		mutex_enter(block_mutex);
 		buf_block_buf_fix_inc((buf_block_t*) bpage,
 				      __FILE__, __LINE__);
 		goto got_block;
@@ -1874,15 +2053,13 @@ got_block:
 	must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
 	access_time = buf_page_is_accessed(bpage);
 
-	buf_pool_mutex_exit(buf_pool);
-
-	mutex_exit(block_mutex);
-
-	buf_page_set_accessed_make_young(bpage, access_time);
-
+	rw_lock_s_unlock(hash_lock);
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
 	ut_a(!bpage->file_page_was_freed);
 #endif
+	mutex_exit(block_mutex);
+
+	buf_page_set_accessed_make_young(bpage, access_time);
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
@@ -1945,26 +2122,28 @@ buf_zip_decompress(
 	buf_block_t*	block,	/*!< in/out: block */
 	ibool		check)	/*!< in: TRUE=verify the page checksum */
 {
-	const byte*	frame		= block->page.zip.data;
-	ulint		stamp_checksum	= mach_read_from_4(
-		frame + FIL_PAGE_SPACE_OR_CHKSUM);
+	const byte*	frame = block->page.zip.data;
+	ulint		size = page_zip_get_size(&block->page.zip);
 
 	ut_ad(buf_block_get_zip_size(block));
 	ut_a(buf_block_get_space(block) != 0);
 
-	if (UNIV_LIKELY(check && stamp_checksum != BUF_NO_CHECKSUM_MAGIC)) {
-		ulint	calc_checksum	= page_zip_calc_checksum(
-			frame, page_zip_get_size(&block->page.zip));
+	if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
 
-		if (UNIV_UNLIKELY(stamp_checksum != calc_checksum)) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: compressed page checksum mismatch"
-				" (space %u page %u): %lu != %lu\n",
-				block->page.space, block->page.offset,
-				stamp_checksum, calc_checksum);
-			return(FALSE);
-		}
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: compressed page checksum mismatch"
+			" (space %u page %u): stored: %lu, crc32: %lu "
+			"innodb: %lu, none: %lu\n",
+			block->page.space, block->page.offset,
+			mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM),
+			page_zip_calc_checksum(frame, size,
+					       SRV_CHECKSUM_ALGORITHM_CRC32),
+			page_zip_calc_checksum(frame, size,
+					       SRV_CHECKSUM_ALGORITHM_INNODB),
+			page_zip_calc_checksum(frame, size,
+					       SRV_CHECKSUM_ALGORITHM_NONE));
+		return(FALSE);
 	}
 
 	switch (fil_page_get_type(frame)) {
@@ -2137,8 +2316,8 @@ buf_pointer_is_block_field_instance(
 	/* TODO: protect buf_pool->chunks with a mutex (it will
 	currently remain constant after buf_pool_init()) */
 	while (chunk < echunk) {
-		if (ptr >= (void *)chunk->blocks
-		    && ptr < (void *)(chunk->blocks + chunk->size)) {
+		if (ptr >= (void*) chunk->blocks
+		    && ptr < (void*) (chunk->blocks + chunk->size)) {
 
 			return(TRUE);
 		}
@@ -2185,14 +2364,12 @@ buf_block_is_uncompressed(
 	const buf_block_t*	block)		/*!< in: pointer to block,
 						not dereferenced */
 {
-	ut_ad(buf_pool_mutex_own(buf_pool));
-
 	if (UNIV_UNLIKELY((((ulint) block) % sizeof *block) != 0)) {
 		/* The pointer should be aligned. */
 		return(FALSE);
 	}
 
-	return(buf_pointer_is_block_field_instance(buf_pool, (void *)block));
+	return(buf_pointer_is_block_field_instance(buf_pool, (void*) block));
 }
 
 /********************************************************************//**
@@ -2220,6 +2397,9 @@ buf_page_get_gen(
 	unsigned	access_time;
 	ulint		fix_type;
 	ibool		must_read;
+	rw_lock_t*	hash_lock;
+	mutex_t*	block_mutex;
+	buf_page_t*	hash_bpage;
 	ulint		retries = 0;
 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
 
@@ -2252,10 +2432,11 @@ buf_page_get_gen(
 #endif
 	buf_pool->stat.n_page_gets++;
 	fold = buf_page_address_fold(space, offset);
+	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
 loop:
 	block = guess;
-	buf_pool_mutex_enter(buf_pool);
 
+	rw_lock_s_lock(hash_lock);
 	if (block) {
 		/* If the guess is a compressed page descriptor that
 		has been allocated by buf_page_alloc_descriptor(),
@@ -2266,6 +2447,8 @@ loop:
 		    || space != block->page.space
 		    || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
 
+			/* Our guess was bogus or things have changed
+			since. */
 			block = guess = NULL;
 		} else {
 			ut_ad(!block->page.in_zip_hash);
@@ -2278,8 +2461,8 @@ loop:
 			buf_pool, space, offset, fold);
 	}
 
-loop2:
-	if (block && buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
+	if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
+		rw_lock_s_unlock(hash_lock);
 		block = NULL;
 	}
 
@@ -2287,21 +2470,32 @@ loop2:
 		/* Page not in buf_pool: needs to be read from file */
 
 		if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+			rw_lock_x_lock(hash_lock);
 			block = (buf_block_t*) buf_pool_watch_set(
 				space, offset, fold);
 
 			if (UNIV_LIKELY_NULL(block)) {
-
+				/* We can release hash_lock after we
+				acquire block_mutex to make sure that
+				no state change takes place. */
+				block_mutex = buf_page_get_mutex(&block->page);
+				mutex_enter(block_mutex);
+
+				/* Now safe to release page_hash mutex */
+				rw_lock_x_unlock(hash_lock);
 				goto got_block;
 			}
-		}
 
-		buf_pool_mutex_exit(buf_pool);
+			rw_lock_x_unlock(hash_lock);
+		}
 
 		if (mode == BUF_GET_IF_IN_POOL
 		    || mode == BUF_PEEK_IF_IN_POOL
 		    || mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
-
+#ifdef UNIV_SYNC_DEBUG
+			ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+			ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
 			return(NULL);
 		}
 
@@ -2338,8 +2532,18 @@ loop2:
 		goto loop;
 	}
 
+
+	/* We can release hash_lock after we acquire block_mutex to
+	make sure that no state change takes place. */
+	block_mutex = buf_page_get_mutex(&block->page);
+	mutex_enter(block_mutex);
+
+	/* Now safe to release page_hash mutex */
+	rw_lock_s_unlock(hash_lock);
+
 got_block:
 	ut_ad(page_zip_get_size(&block->page.zip) == zip_size);
+	ut_ad(mutex_own(block_mutex));
 
 	must_read = buf_block_get_io_fix(block) == BUF_IO_READ;
 
@@ -2349,14 +2553,13 @@ got_block:
 		/* The page is being read to buffer pool,
 		but we cannot wait around for the read to
 		complete. */
-		buf_pool_mutex_exit(buf_pool);
+		mutex_exit(block_mutex);
 
 		return(NULL);
 	}
 
 	switch (buf_block_get_state(block)) {
 		buf_page_t*	bpage;
-		ibool		success;
 
 	case BUF_BLOCK_FILE_PAGE:
 		break;
@@ -2364,63 +2567,65 @@ got_block:
 	case BUF_BLOCK_ZIP_PAGE:
 	case BUF_BLOCK_ZIP_DIRTY:
 		bpage = &block->page;
-		/* Protect bpage->buf_fix_count. */
-		mutex_enter(&buf_pool->zip_mutex);
 
 		if (bpage->buf_fix_count
 		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
 			/* This condition often occurs when the buffer
 			is not buffer-fixed, but I/O-fixed by
 			buf_page_init_for_read(). */
-			mutex_exit(&buf_pool->zip_mutex);
+			mutex_exit(block_mutex);
 wait_until_unfixed:
 			/* The block is buffer-fixed or I/O-fixed.
 			Try again later. */
-			buf_pool_mutex_exit(buf_pool);
 			os_thread_sleep(WAIT_FOR_READ);
 
 			goto loop;
 		}
 
 		/* Allocate an uncompressed page. */
-		buf_pool_mutex_exit(buf_pool);
-		mutex_exit(&buf_pool->zip_mutex);
-
+		mutex_exit(block_mutex);
 		block = buf_LRU_get_free_block(buf_pool);
 		ut_a(block);
 
 		buf_pool_mutex_enter(buf_pool);
-		mutex_enter(&block->mutex);
-
-		{
-			buf_page_t*	hash_bpage;
 
-			hash_bpage = buf_page_hash_get_low(
-				buf_pool, space, offset, fold);
+		/* As we have released the page_hash lock and the
+		block_mutex to allocate an uncompressed page it is
+		possible that page_hash might have changed. We do
+		another lookup here while holding the hash_lock
+		to verify that bpage is indeed still a part of
+		page_hash. */
+		rw_lock_x_lock(hash_lock);
+		hash_bpage = buf_page_hash_get_low(buf_pool, space,
+						   offset, fold);
 
-			if (UNIV_UNLIKELY(bpage != hash_bpage)) {
-				/* The buf_pool->page_hash was modified
-				while buf_pool->mutex was released.
-				Free the block that was allocated. */
+		mutex_enter(&block->mutex);
+		if (UNIV_UNLIKELY(bpage != hash_bpage)) {
+			/* The buf_pool->page_hash was modified
+			while buf_pool->mutex was released.
+			Free the block that was allocated. */
 
-				buf_LRU_block_free_non_file_page(block);
-				mutex_exit(&block->mutex);
+			buf_LRU_block_free_non_file_page(block);
+			buf_pool_mutex_exit(buf_pool);
+			mutex_exit(&block->mutex);
+			rw_lock_x_unlock(hash_lock);
 
-				block = (buf_block_t*) hash_bpage;
-				goto loop2;
-			}
+			block = NULL;
+			goto loop;
 		}
 
 		if (UNIV_UNLIKELY
 		    (bpage->buf_fix_count
 		     || buf_page_get_io_fix(bpage) != BUF_IO_NONE)) {
 
+			rw_lock_x_unlock(hash_lock);
 			/* The block was buffer-fixed or I/O-fixed
 			while buf_pool->mutex was not held by this thread.
 			Free the block that was allocated and try again.
 			This should be extremely unlikely. */
 
 			buf_LRU_block_free_non_file_page(block);
+			buf_pool_mutex_exit(buf_pool);
 			mutex_exit(&block->mutex);
 
 			goto wait_until_unfixed;
@@ -2436,7 +2641,7 @@ wait_until_unfixed:
 		block->lock_hash_val = lock_rec_hash(space, offset);
 
 		UNIV_MEM_DESC(&block->page.zip.data,
-			      page_zip_get_size(&block->page.zip), block);
+			      page_zip_get_size(&block->page.zip));
 
 		if (buf_page_get_state(&block->page)
 		    == BUF_BLOCK_ZIP_PAGE) {
@@ -2461,10 +2666,11 @@ wait_until_unfixed:
 
 		block->page.buf_fix_count = 1;
 		buf_block_set_io_fix(block, BUF_IO_READ);
-		rw_lock_x_lock_inline(&block->lock, 0, file, line);
+		rw_lock_x_lock_func(&block->lock, 0, file, line);
 
 		UNIV_MEM_INVALID(bpage, sizeof *bpage);
 
+		rw_lock_x_unlock(hash_lock);
 		mutex_exit(&block->mutex);
 		mutex_exit(&buf_pool->zip_mutex);
 		buf_pool->n_pend_unzip++;
@@ -2475,8 +2681,8 @@ wait_until_unfixed:
 
 		/* Decompress the page and apply buffered operations
 		while not holding buf_pool->mutex or block->mutex. */
-		success = buf_zip_decompress(block, srv_use_checksums);
-		ut_a(success);
+
+		ut_a(buf_zip_decompress(block, TRUE));
 
 		if (UNIV_LIKELY(!recv_no_ibuf_operations)) {
 			ibuf_merge_or_delete_for_page(block, space, offset,
@@ -2488,8 +2694,8 @@ wait_until_unfixed:
 		mutex_enter(&block->mutex);
 		block->page.buf_fix_count--;
 		buf_block_set_io_fix(block, BUF_IO_NONE);
-		mutex_exit(&block->mutex);
 		buf_pool->n_pend_unzip--;
+		buf_pool_mutex_exit(buf_pool);
 		rw_lock_x_unlock(&block->lock);
 
 		break;
@@ -2503,9 +2709,13 @@ wait_until_unfixed:
 		break;
 	}
 
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
 
-	mutex_enter(&block->mutex);
 #if UNIV_WORD_SIZE == 4
 	/* On 32-bit systems, there is no padding in buf_page_t.  On
 	other systems, Valgrind could complain about uninitialized pad
@@ -2518,29 +2728,59 @@ wait_until_unfixed:
 		/* Try to evict the block from the buffer pool, to use the
 		insert buffer (change buffer) as much as possible. */
 
+		/* To obey the latching order, release the
+		block->mutex before acquiring buf_pool->mutex. Protect
+		the block from changes by temporarily buffer-fixing it
+		for the time we are not holding block->mutex. */
+		buf_block_buf_fix_inc(block, file, line);
+		mutex_exit(&block->mutex);
+		buf_pool_mutex_enter(buf_pool);
+		mutex_enter(&block->mutex);
+		buf_block_buf_fix_dec(block);
+		mutex_exit(&block->mutex);
+
+		/* Now we are only holding the buf_pool->mutex,
+		not block->mutex or hash_lock. Blocks cannot be
+		relocated or enter or exit the buf_pool while we
+		are holding the buf_pool->mutex. */
+
 		if (buf_LRU_free_block(&block->page, TRUE)) {
-			mutex_exit(&block->mutex);
+			buf_pool_mutex_exit(buf_pool);
+			rw_lock_x_lock(hash_lock);
+
 			if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
 				/* Set the watch, as it would have
 				been set if the page were not in the
 				buffer pool in the first place. */
 				block = (buf_block_t*) buf_pool_watch_set(
 					space, offset, fold);
+			} else {
+				block = (buf_block_t*) buf_page_hash_get_low(
+					buf_pool, space, offset, fold);
+			}
 
-				if (UNIV_LIKELY_NULL(block)) {
+			if (UNIV_LIKELY_NULL(block)) {
+				block_mutex = buf_page_get_mutex(
+					&block->page);
+				/* The page entered the buffer
+				pool for some reason. Try to
+				evict it again. */
+				mutex_enter(block_mutex);
+				rw_lock_x_unlock(hash_lock);
 
-					/* The page entered the buffer
-					pool for some reason. Try to
-					evict it again. */
-					goto got_block;
-				}
+				goto got_block;
 			}
-			buf_pool_mutex_exit(buf_pool);
+
+			rw_lock_x_unlock(hash_lock);
 			fprintf(stderr,
 				"innodb_change_buffering_debug evict %u %u\n",
 				(unsigned) space, (unsigned) offset);
 			return(NULL);
-		} else if (buf_flush_page_try(buf_pool, block)) {
+		}
+
+		mutex_enter(&block->mutex);
+
+		if (buf_flush_page_try(buf_pool, block)) {
 			fprintf(stderr,
 				"innodb_change_buffering_debug flush %u %u\n",
 				(unsigned) space, (unsigned) offset);
@@ -2549,6 +2789,8 @@ wait_until_unfixed:
 		}
 
 		/* Failed to evict the page; change it directly */
+
+		buf_pool_mutex_exit(buf_pool);
 	}
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 
@@ -2563,8 +2805,6 @@ wait_until_unfixed:
 
 	access_time = buf_page_is_accessed(&block->page);
 
-	buf_pool_mutex_exit(buf_pool);
-
 	if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL)) {
 		buf_page_set_accessed_make_young(&block->page, access_time);
 	}
@@ -2601,14 +2841,14 @@ wait_until_unfixed:
 		break;
 
 	case RW_S_LATCH:
-		rw_lock_s_lock_inline(&(block->lock), 0, file, line);
+		rw_lock_s_lock_func(&(block->lock), 0, file, line);
 
 		fix_type = MTR_MEMO_PAGE_S_FIX;
 		break;
 
 	default:
 		ut_ad(rw_latch == RW_X_LATCH);
-		rw_lock_x_lock_inline(&(block->lock), 0, file, line);
+		rw_lock_x_lock_func(&(block->lock), 0, file, line);
 
 		fix_type = MTR_MEMO_PAGE_X_FIX;
 		break;
@@ -2628,6 +2868,10 @@ wait_until_unfixed:
 	ut_a(ibuf_count_get(buf_block_get_space(block),
 			    buf_block_get_page_no(block)) == 0);
 #endif
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
 	return(block);
 }
 
@@ -2641,8 +2885,7 @@ buf_page_optimistic_get(
 /*====================*/
 	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
 	buf_block_t*	block,	/*!< in: guessed buffer block */
-	ib_uint64_t	modify_clock,/*!< in: modify clock value if mode is
-				..._GUESS_ON_CLOCK */
+	ib_uint64_t	modify_clock,/*!< in: modify clock value */
 	const char*	file,	/*!< in: file name */
 	ulint		line,	/*!< in: line where called */
 	mtr_t*		mtr)	/*!< in: mini-transaction */
@@ -2688,8 +2931,8 @@ buf_page_optimistic_get(
 						file, line);
 		fix_type = MTR_MEMO_PAGE_S_FIX;
 	} else {
-		success = rw_lock_x_lock_func_nowait_inline(&(block->lock),
-							    file, line);
+		success = rw_lock_x_lock_func_nowait(&(block->lock),
+						     file, line);
 		fix_type = MTR_MEMO_PAGE_X_FIX;
 	}
 
@@ -2726,8 +2969,11 @@ buf_page_optimistic_get(
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
-	ut_a(block->page.file_page_was_freed == FALSE);
+	mutex_enter(&block->mutex);
+	ut_a(!block->page.file_page_was_freed);
+	mutex_exit(&block->mutex);
 #endif
+
 	if (UNIV_UNLIKELY(!access_time)) {
 		/* In the case of a first access, try to apply linear
 		read-ahead */
@@ -2818,8 +3064,8 @@ buf_page_get_known_nowait(
 						file, line);
 		fix_type = MTR_MEMO_PAGE_S_FIX;
 	} else {
-		success = rw_lock_x_lock_func_nowait_inline(&(block->lock),
-							    file, line);
+		success = rw_lock_x_lock_func_nowait(&(block->lock),
+						     file, line);
 		fix_type = MTR_MEMO_PAGE_X_FIX;
 	}
 
@@ -2839,7 +3085,18 @@ buf_page_get_known_nowait(
 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
-	ut_a(mode == BUF_KEEP_OLD || !block->page.file_page_was_freed);
+	if (mode != BUF_KEEP_OLD) {
+		/* If mode == BUF_KEEP_OLD, we are executing an I/O
+		completion routine.  Avoid a bogus assertion failure
+		when ibuf_merge_or_delete_for_page() is processing a
+		page that was just freed due to DROP INDEX, or
+		deleting a record from SYS_INDEXES. This check will be
+		skipped in recv_recover_page() as well. */
+
+		mutex_enter(&block->mutex);
+		ut_a(!block->page.file_page_was_freed);
+		mutex_exit(&block->mutex);
+	}
 #endif
 
 #ifdef UNIV_IBUF_COUNT_DEBUG
@@ -2855,7 +3112,7 @@ buf_page_get_known_nowait(
 /*******************************************************************//**
 Given a tablespace id and page number tries to get that page. If the
 page is not in the buffer pool it is not loaded and NULL is returned.
-Suitable for using when holding the kernel mutex.
+Suitable for using when holding the lock_sys_t::mutex.
 @return	pointer to a page or NULL */
 UNIV_INTERN
 const buf_block_t*
@@ -2871,22 +3128,25 @@ buf_page_try_get_func(
 	ibool		success;
 	ulint		fix_type;
 	buf_pool_t*	buf_pool = buf_pool_get(space_id, page_no);
+	rw_lock_t*	hash_lock;
 
 	ut_ad(mtr);
 	ut_ad(mtr->state == MTR_ACTIVE);
 
-	buf_pool_mutex_enter(buf_pool);
-	block = buf_block_hash_get(buf_pool, space_id, page_no);
+	block = buf_block_hash_get_s_locked(buf_pool, space_id,
+					    page_no, &hash_lock);
 
 	if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
-		buf_pool_mutex_exit(buf_pool);
+		if (block) {
+			rw_lock_s_unlock(hash_lock);
+		}
 		return(NULL);
 	}
 
 	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
 
 	mutex_enter(&block->mutex);
-	buf_pool_mutex_exit(buf_pool);
+	rw_lock_s_unlock(hash_lock);
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
@@ -2906,8 +3166,8 @@ buf_page_try_get_func(
 		S-latch. */
 
 		fix_type = MTR_MEMO_PAGE_X_FIX;
-		success = rw_lock_x_lock_func_nowait_inline(&block->lock,
-							    file, line);
+		success = rw_lock_x_lock_func_nowait(&block->lock,
+						     file, line);
 	}
 
 	if (!success) {
@@ -2925,7 +3185,9 @@ buf_page_try_get_func(
 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
-	ut_a(block->page.file_page_was_freed == FALSE);
+	mutex_enter(&block->mutex);
+	ut_a(!block->page.file_page_was_freed);
+	mutex_exit(&block->mutex);
 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
 	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
 
@@ -2977,9 +3239,15 @@ buf_page_init(
 
 	ut_ad(buf_pool == buf_pool_get(space, offset));
 	ut_ad(buf_pool_mutex_own(buf_pool));
+
 	ut_ad(mutex_own(&(block->mutex)));
 	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
 
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, fold),
+			  RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
 	/* Set the state of the block */
 	buf_block_set_file_page(block, space, offset);
 
@@ -3063,6 +3331,7 @@ buf_page_init_for_read(
 	buf_block_t*	block;
 	buf_page_t*	bpage	= NULL;
 	buf_page_t*	watch_page;
+	rw_lock_t*	hash_lock;
 	mtr_t		mtr;
 	ulint		fold;
 	ibool		lru	= FALSE;
@@ -3091,8 +3360,7 @@ buf_page_init_for_read(
 		ut_ad(mode == BUF_READ_ANY_PAGE);
 	}
 
-	if (zip_size && UNIV_LIKELY(!unzip)
-	    && UNIV_LIKELY(!recv_recovery_is_on())) {
+	if (zip_size && !unzip && !recv_recovery_is_on()) {
 		block = NULL;
 	} else {
 		block = buf_LRU_get_free_block(buf_pool);
@@ -3101,14 +3369,17 @@ buf_page_init_for_read(
 	}
 
 	fold = buf_page_address_fold(space, offset);
+	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
 
 	buf_pool_mutex_enter(buf_pool);
+	rw_lock_x_lock(hash_lock);
 
 	watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
 	if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) {
 		/* The page is already in the buffer pool. */
 		watch_page = NULL;
 err_exit:
+		rw_lock_x_unlock(hash_lock);
 		if (block) {
 			mutex_enter(&block->mutex);
 			buf_LRU_block_free_non_file_page(block);
@@ -3130,11 +3401,13 @@ err_exit:
 
 	if (block) {
 		bpage = &block->page;
+
 		mutex_enter(&block->mutex);
 
 		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
 
 		buf_page_init(buf_pool, space, offset, fold, block);
+		rw_lock_x_unlock(hash_lock);
 
 		/* The block must be put to the LRU list, to the old blocks */
 		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
@@ -3151,7 +3424,7 @@ err_exit:
 		rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
 		buf_page_set_io_fix(bpage, BUF_IO_READ);
 
-		if (UNIV_UNLIKELY(zip_size)) {
+		if (zip_size) {
 			page_zip_set_size(&block->page.zip, zip_size);
 
 			/* buf_pool->mutex may be released and
@@ -3165,7 +3438,7 @@ err_exit:
 			mutex_exit(&block->mutex);
 			data = buf_buddy_alloc(buf_pool, zip_size, &lru);
 			mutex_enter(&block->mutex);
-			block->page.zip.data = data;
+			block->page.zip.data = (page_zip_t*) data;
 
 			/* To maintain the invariant
 			block->in_unzip_LRU_list
@@ -3178,12 +3451,16 @@ err_exit:
 
 		mutex_exit(&block->mutex);
 	} else {
+		rw_lock_x_unlock(hash_lock);
+
 		/* The compressed page must be allocated before the
 		control block (bpage), in order to avoid the
 		invocation of buf_buddy_relocate_block() on
 		uninitialized data. */
 		data = buf_buddy_alloc(buf_pool, zip_size, &lru);
 
+		rw_lock_x_lock(hash_lock);
+
 		/* If buf_buddy_alloc() allocated storage from the LRU list,
 		it released and reacquired buf_pool->mutex.  Thus, we must
 		check the page_hash again, as it may have been modified. */
@@ -3192,11 +3469,12 @@ err_exit:
 			watch_page = buf_page_hash_get_low(
 				buf_pool, space, offset, fold);
 
-			if (watch_page
+			if (UNIV_UNLIKELY(watch_page
 			    && !buf_pool_watch_is_sentinel(buf_pool,
-				   			   watch_page)) {
+							   watch_page))) {
 
 				/* The block was added by some other thread. */
+				rw_lock_x_unlock(hash_lock);
 				watch_page = NULL;
 				buf_buddy_free(buf_pool, data, zip_size);
 
@@ -3212,11 +3490,11 @@ err_exit:
 
 		page_zip_des_init(&bpage->zip);
 		page_zip_set_size(&bpage->zip, zip_size);
-		bpage->zip.data = data;
+		bpage->zip.data = (page_zip_t*) data;
 
 		mutex_enter(&buf_pool->zip_mutex);
 		UNIV_MEM_DESC(bpage->zip.data,
-			      page_zip_get_size(&bpage->zip), bpage);
+			      page_zip_get_size(&bpage->zip));
 
 		buf_page_init_low(bpage);
 
@@ -3235,6 +3513,7 @@ err_exit:
 		ut_d(bpage->in_page_hash = TRUE);
 
 		if (UNIV_LIKELY_NULL(watch_page)) {
+
 			/* Preserve the reference count. */
 			ulint	buf_fix_count = watch_page->buf_fix_count;
 			ut_a(buf_fix_count > 0);
@@ -3246,6 +3525,8 @@ err_exit:
 		HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold,
 			    bpage);
 
+		rw_lock_x_unlock(hash_lock);
+
 		/* The block must be put to the LRU list, to the old blocks */
 		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
@@ -3266,6 +3547,12 @@ func_exit:
 		ibuf_mtr_commit(&mtr);
 	}
 
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
 	ut_ad(!bpage || buf_page_in_file(bpage));
 	return(bpage);
 }
@@ -3291,7 +3578,8 @@ buf_page_create(
 	ulint		fold;
 	buf_block_t*	free_block	= NULL;
 	ulint		time_ms		= ut_time_ms();
-	buf_pool_t*	buf_pool 	= buf_pool_get(space, offset);
+	buf_pool_t*	buf_pool	= buf_pool_get(space, offset);
+	rw_lock_t*	hash_lock;
 
 	ut_ad(mtr);
 	ut_ad(mtr->state == MTR_ACTIVE);
@@ -3300,8 +3588,10 @@ buf_page_create(
 	free_block = buf_LRU_get_free_block(buf_pool);
 
 	fold = buf_page_address_fold(space, offset);
+	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
 
 	buf_pool_mutex_enter(buf_pool);
+	rw_lock_x_lock(hash_lock);
 
 	block = (buf_block_t*) buf_page_hash_get_low(
 		buf_pool, space, offset, fold);
@@ -3318,6 +3608,7 @@ buf_page_create(
 
 		/* Page can be found in buf_pool */
 		buf_pool_mutex_exit(buf_pool);
+		rw_lock_x_unlock(hash_lock);
 
 		buf_block_free(free_block);
 
@@ -3340,6 +3631,8 @@ buf_page_create(
 
 	buf_page_init(buf_pool, space, offset, fold, block);
 
+	rw_lock_x_unlock(hash_lock);
+
 	/* The block must be put to the LRU list */
 	buf_LRU_add_block(&block->page, FALSE);
 
@@ -3367,7 +3660,7 @@ buf_page_create(
 		has been added to buf_pool->LRU and buf_pool->page_hash. */
 		data = buf_buddy_alloc(buf_pool, zip_size, &lru);
 		mutex_enter(&block->mutex);
-		block->page.zip.data = data;
+		block->page.zip.data = (page_zip_t*) data;
 
 		/* To maintain the invariant
 		block->in_unzip_LRU_list
@@ -3394,9 +3687,6 @@ buf_page_create(
 
 	ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE);
 
-	/* Flush pages from the end of the LRU list if necessary */
-	buf_flush_free_margin(buf_pool);
-
 	frame = block->frame;
 
 	memset(frame + FIL_PAGE_PREV, 0xff, 4);
@@ -3422,6 +3712,112 @@ buf_page_create(
 }
 
 /********************************************************************//**
+Monitor the buffer page read/write activity, and increment corresponding
+counter value if MONITOR_MODULE_BUF_PAGE (module_buf_page) module is
+enabled. */
+static
+void
+buf_page_monitor(
+/*=============*/
+	const buf_page_t*	bpage,	/*!< in: pointer to the block */
+	enum buf_io_fix		io_type)/*!< in: io_fix types */
+{
+	const byte*	frame;
+	monitor_id_t	counter;
+
+	/* If the counter module is not turned on, just return */
+	if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) {
+		return;
+	}
+
+	ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+	frame = bpage->zip.data
+		? bpage->zip.data
+		: ((buf_block_t*) bpage)->frame;
+
+	switch (fil_page_get_type(frame)) {
+		ulint	level;
+
+	case FIL_PAGE_INDEX:
+		level = btr_page_get_level_low(frame);
+
+		/* Check if it is an index page for insert buffer */
+		if (btr_page_get_index_id(frame)
+		    == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
+			if (level == 0) {
+				counter = MONITOR_RW_COUNTER(
+					io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
+			} else {
+				counter = MONITOR_RW_COUNTER(
+					io_type,
+					MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
+			}
+		} else {
+			if (level == 0) {
+				counter = MONITOR_RW_COUNTER(
+					io_type, MONITOR_INDEX_LEAF_PAGE);
+			} else {
+				counter = MONITOR_RW_COUNTER(
+					io_type, MONITOR_INDEX_NON_LEAF_PAGE);
+			}
+		}
+		break;
+
+        case FIL_PAGE_UNDO_LOG:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
+		break;
+
+        case FIL_PAGE_INODE:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
+		break;
+
+        case FIL_PAGE_IBUF_FREE_LIST:
+		counter = MONITOR_RW_COUNTER(io_type,
+					     MONITOR_IBUF_FREELIST_PAGE);
+		break;
+
+        case FIL_PAGE_IBUF_BITMAP:
+		counter = MONITOR_RW_COUNTER(io_type,
+					     MONITOR_IBUF_BITMAP_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_SYS:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_TRX_SYS:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_FSP_HDR:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_XDES:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_BLOB:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_ZBLOB:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
+		break;
+
+        case FIL_PAGE_TYPE_ZBLOB2:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
+		break;
+
+	default:
+		counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
+	}
+
+	MONITOR_INC_NOCHECK(counter);
+}
+
+/********************************************************************//**
 Mark a table with the specified space pointed by bpage->space corrupted.
 Also remove the bpage from LRU list.
 @return TRUE if successful */
@@ -3522,7 +3918,7 @@ buf_page_io_complete(
 			frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
 
 		if (bpage->space == TRX_SYS_SPACE
-		    && trx_doublewrite_page_inside(bpage->offset)) {
+		    && buf_dblwr_page_inside(bpage->offset)) {
 
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
@@ -3678,6 +4074,8 @@ corrupt:
 		ut_error;
 	}
 
+	buf_page_monitor(bpage, io_type);
+
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints) {
 		fprintf(stderr, "Has %s page space %lu page no %lu\n",
@@ -3735,8 +4133,7 @@ buf_pool_invalidate_instance(
 /*=========================*/
 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
 {
-	ibool		freed;
-	enum buf_flush	i;
+	ulint		i;
 
 	buf_pool_mutex_enter(buf_pool);
 
@@ -3754,8 +4151,10 @@ buf_pool_invalidate_instance(
 		pool invalidation to proceed we must ensure there is NO
 		write activity happening. */
 		if (buf_pool->n_flush[i] > 0) {
+			enum buf_flush	type = static_cast<enum buf_flush>(i);
+
 			buf_pool_mutex_exit(buf_pool);
-			buf_flush_wait_batch_end(buf_pool, i);
+			buf_flush_wait_batch_end(buf_pool, type);
 			buf_pool_mutex_enter(buf_pool);
 		}
 	}
@@ -3764,21 +4163,17 @@ buf_pool_invalidate_instance(
 
 	ut_ad(buf_all_freed_instance(buf_pool));
 
-	freed = TRUE;
+	buf_pool_mutex_enter(buf_pool);
 
-	while (freed) {
-		freed = buf_LRU_search_and_free_block(buf_pool, 100);
+	while (buf_LRU_scan_and_free_block(buf_pool, TRUE)) {
 	}
 
-	buf_pool_mutex_enter(buf_pool);
-
 	ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
 	ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
 
 	buf_pool->freed_page_clock = 0;
 	buf_pool->LRU_old = NULL;
 	buf_pool->LRU_old_len = 0;
-	buf_pool->LRU_flush_ended = 0;
 
 	memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
 	buf_refresh_io_stats(buf_pool);
@@ -3815,17 +4210,21 @@ buf_pool_validate_instance(
 	buf_page_t*	b;
 	buf_chunk_t*	chunk;
 	ulint		i;
-	ulint		n_single_flush	= 0;
 	ulint		n_lru_flush	= 0;
+	ulint		n_page_flush	= 0;
 	ulint		n_list_flush	= 0;
 	ulint		n_lru		= 0;
 	ulint		n_flush		= 0;
 	ulint		n_free		= 0;
 	ulint		n_zip		= 0;
+	ulint		fold		= 0;
+	ulint		space		= 0;
+	ulint		offset		= 0;
 
 	ut_ad(buf_pool);
 
 	buf_pool_mutex_enter(buf_pool);
+	hash_lock_x_all(buf_pool->page_hash);
 
 	chunk = buf_pool->chunks;
 
@@ -3850,11 +4249,13 @@ buf_pool_validate_instance(
 				break;
 
 			case BUF_BLOCK_FILE_PAGE:
-				ut_a(buf_page_hash_get(buf_pool,
-						       buf_block_get_space(
-							       block),
-						       buf_block_get_page_no(
-							       block))
+				space = buf_block_get_space(block);
+				offset = buf_block_get_page_no(block);
+				fold = buf_page_address_fold(space, offset);
+				ut_a(buf_page_hash_get_low(buf_pool,
+							   space,
+							   offset,
+							   fold)
 				     == &block->page);
 
 #ifdef UNIV_IBUF_COUNT_DEBUG
@@ -3874,16 +4275,17 @@ buf_pool_validate_instance(
 							&block->page)) {
 					case BUF_FLUSH_LRU:
 						n_lru_flush++;
+						goto assert_s_latched;
+					case BUF_FLUSH_SINGLE_PAGE:
+						n_page_flush++;
+assert_s_latched:
 						ut_a(rw_lock_is_locked(
 							     &block->lock,
-							     RW_LOCK_SHARED));
+								     RW_LOCK_SHARED));
 						break;
 					case BUF_FLUSH_LIST:
 						n_list_flush++;
 						break;
-					case BUF_FLUSH_SINGLE_PAGE:
-						n_single_flush++;
-						break;
 					default:
 						ut_error;
 					}
@@ -3945,8 +4347,9 @@ buf_pool_validate_instance(
 		we have acquired buf_pool->zip_mutex above which acts
 		as the 'block->mutex' for these bpages. */
 		ut_a(!b->oldest_modification);
-		ut_a(buf_page_hash_get(buf_pool, b->space, b->offset) == b);
-
+		fold = buf_page_address_fold(b->space, b->offset);
+		ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
+					   fold) == b);
 		n_lru++;
 		n_zip++;
 	}
@@ -3974,12 +4377,12 @@ buf_pool_validate_instance(
 				case BUF_FLUSH_LRU:
 					n_lru_flush++;
 					break;
+				case BUF_FLUSH_SINGLE_PAGE:
+					n_page_flush++;
+					break;
 				case BUF_FLUSH_LIST:
 					n_list_flush++;
 					break;
-				case BUF_FLUSH_SINGLE_PAGE:
-					n_single_flush++;
-					break;
 				default:
 					ut_error;
 				}
@@ -3998,11 +4401,14 @@ buf_pool_validate_instance(
 			ut_error;
 			break;
 		}
-		ut_a(buf_page_hash_get(buf_pool, b->space, b->offset) == b);
+		fold = buf_page_address_fold(b->space, b->offset);
+		ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
+					   fold) == b);
 	}
 
 	ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
 
+	hash_unlock_x_all(buf_pool->page_hash);
 	buf_flush_list_mutex_exit(buf_pool);
 
 	mutex_exit(&buf_pool->zip_mutex);
@@ -4022,9 +4428,9 @@ buf_pool_validate_instance(
 		ut_error;
 	}
 
-	ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
 	ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
 	ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
+	ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush);
 
 	buf_pool_mutex_exit(buf_pool);
 
@@ -4079,8 +4485,10 @@ buf_print_instance(
 
 	size = buf_pool->curr_size;
 
-	index_ids = mem_alloc(size * sizeof *index_ids);
-	counts = mem_alloc(sizeof(ulint) * size);
+	index_ids = static_cast<index_id_t*>(
+		mem_alloc(size * sizeof *index_ids));
+
+	counts = static_cast<ulint*>(mem_alloc(sizeof(ulint) * size));
 
 	buf_pool_mutex_enter(buf_pool);
 	buf_flush_list_mutex_enter(buf_pool);
@@ -4311,26 +4719,18 @@ buf_get_latched_pages_number(void)
 #endif /* UNIV_DEBUG */
 
 /*********************************************************************//**
-Returns the number of pending buf pool ios.
-@return	number of pending I/O operations */
+Returns the number of pending buf pool read ios.
+@return	number of pending read I/O operations */
 UNIV_INTERN
 ulint
-buf_get_n_pending_ios(void)
-/*=======================*/
+buf_get_n_pending_read_ios(void)
+/*============================*/
 {
 	ulint	i;
 	ulint	pend_ios = 0;
 
 	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
-
-		buf_pool = buf_pool_from_array(i);
-
-		pend_ios +=
-			buf_pool->n_pend_reads
-			+ buf_pool->n_flush[BUF_FLUSH_LRU]
-			+ buf_pool->n_flush[BUF_FLUSH_LIST]
-			+ buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+		pend_ios += buf_pool_from_array(i)->n_pend_reads;
 	}
 
 	return(pend_ios);
@@ -4387,8 +4787,6 @@ buf_stats_aggregate_pool_info(
 	total_info->n_pend_reads += pool_info->n_pend_reads;
 	total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru;
 	total_info->n_pending_flush_list += pool_info->n_pending_flush_list;
-	total_info->n_pending_flush_single_page +=
-		 pool_info->n_pending_flush_single_page;
 	total_info->n_pages_made_young += pool_info->n_pages_made_young;
 	total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young;
 	total_info->n_pages_read += pool_info->n_pages_read;
@@ -4421,7 +4819,7 @@ buf_stats_aggregate_pool_info(
 Collect buffer pool stats information for a buffer pool. Also
 record aggregated stats if there are more than one buffer pool
 in the server */
-static
+UNIV_INTERN
 void
 buf_stats_get_pool_info(
 /*====================*/
@@ -4465,7 +4863,8 @@ buf_stats_get_pool_info(
 		  + buf_pool->init_flush[BUF_FLUSH_LIST]);
 
 	pool_info->n_pending_flush_single_page =
-		 buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+		 (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
+		  + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]);
 
 	buf_flush_list_mutex_exit(buf_pool);
 
@@ -4571,7 +4970,7 @@ buf_print_io_instance(
 		"Old database pages %lu\n"
 		"Modified db pages  %lu\n"
 		"Pending reads %lu\n"
-		"Pending writes: LRU %lu, flush list %lu, single page %lu\n",
+		"Pending writes: LRU %lu, flush list %lu single page %lu\n",
 		pool_info->pool_size,
 		pool_info->free_list_len,
 		pool_info->lru_len,
@@ -4654,8 +5053,10 @@ buf_print_io(
 		pool_info_total = &pool_info[srv_buf_pool_instances];
 	} else {
 		ut_a(srv_buf_pool_instances == 1);
-		pool_info_total = pool_info = (buf_pool_info_t*) mem_zalloc(
-			sizeof *pool_info)
+
+		pool_info_total = pool_info =
+			static_cast<buf_pool_info_t*>(
+				mem_zalloc(sizeof *pool_info));
 	}
 
 	for (i = 0; i < srv_buf_pool_instances; i++) {
@@ -4742,7 +5143,7 @@ buf_all_freed(void)
 		if (!buf_all_freed_instance(buf_pool)) {
 			return(FALSE);
 		}
- 	}
+	}
 
 	return(TRUE);
 }
@@ -4753,8 +5154,8 @@ pool.
 @return	number of pending i/o */
 UNIV_INTERN
 ulint
-buf_pool_check_num_pending_io(void)
-/*===============================*/
+buf_pool_check_no_pending_io(void)
+/*==============================*/
 {
 	ulint		i;
 	ulint		pending_io = 0;
@@ -4768,8 +5169,8 @@ buf_pool_check_num_pending_io(void)
 
 		pending_io += buf_pool->n_pend_reads
 			      + buf_pool->n_flush[BUF_FLUSH_LRU]
-			      + buf_pool->n_flush[BUF_FLUSH_LIST]
-			      + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+			      + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
+			      + buf_pool->n_flush[BUF_FLUSH_LIST];
 
 	}
 
@@ -4822,7 +5223,7 @@ buf_page_init_for_backup_restore(
 
 	/* We assume that block->page.data has been allocated
 	with zip_size == UNIV_PAGE_SIZE. */
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 	ut_ad(ut_is_2pow(zip_size));
 	page_zip_set_size(&block->page.zip, zip_size);
 	if (zip_size) {
diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc
new file mode 100644
index 00000000000..ec79bbe6be9
--- /dev/null
+++ b/storage/innobase/buf/buf0checksum.cc
@@ -0,0 +1,155 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0checksum.cc
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "fil0fil.h" /* FIL_* */
+#include "ut0crc32.h" /* ut_crc32() */
+#include "ut0rnd.h" /* ut_fold_binary() */
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "srv0srv.h" /* SRV_CHECKSUM_* */
+#include "buf0types.h"
+
+/** the macro MYSQL_SYSVAR_ENUM() requires "long unsigned int" and if we
+use srv_checksum_algorithm_t here then we get a compiler error:
+ha_innodb.cc:12251: error: cannot convert 'srv_checksum_algorithm_t*' to
+  'long unsigned int*' in initialization */
+UNIV_INTERN ulong	srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_INNODB;
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Calculates a page CRC32 which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return	checksum */
+UNIV_INTERN
+ib_uint32_t
+buf_calc_page_crc32(
+/*================*/
+	const byte*	page)	/*!< in: buffer page */
+{
+	ib_uint32_t	checksum;
+
+	/* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+	FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+	to the first pages of data files, we have to skip them in the page
+	checksum calculation.
+	We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+	checksum is stored, and also the last 8 bytes of page because
+	there we store the old formula checksum. */
+
+	checksum = ut_crc32(page + FIL_PAGE_OFFSET,
+			    FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+		^ ut_crc32(page + FIL_PAGE_DATA,
+			   UNIV_PAGE_SIZE - FIL_PAGE_DATA
+			   - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+	return(checksum);
+}
+
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+	const byte*	page)	/*!< in: buffer page */
+{
+	ulint checksum;
+
+	/* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+	FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+	to the first pages of data files, we have to skip them in the page
+	checksum calculation.
+	We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+	checksum is stored, and also the last 8 bytes of page because
+	there we store the old formula checksum. */
+
+	checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
+				  FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+		+ ut_fold_binary(page + FIL_PAGE_DATA,
+				 UNIV_PAGE_SIZE - FIL_PAGE_DATA
+				 - FIL_PAGE_END_LSN_OLD_CHKSUM);
+	checksum = checksum & 0xFFFFFFFFUL;
+
+	return(checksum);
+}
+
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
+looked at the first few bytes of the page. This calculates that old
+checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+	const byte*	page)	/*!< in: buffer page */
+{
+	ulint checksum;
+
+	checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+
+	checksum = checksum & 0xFFFFFFFFUL;
+
+	return(checksum);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************************//**
+Return a printable string describing the checksum algorithm.
+@return	algorithm name */
+UNIV_INTERN
+const char*
+buf_checksum_algorithm_name(
+/*========================*/
+	srv_checksum_algorithm_t	algo)	/*!< in: algorithm */
+{
+	switch (algo) {
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+		return("crc32");
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+		return("innodb");
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		return("none");
+	}
+
+	ut_error;
+	return(NULL);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
new file mode 100644
index 00000000000..ad6ef7c4cef
--- /dev/null
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -0,0 +1,1086 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dblwr.cc
+Doublwrite buffer module
+
+Created 2011/12/19
+*******************************************************/
+
+#include "buf0dblwr.h"
+
+#include "buf0buf.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "page0page.h"
+#include "mtr0log.h"
+
+#ifndef UNIV_HOTBACKUP
+
+/** Time in milliseconds that we sleep when unable to find a slot in
+the doublewrite buffer or when we have to wait for a running batch
+to end. */
+#define TRX_DOUBLEWRITE_BATCH_POLL_DELAY	10000
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	buf_dblwr_mutex_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+/** The doublewrite buffer */
+UNIV_INTERN buf_dblwr_t*	buf_dblwr = NULL;
+
+/** Set to TRUE when the doublewrite buffer is being created */
+UNIV_INTERN ibool	buf_dblwr_being_created = FALSE;
+
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+buf_dblwr_page_inside(
+/*==================*/
+	ulint	page_no)	/*!< in: page number */
+{
+	if (buf_dblwr == NULL) {
+
+		return(FALSE);
+	}
+
+	if (page_no >= buf_dblwr->block1
+	    && page_no < buf_dblwr->block1
+	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		return(TRUE);
+	}
+
+	if (page_no >= buf_dblwr->block2
+	    && page_no < buf_dblwr->block2
+	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/****************************************************************//**
+Calls buf_page_get() on the TRX_SYS_PAGE and returns a pointer to the
+doublewrite buffer within it.
+@return	pointer to the doublewrite buffer within the filespace header
+page. */
+UNIV_INLINE
+byte*
+buf_dblwr_get(
+/*==========*/
+	mtr_t*	mtr)	/*!< in/out: MTR to hold the page latch */
+{
+	buf_block_t*	block;
+
+	block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+			     RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+	return(buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE);
+}
+
+
+/****************************************************************//**
+Creates or initialializes the doublewrite buffer at a database start. */
+static
+void
+buf_dblwr_init(
+/*===========*/
+	byte*	doublewrite)	/*!< in: pointer to the doublewrite buf
+				header on trx sys page */
+{
+	ulint	buf_size;
+
+	buf_dblwr = static_cast<buf_dblwr_t*>(
+		mem_zalloc(sizeof(buf_dblwr_t)));
+
+	/* There are two blocks of same size in the doublewrite
+	buffer. */
+	buf_size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+
+	/* There must be atleast one buffer for single page writes
+	and one buffer for batch writes. */
+	ut_a(srv_doublewrite_batch_size > 0
+	     && srv_doublewrite_batch_size < buf_size);
+
+	mutex_create(buf_dblwr_mutex_key,
+		     &buf_dblwr->mutex, SYNC_DOUBLEWRITE);
+
+	buf_dblwr->first_free = 0;
+	buf_dblwr->s_reserved = 0;
+	buf_dblwr->b_reserved = 0;
+
+	buf_dblwr->block1 = mach_read_from_4(
+		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
+	buf_dblwr->block2 = mach_read_from_4(
+		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
+
+	buf_dblwr->in_use = static_cast<ibool*>(
+		mem_zalloc(buf_size * sizeof(ibool)));
+
+	buf_dblwr->write_buf_unaligned = static_cast<byte*>(
+		ut_malloc((1 + buf_size) * UNIV_PAGE_SIZE));
+
+	buf_dblwr->write_buf = static_cast<byte*>(
+		ut_align(buf_dblwr->write_buf_unaligned,
+			 UNIV_PAGE_SIZE));
+
+	buf_dblwr->buf_block_arr = static_cast<buf_page_t**>(
+		mem_zalloc(buf_size * sizeof(void*)));
+}
+
+/****************************************************************//**
+Creates the doublewrite buffer to a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+buf_dblwr_create(void)
+/*==================*/
+{
+	buf_block_t*	block2;
+	buf_block_t*	new_block;
+	byte*	doublewrite;
+	byte*	fseg_header;
+	ulint	page_no;
+	ulint	prev_page_no;
+	ulint	i;
+	mtr_t	mtr;
+
+	if (buf_dblwr) {
+		/* Already inited */
+
+		return;
+	}
+
+start_again:
+	mtr_start(&mtr);
+	buf_dblwr_being_created = TRUE;
+
+	doublewrite = buf_dblwr_get(&mtr);
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+		/* The doublewrite buffer has already been created:
+		just read in some numbers */
+
+		buf_dblwr_init(doublewrite);
+
+		mtr_commit(&mtr);
+		buf_dblwr_being_created = FALSE;
+		return;
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: Doublewrite buffer not found:"
+		" creating new\n");
+
+	if (buf_pool_get_curr_size()
+	    < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+		+ FSP_EXTENT_SIZE / 2 + 100)
+	       * UNIV_PAGE_SIZE)) {
+		fprintf(stderr,
+			"InnoDB: Cannot create doublewrite buffer:"
+			" you must\n"
+			"InnoDB: increase your buffer pool size.\n"
+			"InnoDB: Cannot continue operation.\n");
+
+		exit(1);
+	}
+
+	block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
+			     TRX_SYS_DOUBLEWRITE
+			     + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+	/* fseg_create acquires a second latch on the page,
+	therefore we must declare it: */
+
+	buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
+
+	if (block2 == NULL) {
+		fprintf(stderr,
+			"InnoDB: Cannot create doublewrite buffer:"
+			" you must\n"
+			"InnoDB: increase your tablespace size.\n"
+			"InnoDB: Cannot continue operation.\n");
+
+		/* We exit without committing the mtr to prevent
+		its modifications to the database getting to disk */
+
+		exit(1);
+	}
+
+	fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG;
+	prev_page_no = 0;
+
+	for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+		     + FSP_EXTENT_SIZE / 2; i++) {
+		new_block = fseg_alloc_free_page(
+			fseg_header, prev_page_no + 1, FSP_UP, &mtr);
+		if (new_block == NULL) {
+			fprintf(stderr,
+				"InnoDB: Cannot create doublewrite"
+				" buffer: you must\n"
+				"InnoDB: increase your"
+				" tablespace size.\n"
+				"InnoDB: Cannot continue operation.\n"
+				);
+
+			exit(1);
+		}
+
+		/* We read the allocated pages to the buffer pool;
+		when they are written to disk in a flush, the space
+		id and page number fields are also written to the
+		pages. When we at database startup read pages
+		from the doublewrite buffer, we know that if the
+		space id and page number in them are the same as
+		the page position in the tablespace, then the page
+		has not been written to in doublewrite. */
+
+		ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+		page_no = buf_block_get_page_no(new_block);
+
+		if (i == FSP_EXTENT_SIZE / 2) {
+			ut_a(page_no == FSP_EXTENT_SIZE);
+			mlog_write_ulint(doublewrite
+					 + TRX_SYS_DOUBLEWRITE_BLOCK1,
+					 page_no, MLOG_4BYTES, &mtr);
+			mlog_write_ulint(doublewrite
+					 + TRX_SYS_DOUBLEWRITE_REPEAT
+					 + TRX_SYS_DOUBLEWRITE_BLOCK1,
+					 page_no, MLOG_4BYTES, &mtr);
+
+		} else if (i == FSP_EXTENT_SIZE / 2
+			   + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+			ut_a(page_no == 2 * FSP_EXTENT_SIZE);
+			mlog_write_ulint(doublewrite
+					 + TRX_SYS_DOUBLEWRITE_BLOCK2,
+					 page_no, MLOG_4BYTES, &mtr);
+			mlog_write_ulint(doublewrite
+					 + TRX_SYS_DOUBLEWRITE_REPEAT
+					 + TRX_SYS_DOUBLEWRITE_BLOCK2,
+					 page_no, MLOG_4BYTES, &mtr);
+
+		} else if (i > FSP_EXTENT_SIZE / 2) {
+			ut_a(page_no == prev_page_no + 1);
+		}
+
+		if (((i + 1) & 15) == 0) {
+			/* rw_locks can only be recursively x-locked
+			2048 times. (on 32 bit platforms,
+			(lint) 0 - (X_LOCK_DECR * 2049)
+			is no longer a negative number, and thus
+			lock_word becomes like a shared lock).
+			For 4k page size this loop will
+			lock the fseg header too many times. Since
+			this code is not done while any other threads
+			are active, restart the MTR occasionally. */
+			mtr_commit(&mtr);
+			mtr_start(&mtr);
+			doublewrite = buf_dblwr_get(&mtr);
+			fseg_header = doublewrite
+				      + TRX_SYS_DOUBLEWRITE_FSEG;
+		}
+
+		prev_page_no = page_no;
+	}
+
+	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+			 TRX_SYS_DOUBLEWRITE_MAGIC_N,
+			 MLOG_4BYTES, &mtr);
+	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+			 + TRX_SYS_DOUBLEWRITE_REPEAT,
+			 TRX_SYS_DOUBLEWRITE_MAGIC_N,
+			 MLOG_4BYTES, &mtr);
+
+	mlog_write_ulint(doublewrite
+			 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+			 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+			 MLOG_4BYTES, &mtr);
+	mtr_commit(&mtr);
+
+	/* Flush the modified pages to disk and make a checkpoint */
+	log_make_checkpoint_at(LSN_MAX, TRUE);
+
+	/* Remove doublewrite pages from LRU */
+	buf_pool_invalidate();
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: Doublewrite buffer created\n");
+
+	goto start_again;
+}
+
+/****************************************************************//**
+At a database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+buf_dblwr_init_or_restore_pages(
+/*============================*/
+	ibool	restore_corrupt_pages)	/*!< in: TRUE=restore pages */
+{
+	byte*	buf;
+	byte*	read_buf;
+	byte*	unaligned_read_buf;
+	ulint	block1;
+	ulint	block2;
+	byte*	page;
+	ibool	reset_space_ids = FALSE;
+	byte*	doublewrite;
+	ulint	space_id;
+	ulint	page_no;
+	ulint	i;
+
+	/* We do the file i/o past the buffer pool */
+
+	unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+
+	read_buf = static_cast<byte*>(
+		ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
+
+	/* Read the trx sys header to check if we are using the doublewrite
+	buffer */
+
+	fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0,
+	       UNIV_PAGE_SIZE, read_buf, NULL);
+	doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+		/* The doublewrite buffer has been created */
+
+		buf_dblwr_init(doublewrite);
+
+		block1 = buf_dblwr->block1;
+		block2 = buf_dblwr->block2;
+
+		buf = buf_dblwr->write_buf;
+	} else {
+		goto leave_func;
+	}
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
+	!= TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
+
+		/* We are upgrading from a version < 4.1.x to a version where
+		multiple tablespaces are supported. We must reset the space id
+		field in the pages in the doublewrite buffer because starting
+		from this version the space id is stored to
+		FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+
+		reset_space_ids = TRUE;
+
+		fprintf(stderr,
+			"InnoDB: Resetting space id's in the"
+			" doublewrite buffer\n");
+	}
+
+	/* Read the pages from the doublewrite buffer to memory */
+
+	fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block1, 0,
+	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+	       buf, NULL);
+	fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block2, 0,
+	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+	       buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+	       NULL);
+	/* Check if any of these pages is half-written in data files, in the
+	intended position */
+
+	page = buf;
+
+	for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
+
+		ulint source_page_no;
+		page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+		if (reset_space_ids) {
+
+			space_id = 0;
+			mach_write_to_4(page
+					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
+			/* We do not need to calculate new checksums for the
+			pages because the field .._SPACE_ID does not affect
+			them. Write the page back to where we read it from. */
+
+			if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+				source_page_no = block1 + i;
+			} else {
+				source_page_no = block2
+					+ i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+			}
+
+			fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0,
+			       UNIV_PAGE_SIZE, page, NULL);
+		} else {
+
+			space_id = mach_read_from_4(
+				page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+		}
+
+		if (!restore_corrupt_pages) {
+			/* The database was shut down gracefully: no need to
+			restore pages */
+
+		} else if (!fil_tablespace_exists_in_mem(space_id)) {
+			/* Maybe we have dropped the single-table tablespace
+			and this page once belonged to it: do nothing */
+
+		} else if (!fil_check_adress_in_tablespace(space_id,
+							   page_no)) {
+			fprintf(stderr,
+				"InnoDB: Warning: a page in the"
+				" doublewrite buffer is not within space\n"
+				"InnoDB: bounds; space id %lu"
+				" page number %lu, page %lu in"
+				" doublewrite buf.\n",
+				(ulong) space_id, (ulong) page_no, (ulong) i);
+
+		} else if (space_id == TRX_SYS_SPACE
+			   && ((page_no >= block1
+				&& page_no
+				< block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+			       || (page_no >= block2
+				   && page_no
+				   < (block2
+				      + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
+
+			/* It is an unwritten doublewrite buffer page:
+			do nothing */
+		} else {
+			ulint	zip_size = fil_space_get_zip_size(space_id);
+
+			/* Read in the actual page from the file */
+			fil_io(OS_FILE_READ, TRUE, space_id, zip_size,
+			       page_no, 0,
+			       zip_size ? zip_size : UNIV_PAGE_SIZE,
+			       read_buf, NULL);
+
+			/* Check if the page is corrupt */
+
+			if (UNIV_UNLIKELY
+			    (buf_page_is_corrupted(read_buf, zip_size))) {
+
+				fprintf(stderr,
+					"InnoDB: Warning: database page"
+					" corruption or a failed\n"
+					"InnoDB: file read of"
+					" space %lu page %lu.\n"
+					"InnoDB: Trying to recover it from"
+					" the doublewrite buffer.\n",
+					(ulong) space_id, (ulong) page_no);
+
+				if (buf_page_is_corrupted(page, zip_size)) {
+					fprintf(stderr,
+						"InnoDB: Dump of the page:\n");
+					buf_page_print(
+						read_buf, zip_size,
+						BUF_PAGE_PRINT_NO_CRASH);
+					fprintf(stderr,
+						"InnoDB: Dump of"
+						" corresponding page"
+						" in doublewrite buffer:\n");
+					buf_page_print(
+						page, zip_size,
+						BUF_PAGE_PRINT_NO_CRASH);
+
+					fprintf(stderr,
+						"InnoDB: Also the page in the"
+						" doublewrite buffer"
+						" is corrupt.\n"
+						"InnoDB: Cannot continue"
+						" operation.\n"
+						"InnoDB: You can try to"
+						" recover the database"
+						" with the my.cnf\n"
+						"InnoDB: option:\n"
+						"InnoDB:"
+						" innodb_force_recovery=6\n");
+					ut_error;
+				}
+
+				/* Write the good page from the
+				doublewrite buffer to the intended
+				position */
+
+				fil_io(OS_FILE_WRITE, TRUE, space_id,
+				       zip_size, page_no, 0,
+				       zip_size ? zip_size : UNIV_PAGE_SIZE,
+				       page, NULL);
+				fprintf(stderr,
+					"InnoDB: Recovered the page from"
+					" the doublewrite buffer.\n");
+			}
+		}
+
+		page += UNIV_PAGE_SIZE;
+	}
+
+	fil_flush_file_spaces(FIL_TABLESPACE);
+
+leave_func:
+	ut_free(unaligned_read_buf);
+}
+
+/****************************************************************//**
+Frees doublewrite buffer. */
+UNIV_INTERN
+void
+buf_dblwr_free(void)
+/*================*/
+{
+	/* Free the double write data structures. */
+	ut_a(buf_dblwr != NULL);
+	ut_ad(buf_dblwr->s_reserved == 0);
+	ut_ad(buf_dblwr->b_reserved == 0);
+
+	ut_free(buf_dblwr->write_buf_unaligned);
+	buf_dblwr->write_buf_unaligned = NULL;
+
+	mem_free(buf_dblwr->buf_block_arr);
+	buf_dblwr->buf_block_arr = NULL;
+
+	mem_free(buf_dblwr->in_use);
+	buf_dblwr->in_use = NULL;
+
+	mutex_free(&buf_dblwr->mutex);
+	mem_free(buf_dblwr);
+	buf_dblwr = NULL;
+}
+
+/********************************************************************//**
+Updates the doublewrite buffer when an IO request that is part of an
+LRU or flush batch is completed. */
+UNIV_INTERN
+void
+buf_dblwr_update(void)
+/*==================*/
+{
+	if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
+		return;
+	}
+
+	mutex_enter(&buf_dblwr->mutex);
+
+	ut_ad(buf_dblwr->batch_running);
+	ut_ad(buf_dblwr->b_reserved > 0);
+
+	buf_dblwr->b_reserved--;
+	if (buf_dblwr->b_reserved == 0) {
+
+		mutex_exit(&buf_dblwr->mutex);
+		/* This will finish the batch. Sync data files
+		to the disk. */
+		fil_flush_file_spaces(FIL_TABLESPACE);
+		mutex_enter(&buf_dblwr->mutex);
+
+		/* We can now reuse the doublewrite memory buffer: */
+		buf_dblwr->first_free = 0;
+		buf_dblwr->batch_running = FALSE;
+	}
+
+	mutex_exit(&buf_dblwr->mutex);
+}
+
+/********************************************************************//**
+Check the LSN values on the page. */
+static
+void
+buf_dblwr_check_page_lsn(
+/*=====================*/
+	const page_t*	page)		/*!< in: page to check */
+{
+	if (memcmp(page + (FIL_PAGE_LSN + 4),
+		   page + (UNIV_PAGE_SIZE
+			   - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+		   4)) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: ERROR: The page to be written"
+			" seems corrupt!\n"
+			"InnoDB: The low 4 bytes of LSN fields do not match "
+			"(" ULINTPF " != " ULINTPF ")!"
+			" Noticed in the buffer pool.\n",
+			mach_read_from_4(
+				page + FIL_PAGE_LSN + 4),
+			mach_read_from_4(
+				page + UNIV_PAGE_SIZE
+				- FIL_PAGE_END_LSN_OLD_CHKSUM + 4));
+	}
+}
+
+/********************************************************************//**
+Asserts when a corrupt block is find during writing out data to the
+disk. */
+static
+void
+buf_dblwr_assert_on_corrupt_block(
+/*==============================*/
+	const buf_block_t*	block)	/*!< in: block to check */
+{
+	buf_page_print(block->frame, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: Apparent corruption of an"
+		" index page n:o %lu in space %lu\n"
+		"InnoDB: to be written to data file."
+		" We intentionally crash server\n"
+		"InnoDB: to prevent corrupt data"
+		" from ending up in data\n"
+		"InnoDB: files.\n",
+		(ulong) buf_block_get_page_no(block),
+		(ulong) buf_block_get_space(block));
+
+	ut_error;
+}
+
+/********************************************************************//**
+Check the LSN values on the page with which this block is associated.
+Also validate the page if the option is set. */
+static
+void
+buf_dblwr_check_block(
+/*==================*/
+	const buf_block_t*	block)	/*!< in: block to check */
+{
+	if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+	    || block->page.zip.data) {
+		/* No simple validate for compressed pages exists. */
+		return;
+	}
+
+	buf_dblwr_check_page_lsn(block->frame);
+
+	if (!block->check_index_page_at_flush) {
+		return;
+	}
+
+	if (page_is_comp(block->frame)) {
+		if (!page_simple_validate_new(block->frame)) {
+			buf_dblwr_assert_on_corrupt_block(block);
+		}
+	} else if (!page_simple_validate_old(block->frame)) {
+
+		buf_dblwr_assert_on_corrupt_block(block);
+	}
+}
+
+/********************************************************************//**
+Writes a page that has already been written to the doublewrite buffer
+to the datafile. It is the job of the caller to sync the datafile. */
+static
+void
+buf_dblwr_write_block_to_datafile(
+/*==============================*/
+	const buf_block_t*	block)	/*!< in: block to write */
+{
+	ut_a(block);
+	ut_a(buf_page_in_file(&block->page));
+
+	if (block->page.zip.data) {
+		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+		       FALSE, buf_page_get_space(&block->page),
+		       buf_page_get_zip_size(&block->page),
+		       buf_page_get_page_no(&block->page), 0,
+		       buf_page_get_zip_size(&block->page),
+		       (void*) block->page.zip.data,
+		       (void*) block);
+
+		goto exit;
+	}
+
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	buf_dblwr_check_page_lsn(block->frame);
+
+	fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+	       FALSE, buf_block_get_space(block), 0,
+	       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
+	       (void*) block->frame, (void*) block);
+
+exit:
+	/* Increment the counter of I/O operations used
+	for selecting LRU policy. */
+	buf_LRU_stat_inc_io();
+}
+
+/********************************************************************//**
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+UNIV_INTERN
+void
+buf_dblwr_flush_buffered_writes(void)
+/*=================================*/
+{
+	byte*		write_buf;
+	ulint		len;
+	ulint		len2;
+	ulint		i;
+
+	if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
+		/* Sync the writes to the disk. */
+		buf_flush_sync_datafiles();
+		return;
+	}
+
+try_again:
+	mutex_enter(&(buf_dblwr->mutex));
+
+	/* Write first to doublewrite buffer blocks. We use synchronous
+	aio and thus know that file write has been completed when the
+	control returns. */
+
+	if (buf_dblwr->first_free == 0) {
+
+		mutex_exit(&(buf_dblwr->mutex));
+
+		return;
+	}
+
+	if (buf_dblwr->batch_running) {
+		mutex_exit(&buf_dblwr->mutex);
+
+		/* Another thread is running the batch right now. Wait
+		for it to finish. */
+		os_thread_sleep(TRX_DOUBLEWRITE_BATCH_POLL_DELAY);
+		goto try_again;
+	}
+
+	ut_a(!buf_dblwr->batch_running);
+
+	/* Disallow anyone else to post to doublewrite buffer or to
+	start another batch of flushing. */
+	buf_dblwr->batch_running = TRUE;
+
+	/* Now safe to release the mutex. Note that though no other
+	thread is allowed to post to the doublewrite batch flushing
+	but any threads working on single page flushes are allowed
+	to proceed. */
+	mutex_exit(&buf_dblwr->mutex);
+
+	write_buf = buf_dblwr->write_buf;
+
+	for (len2 = 0, i = 0;
+	     i < buf_dblwr->first_free;
+	     len2 += UNIV_PAGE_SIZE, i++) {
+
+		const buf_block_t*	block;
+
+		block = (buf_block_t*) buf_dblwr->buf_block_arr[i];
+
+		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+		    || block->page.zip.data) {
+			/* No simple validate for compressed
+			pages exists. */
+			continue;
+		}
+
+		/* Check that the actual page in the buffer pool is
+		not corrupt and the LSN values are sane. */
+		buf_dblwr_check_block(block);
+
+		/* Check that the page as written to the doublewrite
+		buffer has sane LSN values. */
+		buf_dblwr_check_page_lsn(write_buf + len2);
+	}
+
+	/* Write out the first block of the doublewrite buffer */
+	len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
+		     buf_dblwr->first_free) * UNIV_PAGE_SIZE;
+
+	fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
+	       buf_dblwr->block1, 0, len,
+	       (void*) write_buf, NULL);
+
+	if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		/* No unwritten pages in the second block. */
+		goto flush;
+	}
+
+	/* Write out the second block of the doublewrite buffer. */
+	len = (buf_dblwr->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+	       * UNIV_PAGE_SIZE;
+
+	write_buf = buf_dblwr->write_buf
+		    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
+
+	fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
+	       buf_dblwr->block2, 0, len,
+	       (void*) write_buf, NULL);
+
+flush:
+	/* increment the doublewrite flushed pages counter */
+	srv_dblwr_pages_written += buf_dblwr->first_free;
+	srv_dblwr_writes++;
+
+	/* Now flush the doublewrite buffer data to disk */
+	fil_flush(TRX_SYS_SPACE);
+
+	/* We know that the writes have been flushed to disk now
+	and in recovery we will find them in the doublewrite buffer
+	blocks. Next do the writes to the intended positions. */
+
+	for (i = 0; i < buf_dblwr->first_free; i++) {
+		const buf_block_t* block = (buf_block_t*)
+			buf_dblwr->buf_block_arr[i];
+
+		buf_dblwr_write_block_to_datafile(block);
+	}
+
+	/* Wake possible simulated aio thread to actually post the
+	writes to the operating system. We don't flush the files
+	at this point. We leave it to the IO helper thread to flush
+	datafiles when the whole batch has been processed. */
+	os_aio_simulated_wake_handler_threads();
+}
+
+/********************************************************************//**
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_dblwr_flush_buffered_writes and waits for for free
+space to appear. */
+UNIV_INTERN
+void
+buf_dblwr_add_to_batch(
+/*====================*/
+	buf_page_t*	bpage)	/*!< in: buffer block to write */
+{
+	ulint	zip_size;
+
+	ut_a(buf_page_in_file(bpage));
+
+try_again:
+	mutex_enter(&(buf_dblwr->mutex));
+
+	ut_a(buf_dblwr->first_free <= srv_doublewrite_batch_size);
+
+	if (buf_dblwr->batch_running) {
+		mutex_exit(&buf_dblwr->mutex);
+
+		/* This not nearly as bad as it looks. There is only
+		page_cleaner thread which does background flushing
+		in batches therefore it is unlikely to be a contention
+		point. The only exception is when a user thread is
+		forced to do a flush batch because of a sync
+		checkpoint. */
+		os_thread_sleep(TRX_DOUBLEWRITE_BATCH_POLL_DELAY);
+		goto try_again;
+	}
+
+	if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
+		mutex_exit(&(buf_dblwr->mutex));
+
+		buf_dblwr_flush_buffered_writes();
+
+		goto try_again;
+	}
+
+	zip_size = buf_page_get_zip_size(bpage);
+
+	if (zip_size) {
+		UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
+		/* Copy the compressed page and clear the rest. */
+		memcpy(buf_dblwr->write_buf
+		       + UNIV_PAGE_SIZE * buf_dblwr->first_free,
+		       bpage->zip.data, zip_size);
+		memset(buf_dblwr->write_buf
+		       + UNIV_PAGE_SIZE * buf_dblwr->first_free
+		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+	} else {
+		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+		UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
+				   UNIV_PAGE_SIZE);
+
+		memcpy(buf_dblwr->write_buf
+		       + UNIV_PAGE_SIZE * buf_dblwr->first_free,
+		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
+	}
+
+	buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage;
+
+	buf_dblwr->first_free++;
+	buf_dblwr->b_reserved++;
+
+	ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size);
+
+	if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
+		mutex_exit(&(buf_dblwr->mutex));
+
+		buf_dblwr_flush_buffered_writes();
+
+		return;
+	}
+
+	mutex_exit(&(buf_dblwr->mutex));
+}
+
+/********************************************************************//**
+Writes a page to the doublewrite buffer on disk, sync it, then write
+the page to the datafile and sync the datafile. This function is used
+for single page flushes. If all the buffers allocated for single page
+flushes in the doublewrite buffer are in use we wait here for one to
+become free. We are guaranteed that a slot will become free because any
+thread that is using a slot must also release the slot before leaving
+this function. */
+UNIV_INTERN
+void
+buf_dblwr_write_single_page(
+/*========================*/
+	buf_page_t*	bpage)	/*!< in: buffer block to write */
+{
+	ulint		n_slots;
+	ulint		size;
+	ulint		zip_size;
+	ulint		offset;
+	ulint		i;
+
+	ut_a(buf_page_in_file(bpage));
+	ut_a(srv_use_doublewrite_buf);
+	ut_a(buf_dblwr != NULL);
+
+	/* total number of slots available for single page flushes
+	starts from srv_doublewrite_batch_size to the end of the
+	buffer. */
+	size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+	ut_a(size > srv_doublewrite_batch_size);
+	n_slots = size - srv_doublewrite_batch_size;
+
+	if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+
+		/* Check that the actual page in the buffer pool is
+		not corrupt and the LSN values are sane. */
+		buf_dblwr_check_block((buf_block_t*) bpage);
+
+		/* Check that the page as written to the doublewrite
+		buffer has sane LSN values. */
+		if (!bpage->zip.data) {
+			buf_dblwr_check_page_lsn(
+				((buf_block_t*) bpage)->frame);
+		}
+	}
+
+retry:
+	mutex_enter(&buf_dblwr->mutex);
+	if (buf_dblwr->s_reserved == n_slots) {
+
+		mutex_exit(&buf_dblwr->mutex);
+		/* All slots are reserved. Since it involves two IOs
+		during the processing a sleep of 10ms should be
+		enough. */
+		os_thread_sleep(TRX_DOUBLEWRITE_BATCH_POLL_DELAY);
+		goto retry;
+	}
+
+	for (i = srv_doublewrite_batch_size; i < size; ++i) {
+
+		if (!buf_dblwr->in_use[i]) {
+			break;
+		}
+	}
+
+	/* We are guaranteed to find a slot. */
+	ut_a(i < size);
+	buf_dblwr->in_use[i] = TRUE;
+	buf_dblwr->s_reserved++;
+	buf_dblwr->buf_block_arr[i] = bpage;
+	mutex_exit(&buf_dblwr->mutex);
+
+	/* Lets see if we are going to write in the first or second
+	block of the doublewrite buffer. */
+	if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		offset = buf_dblwr->block1 + i;
+	} else {
+		offset = buf_dblwr->block2 + i
+			 - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+	}
+
+	/* We deal with compressed and uncompressed pages a little
+	differently here. In case of uncompressed pages we can
+	directly write the block to the allocated slot in the
+	doublewrite buffer in the system tablespace and then after
+	syncing the system table space we can proceed to write the page
+	in the datafile.
+	In case of compressed page we first do a memcpy of the block
+	to the in-memory buffer of doublewrite before proceeding to
+	write it. This is so because we want to pad the remaining
+	bytes in the doublewrite page with zeros. */
+
+	zip_size = buf_page_get_zip_size(bpage);
+	if (zip_size) {
+		memcpy(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i,
+		       bpage->zip.data, zip_size);
+		memset(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i
+		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+
+		fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
+		       offset, 0, UNIV_PAGE_SIZE,
+		       (void*) (buf_dblwr->write_buf
+				+ UNIV_PAGE_SIZE * i), NULL);
+	} else {
+		/* It is a regular page. Write it directly to the
+		doublewrite buffer */
+		fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
+		       offset, 0, UNIV_PAGE_SIZE,
+		       (void*) ((buf_block_t*) bpage)->frame,
+		       NULL);
+	}
+
+	/* Now flush the doublewrite buffer data to disk */
+	fil_flush(TRX_SYS_SPACE);
+
+	/* We know that the write has been flushed to disk now
+	and during recovery we will find it in the doublewrite buffer
+	blocks. Next do the write to the intended position. */
+	buf_dblwr_write_block_to_datafile((buf_block_t*) bpage);
+
+	/* Sync the writes to the disk. */
+	buf_flush_sync_datafiles();
+
+	mutex_enter(&buf_dblwr->mutex);
+
+	buf_dblwr->s_reserved--;
+	buf_dblwr->buf_block_arr[i] = NULL;
+	buf_dblwr->in_use[i] = FALSE;
+
+	/* increment the doublewrite flushed pages counter */
+	srv_dblwr_pages_written += buf_dblwr->first_free;
+	srv_dblwr_writes++;
+
+	mutex_exit(&(buf_dblwr->mutex));
+
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
new file mode 100644
index 00000000000..27757241c3e
--- /dev/null
+++ b/storage/innobase/buf/buf0dump.cc
@@ -0,0 +1,620 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.cc
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#include <stdarg.h> /* va_* */
+#include <string.h> /* strerror() */
+
+#include "univ.i"
+
+#include "buf0buf.h" /* buf_pool_mutex_enter(), srv_buf_pool_instances */
+#include "buf0dump.h"
+#include "db0err.h" /* enum db_err */
+#include "dict0dict.h" /* dict_operation_lock */
+#include "os0file.h" /* OS_FILE_MAX_PATH */
+#include "os0sync.h" /* os_event* */
+#include "os0thread.h" /* os_thread_* */
+#include "srv0srv.h" /* srv_fast_shutdown, srv_buf_dump* */
+#include "srv0start.h" /* srv_shutdown_state */
+#include "sync0rw.h" /* rw_lock_s_lock() */
+#include "ut0byte.h" /* ut_ull_create() */
+#include "ut0sort.h" /* UT_SORT_FUNCTION_BODY */
+#include "buf0rea.h" /* buf_read_page_async() */
+
+enum status_severity {
+	STATUS_INFO,
+	STATUS_NOTICE,
+	STATUS_ERR
+};
+
+#define SHUTTING_DOWN()	(UNIV_UNLIKELY(srv_shutdown_state \
+				       != SRV_SHUTDOWN_NONE))
+
+/* Flags that tell the buffer pool dump/load thread which action should it
+take after being waked up. */
+static ibool	buf_dump_should_start = FALSE;
+static ibool	buf_load_should_start = FALSE;
+
+static ibool	buf_load_abort_flag = FALSE;
+
+/* Used to temporary store dump info in order to avoid IO while holding
+buffer pool mutex during dump and also to sort the contents of the dump
+before reading the pages from disk during load.
+We store the space id in the high 32 bits and page no in low 32 bits. */
+typedef ib_uint64_t	buf_dump_t;
+
+/* Aux macros to create buf_dump_t and to extract space and page from it */
+#define BUF_DUMP_CREATE(space, page)	ut_ull_create(space, page)
+#define BUF_DUMP_SPACE(a)		((ulint) ((a) >> 32))
+#define BUF_DUMP_PAGE(a)		((ulint) ((a) & 0xFFFFFFFFUL))
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a dump. This function is called by MySQL code via buffer_pool_dump_now()
+and it should return immediately because the whole MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_dump_start()
+/*============*/
+{
+	buf_dump_should_start = TRUE;
+	os_event_set(srv_buf_dump_event);
+}
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a load. This function is called by MySQL code via buffer_pool_load_now()
+and it should return immediately because the whole MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_load_start()
+/*============*/
+{
+	buf_load_should_start = TRUE;
+	os_event_set(srv_buf_dump_event);
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3). The value of this variable can be
+retrieved by:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
+or by:
+SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */
+static __attribute__((nonnull, format(printf, 2, 3)))
+void
+buf_dump_status(
+/*============*/
+	enum status_severity	severity,/*!< in: status severity */
+	const char*		fmt,	/*!< in: format */
+	...)				/*!< in: extra parameters according
+					to fmt */
+{
+	va_list	ap;
+
+	va_start(ap, fmt);
+
+	ut_vsnprintf(
+		export_vars.innodb_buffer_pool_dump_status,
+		sizeof(export_vars.innodb_buffer_pool_dump_status),
+		fmt, ap);
+
+	if (severity == STATUS_NOTICE || severity == STATUS_ERR) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: %s\n",
+			export_vars.innodb_buffer_pool_dump_status);
+	}
+
+	va_end(ap);
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3). The value of this variable can be
+retrieved by:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS';
+or by:
+SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */
+static __attribute__((nonnull, format(printf, 2, 3)))
+void
+buf_load_status(
+/*============*/
+	enum status_severity	severity,/*!< in: status severity */
+	const char*	fmt,	/*!< in: format */
+	...)			/*!< in: extra parameters according to fmt */
+{
+	va_list	ap;
+
+	va_start(ap, fmt);
+
+	ut_vsnprintf(
+		export_vars.innodb_buffer_pool_load_status,
+		sizeof(export_vars.innodb_buffer_pool_load_status),
+		fmt, ap);
+
+	if (severity == STATUS_NOTICE || severity == STATUS_ERR) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: %s\n",
+			export_vars.innodb_buffer_pool_load_status);
+	}
+
+	va_end(ap);
+}
+
+/*****************************************************************//**
+Perform a buffer pool dump into the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_dump(
+/*=====*/
+	ibool	obey_shutdown)	/*!< in: quit if we are in a shutting down
+				state */
+{
+#define SHOULD_QUIT()	(SHUTTING_DOWN() && obey_shutdown)
+
+	char	full_filename[OS_FILE_MAX_PATH];
+	char	tmp_filename[OS_FILE_MAX_PATH];
+	char	now[32];
+	FILE*	f;
+	ulint	i;
+	int	ret;
+
+	ut_snprintf(full_filename, sizeof(full_filename),
+		    "%s%c%s", srv_data_home, SRV_PATH_SEPARATOR,
+		    srv_buf_dump_filename);
+
+	ut_snprintf(tmp_filename, sizeof(tmp_filename),
+		    "%s.incomplete", full_filename);
+
+	buf_dump_status(STATUS_NOTICE, "Dumping buffer pool(s) to %s",
+			full_filename);
+
+	f = fopen(tmp_filename, "w");
+	if (f == NULL) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot open '%s' for writing: %s",
+				tmp_filename, strerror(errno));
+		return;
+	}
+	/* else */
+
+	/* walk through each buffer pool */
+	for (i = 0; i < srv_buf_pool_instances && !SHOULD_QUIT(); i++) {
+		buf_pool_t*		buf_pool;
+		const buf_page_t*	bpage;
+		buf_dump_t*		dump;
+		ulint			n_pages;
+		ulint			j;
+
+		buf_pool = buf_pool_from_array(i);
+
+		/* obtain buf_pool mutex before allocate, since
+		UT_LIST_GET_LEN(buf_pool->LRU) could change */
+		buf_pool_mutex_enter(buf_pool);
+
+		n_pages = UT_LIST_GET_LEN(buf_pool->LRU);
+
+		/* skip empty buffer pools */
+		if (n_pages == 0) {
+			buf_pool_mutex_exit(buf_pool);
+			continue;
+		}
+
+		dump = static_cast<buf_dump_t*>(
+			ut_malloc(n_pages * sizeof(*dump))) ;
+
+		if (dump == NULL) {
+			buf_pool_mutex_exit(buf_pool);
+			fclose(f);
+			buf_dump_status(STATUS_ERR,
+					"Cannot allocate " ULINTPF " bytes: %s",
+					(ulint) (n_pages * sizeof(*dump)),
+					strerror(errno));
+			/* leave tmp_filename to exist */
+			return;
+		}
+
+		for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), j = 0;
+		     bpage != NULL;
+		     bpage = UT_LIST_GET_PREV(LRU, bpage), j++) {
+
+			ut_a(buf_page_in_file(bpage));
+
+			dump[j] = BUF_DUMP_CREATE(buf_page_get_space(bpage),
+						  buf_page_get_page_no(bpage));
+		}
+
+		ut_a(j == n_pages);
+
+		buf_pool_mutex_exit(buf_pool);
+
+		for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
+			ret = fprintf(f, ULINTPF "," ULINTPF "\n",
+				      BUF_DUMP_SPACE(dump[j]),
+				      BUF_DUMP_PAGE(dump[j]));
+			if (ret < 0) {
+				ut_free(dump);
+				fclose(f);
+				buf_dump_status(STATUS_ERR,
+						"Cannot write to '%s': %s",
+						tmp_filename, strerror(errno));
+				/* leave tmp_filename to exist */
+				return;
+			}
+
+			if (j % 128 == 0) {
+				buf_dump_status(
+					STATUS_INFO,
+					"Dumping buffer pool "
+					ULINTPF "/" ULINTPF ", "
+					"page " ULINTPF "/" ULINTPF,
+					i + 1, srv_buf_pool_instances,
+					j + 1, n_pages);
+			}
+		}
+
+		ut_free(dump);
+	}
+
+	ret = fclose(f);
+	if (ret != 0) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot close '%s': %s",
+				tmp_filename, strerror(errno));
+		return;
+	}
+	/* else */
+
+	ret = unlink(full_filename);
+	if (ret != 0 && errno != ENOENT) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot delete '%s': %s",
+				full_filename, strerror(errno));
+		/* leave tmp_filename to exist */
+		return;
+	}
+	/* else */
+
+	ret = rename(tmp_filename, full_filename);
+	if (ret != 0) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot rename '%s' to '%s': %s",
+				tmp_filename, full_filename,
+				strerror(errno));
+		/* leave tmp_filename to exist */
+		return;
+	}
+	/* else */
+
+	/* success */
+
+	ut_sprintf_timestamp(now);
+
+	buf_dump_status(STATUS_NOTICE,
+			"Buffer pool(s) dump completed at %s", now);
+}
+
+/*****************************************************************//**
+Compare two buffer pool dump entries, used to sort the dump on
+space_no,page_no before loading in order to increase the chance for
+sequential IO.
+@return -1/0/1 if entry 1 is smaller/equal/bigger than entry 2 */
+static
+lint
+buf_dump_cmp(
+/*=========*/
+	const buf_dump_t	d1,	/*!< in: buffer pool dump entry 1 */
+	const buf_dump_t	d2)	/*!< in: buffer pool dump entry 2 */
+{
+	if (d1 < d2) {
+		return(-1);
+	} else if (d1 == d2) {
+		return(0);
+	} else {
+		return(1);
+	}
+}
+
+/*****************************************************************//**
+Sort a buffer pool dump on space_no, page_no. */
+static
+void
+buf_dump_sort(
+/*==========*/
+	buf_dump_t*	dump,	/*!< in/out: buffer pool dump to sort */
+	buf_dump_t*	tmp,	/*!< in/out: temp storage */
+	ulint		low,	/*!< in: lowest index (inclusive) */
+	ulint		high)	/*!< in: highest index (non-inclusive) */
+{
+	UT_SORT_FUNCTION_BODY(buf_dump_sort, dump, tmp, low, high,
+			      buf_dump_cmp);
+}
+
+/*****************************************************************//**
+Perform a buffer pool load from the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_load_status will be set accordingly, see buf_load_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_load()
+/*======*/
+{
+	char		full_filename[OS_FILE_MAX_PATH];
+	char		now[32];
+	FILE*		f;
+	buf_dump_t*	dump;
+	buf_dump_t*	dump_tmp;
+	ulint		dump_n;
+	ulint		total_buffer_pools_pages;
+	ulint		i;
+	ulint		space_id;
+	ulint		page_no;
+	int		fscanf_ret;
+
+	/* Ignore any leftovers from before */
+	buf_load_abort_flag = FALSE;
+
+	ut_snprintf(full_filename, sizeof(full_filename),
+		    "%s%c%s", srv_data_home, SRV_PATH_SEPARATOR,
+		    srv_buf_dump_filename);
+
+	buf_load_status(STATUS_NOTICE,
+			"Loading buffer pool(s) from %s", full_filename);
+
+	f = fopen(full_filename, "r");
+	if (f == NULL) {
+		buf_load_status(STATUS_ERR,
+				"Cannot open '%s' for reading: %s",
+				full_filename, strerror(errno));
+		return;
+	}
+	/* else */
+
+	/* First scan the file to estimate how many entries are in it.
+	This file is tiny (approx 500KB per 1GB buffer pool), reading it
+	two times is fine. */
+	dump_n = 0;
+	while (fscanf(f, ULINTPF "," ULINTPF, &space_id, &page_no) == 2
+	       && !SHUTTING_DOWN()) {
+		dump_n++;
+	}
+
+	if (!SHUTTING_DOWN() && !feof(f)) {
+		/* fscanf() returned != 2 */
+		const char*	what;
+		if (ferror(f)) {
+			what = "reading";
+		} else {
+			what = "parsing";
+		}
+		fclose(f);
+		buf_load_status(STATUS_ERR, "Error %s '%s', "
+				"unable to load buffer pool (stage 1)",
+				what, full_filename);
+		return;
+	}
+
+	/* If dump is larger than the buffer pool(s), then we ignore the
+	extra trailing. This could happen if a dump is made, then buffer
+	pool is shrunk and then load it attempted. */
+	total_buffer_pools_pages = buf_pool_get_n_pages()
+		* srv_buf_pool_instances;
+	if (dump_n > total_buffer_pools_pages) {
+		dump_n = total_buffer_pools_pages;
+	}
+
+	dump = static_cast<buf_dump_t*>(ut_malloc(dump_n * sizeof(*dump)));
+
+	if (dump == NULL) {
+		fclose(f);
+		buf_load_status(STATUS_ERR,
+				"Cannot allocate " ULINTPF " bytes: %s",
+				(ulint) (dump_n * sizeof(*dump)),
+				strerror(errno));
+		return;
+	}
+
+	dump_tmp = static_cast<buf_dump_t*>(
+		ut_malloc(dump_n * sizeof(*dump_tmp)));
+
+	if (dump_tmp == NULL) {
+		ut_free(dump);
+		fclose(f);
+		buf_load_status(STATUS_ERR,
+				"Cannot allocate " ULINTPF " bytes: %s",
+				(ulint) (dump_n * sizeof(*dump_tmp)),
+				strerror(errno));
+		return;
+	}
+
+	rewind(f);
+
+	for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+		fscanf_ret = fscanf(f, ULINTPF "," ULINTPF,
+				    &space_id, &page_no);
+
+		if (fscanf_ret != 2) {
+			if (feof(f)) {
+				break;
+			}
+			/* else */
+
+			ut_free(dump);
+			ut_free(dump_tmp);
+			fclose(f);
+			buf_load_status(STATUS_ERR,
+					"Error parsing '%s', unable "
+					"to load buffer pool (stage 2)",
+					full_filename);
+			return;
+		}
+
+		if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) {
+			ut_free(dump);
+			ut_free(dump_tmp);
+			fclose(f);
+			buf_load_status(STATUS_ERR,
+					"Error parsing '%s': bogus "
+					"space,page " ULINTPF "," ULINTPF
+					" at line " ULINTPF ", "
+					"unable to load buffer pool",
+					full_filename,
+					space_id, page_no,
+					i);
+			return;
+		}
+
+		dump[i] = BUF_DUMP_CREATE(space_id, page_no);
+	}
+
+	/* Set dump_n to the actual number of initialized elements,
+	i could be smaller than dump_n here if the file got truncated after
+	we read it the first time. */
+	dump_n = i;
+
+	fclose(f);
+
+	if (dump_n == 0) {
+		ut_free(dump);
+		ut_sprintf_timestamp(now);
+		buf_load_status(STATUS_NOTICE,
+				"Buffer pool(s) load completed at %s "
+				"(%s was empty)", now, full_filename);
+		return;
+	}
+
+	if (!SHUTTING_DOWN()) {
+		buf_dump_sort(dump, dump_tmp, 0, dump_n);
+	}
+
+	ut_free(dump_tmp);
+
+	for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+
+		buf_read_page_async(BUF_DUMP_SPACE(dump[i]),
+				    BUF_DUMP_PAGE(dump[i]));
+
+		if (i % 64 == 63) {
+			os_aio_simulated_wake_handler_threads();
+		}
+
+		if (i % 128 == 0) {
+			buf_load_status(STATUS_INFO,
+					"Loaded " ULINTPF "/" ULINTPF " pages",
+					i + 1, dump_n);
+		}
+
+		if (buf_load_abort_flag) {
+			buf_load_abort_flag = FALSE;
+			ut_free(dump);
+			buf_load_status(
+				STATUS_NOTICE,
+				"Buffer pool(s) load aborted on request");
+			return;
+		}
+	}
+
+	ut_free(dump);
+
+	ut_sprintf_timestamp(now);
+
+	buf_load_status(STATUS_NOTICE,
+			"Buffer pool(s) load completed at %s", now);
+}
+
+/*****************************************************************//**
+Aborts a currently running buffer pool load. This function is called by
+MySQL code via buffer_pool_load_abort() and it should return immediately
+because the whole MySQL is frozen during its execution. */
+UNIV_INTERN
+void
+buf_load_abort()
+/*============*/
+{
+	buf_load_abort_flag = TRUE;
+}
+
+/*****************************************************************//**
+This is the main thread for buffer pool dump/load. It waits for an
+event and when waked up either performs a dump or load and sleeps
+again.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_dump_thread)(
+/*============================*/
+	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
+						required by os_thread_create */
+{
+	srv_buf_dump_thread_active = TRUE;
+
+	buf_dump_status(STATUS_INFO, "not started");
+	buf_load_status(STATUS_INFO, "not started");
+
+	if (srv_buffer_pool_load_at_startup) {
+		buf_load();
+	}
+
+	while (!SHUTTING_DOWN()) {
+
+		os_event_wait(srv_buf_dump_event);
+
+		if (buf_dump_should_start) {
+			buf_dump_should_start = FALSE;
+			buf_dump(TRUE /* quit on shutdown */);
+		}
+
+		if (buf_load_should_start) {
+			buf_load_should_start = FALSE;
+			buf_load();
+		}
+
+		os_event_reset(srv_buf_dump_event);
+	}
+
+	if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
+		buf_dump(FALSE /* ignore shutdown down flag,
+		keep going even if we are in a shutdown state */);
+	}
+
+	srv_buf_dump_thread_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/innobase/buf/buf0flu.c b/storage/innobase/buf/buf0flu.cc
index 7cd09d6675e..023ed766c62 100644
--- a/storage/innobase/buf/buf0flu.c
+++ b/storage/innobase/buf/buf0flu.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file buf/buf0flu.c
+@file buf/buf0flu.cc
 The database buffer buf_pool flush algorithm
 
 Created 11/11/1995 Heikki Tuuri
@@ -25,11 +25,9 @@ Created 11/11/1995 Heikki Tuuri
 
 #include "buf0flu.h"
 
-#ifdef UNIV_NONINL
-#include "buf0flu.ic"
-#endif
-
 #include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
 #include "srv0srv.h"
 #include "page0zip.h"
 #ifndef UNIV_HOTBACKUP
@@ -43,8 +41,14 @@ Created 11/11/1995 Heikki Tuuri
 #include "log0log.h"
 #include "os0file.h"
 #include "trx0sys.h"
+#include "srv0mon.h"
 #include "mysql/plugin.h"
 #include "mysql/service_thd_wait.h"
+#include "buf0dblwr.h"
+
+#ifdef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
 
 /**********************************************************************
 These statistics are generated for heuristics used in estimating the
@@ -77,6 +81,27 @@ static buf_flush_stat_t	buf_flush_stat_sum;
 /** Number of pages flushed through non flush_list flushes. */
 static ulint buf_lru_flush_page_count = 0;
 
+/** Flag indicating if the page_cleaner is in active state. This flag
+is set to TRUE by the page_cleaner thread when it is spawned and is set
+back to FALSE at shutdown by the page_cleaner as well. Therefore no
+need to protect it by a mutex. It is only ever read by the thread
+doing the shutdown */
+UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE;
+
+/** LRU flush batch is further divided into this chunk size to
+reduce the wait time for the threads waiting for a clean block */
+#define PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE	100
+
+#ifdef UNIV_PFS_THREAD
+UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/** If LRU list of a buf_pool is less than this size then LRU eviction
+should not happen. This is because when we do LRU flushing we also put
+the blocks on free list. If LRU list is very small then we can end up
+in thrashing. */
+#define BUF_LRU_MIN_LEN		256
+
 /* @} */
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
@@ -282,7 +307,7 @@ buf_flush_insert_into_flush_list(
 /*=============================*/
 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	buf_block_t*	block,		/*!< in/out: block which is modified */
-	ib_uint64_t	lsn)		/*!< in: oldest modification */
+	lsn_t		lsn)		/*!< in: oldest modification */
 {
 	ut_ad(!buf_pool_mutex_own(buf_pool));
 	ut_ad(log_flush_order_mutex_own());
@@ -313,7 +338,7 @@ buf_flush_insert_into_flush_list(
 	{
 		ulint	zip_size = buf_block_get_zip_size(block);
 
-		if (UNIV_UNLIKELY(zip_size)) {
+		if (zip_size) {
 			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
 		} else {
 			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
@@ -337,7 +362,7 @@ buf_flush_insert_sorted_into_flush_list(
 /*====================================*/
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
 	buf_block_t*	block,		/*!< in/out: block which is modified */
-	ib_uint64_t	lsn)		/*!< in: oldest modification */
+	lsn_t		lsn)		/*!< in: oldest modification */
 {
 	buf_page_t*	prev_b;
 	buf_page_t*	b;
@@ -373,7 +398,7 @@ buf_flush_insert_sorted_into_flush_list(
 	{
 		ulint	zip_size = buf_block_get_zip_size(block);
 
-		if (UNIV_UNLIKELY(zip_size)) {
+		if (zip_size) {
 			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
 		} else {
 			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
@@ -412,6 +437,8 @@ buf_flush_insert_sorted_into_flush_list(
 				     prev_b, &block->page);
 	}
 
+	MONITOR_INC(MONITOR_PAGE_INFLUSH);
+
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 	ut_a(buf_flush_validate_low(buf_pool));
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
@@ -464,7 +491,7 @@ buf_flush_ready_for_flush(
 /*======================*/
 	buf_page_t*	bpage,	/*!< in: buffer control block, must be
 				buf_page_in_file(bpage) */
-	enum buf_flush	flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+	enum buf_flush	flush_type)/*!< in: type of flush */
 {
 #ifdef UNIV_DEBUG
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
@@ -472,26 +499,33 @@ buf_flush_ready_for_flush(
 #endif
 	ut_a(buf_page_in_file(bpage));
 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
-	ut_ad(flush_type == BUF_FLUSH_LRU || BUF_FLUSH_LIST);
+	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
 
-	if (bpage->oldest_modification != 0
-	    && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
-		ut_ad(bpage->in_flush_list);
-
-		if (flush_type != BUF_FLUSH_LRU) {
-
-			return(TRUE);
+	if (bpage->oldest_modification == 0
+	    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+		return(FALSE);
+	}
 
-		} else if (bpage->buf_fix_count == 0) {
+	ut_ad(bpage->in_flush_list);
 
-			/* If we are flushing the LRU list, to avoid deadlocks
-			we require the block not to be bufferfixed, and hence
-			not latched. */
+	switch (flush_type) {
+	case BUF_FLUSH_LIST:
+		return(TRUE);
 
-			return(TRUE);
-		}
+	case BUF_FLUSH_LRU:
+	case BUF_FLUSH_SINGLE_PAGE:
+		/* Because any thread may call single page flush, even
+		when owning locks on pages, to avoid deadlocks, we must
+		make sure that the that it is not buffer fixed.
+		The same holds true for LRU flush because a user thread
+		may end up waiting for an LRU flush to end while
+		holding locks on other pages. */
+		return(bpage->buf_fix_count == 0);
+	case BUF_FLUSH_N_TYPES:
+		break;
 	}
 
+	ut_error;
 	return(FALSE);
 }
 
@@ -548,6 +582,8 @@ buf_flush_remove(
 	ut_a(buf_flush_validate_skip(buf_pool));
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 
+	MONITOR_DEC(MONITOR_PAGE_INFLUSH);
+
 	buf_flush_list_mutex_exit(buf_pool);
 }
 
@@ -647,15 +683,6 @@ buf_flush_write_complete(
 	flush_type = buf_page_get_flush_type(bpage);
 	buf_pool->n_flush[flush_type]--;
 
-	if (flush_type == BUF_FLUSH_LRU) {
-		/* Put the block to the end of the LRU list to wait to be
-		moved to the free list */
-
-		buf_LRU_make_block_old(bpage);
-
-		buf_pool->LRU_flush_ended++;
-	}
-
 	/* fprintf(stderr, "n pending flush %lu\n",
 	buf_pool->n_flush[flush_type]); */
 
@@ -666,326 +693,19 @@ buf_flush_write_complete(
 
 		os_event_set(buf_pool->no_flush[flush_type]);
 	}
-}
-
-/********************************************************************//**
-Flush a batch of writes to the datafiles that have already been
-written by the OS. */
-static
-void
-buf_flush_sync_datafiles(void)
-/*==========================*/
-{
-	/* Wake possible simulated aio thread to actually post the
-	writes to the operating system */
-	os_aio_simulated_wake_handler_threads();
-
-	/* Wait that all async writes to tablespaces have been posted to
-	the OS */
-	os_aio_wait_until_no_pending_writes();
-
-	/* Now we flush the data to disk (for example, with fsync) */
-	fil_flush_file_spaces(FIL_TABLESPACE);
-
-	return;
-}
-
-/********************************************************************//**
-Flushes possible buffered writes from the doublewrite memory buffer to disk,
-and also wakes up the aio thread if simulated aio is used. It is very
-important to call this function after a batch of writes has been posted,
-and also when we may have to wait for a page latch! Otherwise a deadlock
-of threads can occur. */
-static
-void
-buf_flush_buffered_writes(void)
-/*===========================*/
-{
-	byte*		write_buf;
-	ulint		len;
-	ulint		len2;
-	ulint		i;
-
-	if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
-		/* Sync the writes to the disk. */
-		buf_flush_sync_datafiles();
-		return;
-	}
-
-	mutex_enter(&(trx_doublewrite->mutex));
-
-	/* Write first to doublewrite buffer blocks. We use synchronous
-	aio and thus know that file write has been completed when the
-	control returns. */
-
-	if (trx_doublewrite->first_free == 0) {
-
-		mutex_exit(&(trx_doublewrite->mutex));
-
-		return;
-	}
-
-	for (i = 0; i < trx_doublewrite->first_free; i++) {
-
-		const buf_block_t*	block;
-
-		block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];
-
-		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
-		    || block->page.zip.data) {
-			/* No simple validate for compressed pages exists. */
-			continue;
-		}
-
-		if (UNIV_UNLIKELY
-		    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
-			    block->frame + (UNIV_PAGE_SIZE
-					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
-			    4))) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: ERROR: The page to be written"
-				" seems corrupt!\n"
-				"InnoDB: The lsn fields do not match!"
-				" Noticed in the buffer pool\n"
-				"InnoDB: before posting to the"
-				" doublewrite buffer.\n");
-		}
-
-		if (!block->check_index_page_at_flush) {
-		} else if (page_is_comp(block->frame)) {
-			if (UNIV_UNLIKELY
-			    (!page_simple_validate_new(block->frame))) {
-corrupted_page:
-				buf_page_print(block->frame, 0,
-					       BUF_PAGE_PRINT_NO_CRASH);
-
-				ut_print_timestamp(stderr);
-				fprintf(stderr,
-					"  InnoDB: Apparent corruption of an"
-					" index page n:o %lu in space %lu\n"
-					"InnoDB: to be written to data file."
-					" We intentionally crash server\n"
-					"InnoDB: to prevent corrupt data"
-					" from ending up in data\n"
-					"InnoDB: files.\n",
-					(ulong) buf_block_get_page_no(block),
-					(ulong) buf_block_get_space(block));
-
-				ut_error;
-			}
-		} else if (UNIV_UNLIKELY
-			   (!page_simple_validate_old(block->frame))) {
-
-			goto corrupted_page;
-		}
-	}
-
-	/* increment the doublewrite flushed pages counter */
-	srv_dblwr_pages_written+= trx_doublewrite->first_free;
-	srv_dblwr_writes++;
-
-	len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
-		     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;
-
-	write_buf = trx_doublewrite->write_buf;
-	i = 0;
-
-	fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
-	       trx_doublewrite->block1, 0, len,
-	       (void*) write_buf, NULL);
-
-	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
-	     len2 += UNIV_PAGE_SIZE, i++) {
-		const buf_block_t* block = (buf_block_t*)
-			trx_doublewrite->buf_block_arr[i];
-
-		if (UNIV_LIKELY(!block->page.zip.data)
-		    && UNIV_LIKELY(buf_block_get_state(block)
-				   == BUF_BLOCK_FILE_PAGE)
-		    && UNIV_UNLIKELY
-		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
-			    write_buf + len2
-			    + (UNIV_PAGE_SIZE
-			       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: ERROR: The page to be written"
-				" seems corrupt!\n"
-				"InnoDB: The lsn fields do not match!"
-				" Noticed in the doublewrite block1.\n");
-		}
-	}
-
-	if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		goto flush;
-	}
-
-	len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
-		* UNIV_PAGE_SIZE;
-
-	write_buf = trx_doublewrite->write_buf
-		+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
-	ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);
-
-	fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
-	       trx_doublewrite->block2, 0, len,
-	       (void*) write_buf, NULL);
-
-	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
-	     len2 += UNIV_PAGE_SIZE, i++) {
-		const buf_block_t* block = (buf_block_t*)
-			trx_doublewrite->buf_block_arr[i];
-
-		if (UNIV_LIKELY(!block->page.zip.data)
-		    && UNIV_LIKELY(buf_block_get_state(block)
-				   == BUF_BLOCK_FILE_PAGE)
-		    && UNIV_UNLIKELY
-		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
-			    write_buf + len2
-			    + (UNIV_PAGE_SIZE
-			       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: ERROR: The page to be"
-				" written seems corrupt!\n"
-				"InnoDB: The lsn fields do not match!"
-				" Noticed in"
-				" the doublewrite block2.\n");
-		}
-	}
-
-flush:
-	/* Now flush the doublewrite buffer data to disk */
-
-	fil_flush(TRX_SYS_SPACE);
-
-	/* We know that the writes have been flushed to disk now
-	and in recovery we will find them in the doublewrite buffer
-	blocks. Next do the writes to the intended positions. */
-
-	for (i = 0; i < trx_doublewrite->first_free; i++) {
-		const buf_block_t* block = (buf_block_t*)
-			trx_doublewrite->buf_block_arr[i];
-
-		ut_a(buf_page_in_file(&block->page));
-		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
-			fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
-			       FALSE, buf_page_get_space(&block->page),
-			       buf_page_get_zip_size(&block->page),
-			       buf_page_get_page_no(&block->page), 0,
-			       buf_page_get_zip_size(&block->page),
-			       (void*)block->page.zip.data,
-			       (void*)block);
-
-			/* Increment the counter of I/O operations used
-			for selecting LRU policy. */
-			buf_LRU_stat_inc_io();
-
-			continue;
-		}
-
-		ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-
-		if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
-					 block->frame
-					 + (UNIV_PAGE_SIZE
-					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
-					 4))) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: ERROR: The page to be written"
-				" seems corrupt!\n"
-				"InnoDB: The lsn fields do not match!"
-				" Noticed in the buffer pool\n"
-				"InnoDB: after posting and flushing"
-				" the doublewrite buffer.\n"
-				"InnoDB: Page buf fix count %lu,"
-				" io fix %lu, state %lu\n",
-				(ulong)block->page.buf_fix_count,
-				(ulong)buf_block_get_io_fix(block),
-				(ulong)buf_block_get_state(block));
-		}
-
-		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
-		       FALSE, buf_block_get_space(block), 0,
-		       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
-		       (void*)block->frame, (void*)block);
-
-		/* Increment the counter of I/O operations used
-		for selecting LRU policy. */
-		buf_LRU_stat_inc_io();
-	}
-
-	/* Sync the writes to the disk. */
-	buf_flush_sync_datafiles();
-
-	/* We can now reuse the doublewrite memory buffer: */
-	trx_doublewrite->first_free = 0;
-
-	mutex_exit(&(trx_doublewrite->mutex));
-}
-
-/********************************************************************//**
-Posts a buffer page for writing. If the doublewrite memory buffer is
-full, calls buf_flush_buffered_writes and waits for for free space to
-appear. */
-static
-void
-buf_flush_post_to_doublewrite_buf(
-/*==============================*/
-	buf_page_t*	bpage)	/*!< in: buffer block to write */
-{
-	ulint	zip_size;
-try_again:
-	mutex_enter(&(trx_doublewrite->mutex));
-
-	ut_a(buf_page_in_file(bpage));
-
-	if (trx_doublewrite->first_free
-	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		mutex_exit(&(trx_doublewrite->mutex));
-
-		buf_flush_buffered_writes();
-
-		goto try_again;
-	}
-
-	zip_size = buf_page_get_zip_size(bpage);
-
-	if (UNIV_UNLIKELY(zip_size)) {
-		UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
-		/* Copy the compressed page and clear the rest. */
-		memcpy(trx_doublewrite->write_buf
-		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
-		       bpage->zip.data, zip_size);
-		memset(trx_doublewrite->write_buf
-		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
-		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
-	} else {
-		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
-		UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
-				   UNIV_PAGE_SIZE);
-
-		memcpy(trx_doublewrite->write_buf
-		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
-		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
-	}
-
-	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;
-
-	trx_doublewrite->first_free++;
 
-	if (trx_doublewrite->first_free
-	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		mutex_exit(&(trx_doublewrite->mutex));
-
-		buf_flush_buffered_writes();
-
-		return;
+	switch (flush_type) {
+	case BUF_FLUSH_LIST:
+	case BUF_FLUSH_LRU:
+		buf_dblwr_update();
+		break;
+	case BUF_FLUSH_SINGLE_PAGE:
+		/* Single page flushes are synchronous. No need
+		to update doublewrite */
+		break;
+	case BUF_FLUSH_N_TYPES:
+		ut_error;
 	}
-
-	mutex_exit(&(trx_doublewrite->mutex));
 }
 #endif /* !UNIV_HOTBACKUP */
 
@@ -995,19 +715,25 @@ UNIV_INTERN
 void
 buf_flush_init_for_writing(
 /*=======================*/
-	byte*		page,		/*!< in/out: page */
-	void*		page_zip_,	/*!< in/out: compressed page, or NULL */
-	ib_uint64_t	newest_lsn)	/*!< in: newest modification lsn
-					to the page */
+	byte*	page,		/*!< in/out: page */
+	void*	page_zip_,	/*!< in/out: compressed page, or NULL */
+	lsn_t	newest_lsn)	/*!< in: newest modification lsn
+				to the page */
 {
+	ib_uint32_t	checksum = 0 /* silence bogus gcc warning */;
+
 	ut_ad(page);
 
 	if (page_zip_) {
-		page_zip_des_t*	page_zip = page_zip_;
-		ulint		zip_size = page_zip_get_size(page_zip);
+		page_zip_des_t*	page_zip;
+		ulint		zip_size;
+
+		page_zip = static_cast<page_zip_des_t*>(page_zip_);
+		zip_size = page_zip_get_size(page_zip);
+
 		ut_ad(zip_size);
 		ut_ad(ut_is_2pow(zip_size));
-		ut_ad(zip_size <= UNIV_PAGE_SIZE);
+		ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 
 		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
 		case FIL_PAGE_TYPE_ALLOCATED:
@@ -1021,15 +747,17 @@ buf_flush_init_for_writing(
 		case FIL_PAGE_TYPE_ZBLOB:
 		case FIL_PAGE_TYPE_ZBLOB2:
 		case FIL_PAGE_INDEX:
+			checksum = page_zip_calc_checksum(
+				page_zip->data, zip_size,
+				static_cast<srv_checksum_algorithm_t>(
+					srv_checksum_algorithm));
+
 			mach_write_to_8(page_zip->data
 					+ FIL_PAGE_LSN, newest_lsn);
 			memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
 			mach_write_to_4(page_zip->data
 					+ FIL_PAGE_SPACE_OR_CHKSUM,
-					srv_use_checksums
-					? page_zip_calc_checksum(
-						page_zip->data, zip_size)
-					: BUF_NO_CHECKSUM_MAGIC);
+					checksum);
 			return;
 		}
 
@@ -1051,32 +779,82 @@ buf_flush_init_for_writing(
 
 	/* Store the new formula checksum */
 
-	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
-			srv_use_checksums
-			? buf_calc_page_new_checksum(page)
-			: BUF_NO_CHECKSUM_MAGIC);
+	switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+		checksum = buf_calc_page_crc32(page);
+		break;
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+		checksum = (ib_uint32_t) buf_calc_page_new_checksum(page);
+		break;
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		checksum = BUF_NO_CHECKSUM_MAGIC;
+		break;
+	/* no default so the compiler will emit a warning if new enum
+	is added and not handled here */
+	}
+
+	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
 
 	/* We overwrite the first 4 bytes of the end lsn field to store
 	the old formula checksum. Since it depends also on the field
 	FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
 	new formula checksum. */
 
+	if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
+	    || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
+
+		checksum = (ib_uint32_t) buf_calc_page_old_checksum(page);
+
+		/* In other cases we use the value assigned from above.
+		If CRC32 is used then it is faster to use that checksum
+		(calculated above) instead of calculating another one.
+		We can afford to store something other than
+		buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
+		this field because the file will not be readable by old
+		versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
+	}
+
 	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
-			srv_use_checksums
-			? buf_calc_page_old_checksum(page)
-			: BUF_NO_CHECKSUM_MAGIC);
+			checksum);
 }
 
 #ifndef UNIV_HOTBACKUP
 /********************************************************************//**
+Flush a batch of writes to the datafiles that have already been
+written by the OS. */
+UNIV_INTERN
+void
+buf_flush_sync_datafiles(void)
+/*==========================*/
+{
+	/* Wake possible simulated aio thread to actually post the
+	writes to the operating system */
+	os_aio_simulated_wake_handler_threads();
+
+	/* Wait that all async writes to tablespaces have been posted to
+	the OS */
+	os_aio_wait_until_no_pending_writes();
+
+	/* Now we flush the data to disk (for example, with fsync) */
+	fil_flush_file_spaces(FIL_TABLESPACE);
+
+	return;
+}
+
+/********************************************************************//**
 Does an asynchronous write of a buffer page. NOTE: in simulated aio and
 also when the doublewrite buffer is used, we must call
-buf_flush_buffered_writes after we have posted a batch of writes! */
+buf_dblwr_flush_buffered_writes after we have posted a batch of
+writes! */
 static
 void
 buf_flush_write_block_low(
 /*======================*/
-	buf_page_t*	bpage)	/*!< in: buffer block to write */
+	buf_page_t*	bpage,		/*!< in: buffer block to write */
+	enum buf_flush	flush_type)	/*!< in: type of flush */
 {
 	ulint	zip_size	= buf_page_get_zip_size(bpage);
 	page_t*	frame		= NULL;
@@ -1131,10 +909,9 @@ buf_flush_write_block_low(
 		break;
 	case BUF_BLOCK_ZIP_DIRTY:
 		frame = bpage->zip.data;
-		if (UNIV_LIKELY(srv_use_checksums)) {
-			ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
-			     == page_zip_calc_checksum(frame, zip_size));
-		}
+
+		ut_a(page_zip_verify_checksum(frame, zip_size));
+
 		mach_write_to_8(frame + FIL_PAGE_LSN,
 				bpage->newest_modification);
 		memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
@@ -1152,93 +929,18 @@ buf_flush_write_block_low(
 		break;
 	}
 
-	if (!srv_use_doublewrite_buf || !trx_doublewrite) {
+	if (!srv_use_doublewrite_buf || !buf_dblwr) {
 		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
 		       FALSE, buf_page_get_space(bpage), zip_size,
 		       buf_page_get_page_no(bpage), 0,
 		       zip_size ? zip_size : UNIV_PAGE_SIZE,
 		       frame, bpage);
+	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
+		buf_dblwr_write_single_page(bpage);
 	} else {
-		buf_flush_post_to_doublewrite_buf(bpage);
-	}
-}
-
-# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-/********************************************************************//**
-Writes a flushable page asynchronously from the buffer pool to a file.
-NOTE: buf_pool->mutex and block->mutex must be held upon entering this
-function, and they will be released by this function after flushing.
-This is loosely based on buf_flush_batch() and buf_flush_page().
-@return TRUE if the page was flushed and the mutexes released */
-UNIV_INTERN
-ibool
-buf_flush_page_try(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	buf_block_t*	block)		/*!< in/out: buffer control block */
-{
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
-	ut_ad(mutex_own(&block->mutex));
-
-	if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_LRU)) {
-		return(FALSE);
-	}
-
-	if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
-	    || buf_pool->init_flush[BUF_FLUSH_LRU]) {
-		/* There is already a flush batch of the same type running */
-		return(FALSE);
-	}
-
-	buf_pool->init_flush[BUF_FLUSH_LRU] = TRUE;
-
-	buf_page_set_io_fix(&block->page, BUF_IO_WRITE);
-
-	buf_page_set_flush_type(&block->page, BUF_FLUSH_LRU);
-
-	if (buf_pool->n_flush[BUF_FLUSH_LRU]++ == 0) {
-
-		os_event_reset(buf_pool->no_flush[BUF_FLUSH_LRU]);
+		buf_dblwr_add_to_batch(bpage);
 	}
-
-	/* VERY IMPORTANT:
-	Because any thread may call the LRU flush, even when owning
-	locks on pages, to avoid deadlocks, we must make sure that the
-	s-lock is acquired on the page without waiting: this is
-	accomplished because buf_flush_ready_for_flush() must hold,
-	and that requires the page not to be bufferfixed. */
-
-	rw_lock_s_lock_gen(&block->lock, BUF_IO_WRITE);
-
-	/* Note that the s-latch is acquired before releasing the
-	buf_pool mutex: this ensures that the latch is acquired
-	immediately. */
-
-	mutex_exit(&block->mutex);
-	buf_pool_mutex_exit(buf_pool);
-
-	/* Even though block is not protected by any mutex at this
-	point, it is safe to access block, because it is io_fixed and
-	oldest_modification != 0.  Thus, it cannot be relocated in the
-	buffer pool or removed from flush_list or LRU_list. */
-
-	buf_flush_write_block_low(&block->page);
-
-	buf_pool_mutex_enter(buf_pool);
-	buf_pool->init_flush[BUF_FLUSH_LRU] = FALSE;
-
-	if (buf_pool->n_flush[BUF_FLUSH_LRU] == 0) {
-		/* The running flush batch has ended */
-		os_event_set(buf_pool->no_flush[BUF_FLUSH_LRU]);
-	}
-
-	buf_pool_mutex_exit(buf_pool);
-	buf_flush_buffered_writes();
-
-	return(TRUE);
 }
-# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 
 /********************************************************************//**
 Writes a flushable page asynchronously from the buffer pool to a file.
@@ -1253,13 +955,12 @@ buf_flush_page(
 /*===========*/
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
 	buf_page_t*	bpage,		/*!< in: buffer control block */
-	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
-					or BUF_FLUSH_LIST */
+	enum buf_flush	flush_type)	/*!< in: type of flush */
 {
 	mutex_t*	block_mutex;
 	ibool		is_uncompressed;
 
-	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
 	ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(buf_page_in_file(bpage));
 
@@ -1305,7 +1006,7 @@ buf_flush_page(
 		flush_list or LRU_list. */
 
 		if (!is_s_latched) {
-			buf_flush_buffered_writes();
+			buf_dblwr_flush_buffered_writes();
 
 			if (is_uncompressed) {
 				rw_lock_s_lock_gen(&((buf_block_t*) bpage)
@@ -1316,12 +1017,17 @@ buf_flush_page(
 		break;
 
 	case BUF_FLUSH_LRU:
+	case BUF_FLUSH_SINGLE_PAGE:
 		/* VERY IMPORTANT:
-		Because any thread may call the LRU flush, even when owning
-		locks on pages, to avoid deadlocks, we must make sure that the
-		s-lock is acquired on the page without waiting: this is
-		accomplished because buf_flush_ready_for_flush() must hold,
-		and that requires the page not to be bufferfixed. */
+		Because any thread may call single page flush, even when
+		owning locks on pages, to avoid deadlocks, we must make
+		sure that the s-lock is acquired on the page without
+		waiting: this is accomplished because
+		buf_flush_ready_for_flush() must hold, and that requires
+		the page not to be bufferfixed.
+		The same holds true for LRU flush because a user thread
+		may end up waiting for an LRU flush to end while
+		holding locks on other pages. */
 
 		if (is_uncompressed) {
 			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
@@ -1352,9 +1058,38 @@ buf_flush_page(
 			flush_type, bpage->space, bpage->offset);
 	}
 #endif /* UNIV_DEBUG */
-	buf_flush_write_block_low(bpage);
+	buf_flush_write_block_low(bpage, flush_type);
 }
 
+# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: buf_pool->mutex and block->mutex must be held upon entering this
+function, and they will be released by this function after flushing.
+This is loosely based on buf_flush_batch() and buf_flush_page().
+@return TRUE if the page was flushed and the mutexes released */
+UNIV_INTERN
+ibool
+buf_flush_page_try(
+/*===============*/
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
+	buf_block_t*	block)		/*!< in/out: buffer control block */
+{
+	ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(mutex_own(&block->mutex));
+
+	if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
+		return(FALSE);
+	}
+
+	/* The following call will release the buffer pool and
+	block mutex. */
+	buf_flush_page(buf_pool, &block->page, BUF_FLUSH_SINGLE_PAGE);
+	buf_flush_sync_datafiles();
+	return(TRUE);
+}
+# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
 /***********************************************************//**
 Flushes to disk all flushable pages within the flush area.
 @return	number of pages flushed */
@@ -1379,10 +1114,10 @@ buf_flush_try_neighbors(
 
 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
 
-	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
-		/* If there is little space, it is better not to flush
-		any block except from the end of the LRU list */
-
+	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
+	    || !srv_flush_neighbors) {
+		/* If there is little space or neighbor flushing is
+		not enabled then just flush the victim. */
 		low = offset;
 		high = offset + 1;
 	} else {
@@ -1391,7 +1126,7 @@ buf_flush_try_neighbors(
 		original page. */
 
 		ulint	buf_flush_area;
-	
+
 		buf_flush_area	= ut_min(
 			BUF_READ_AHEAD_AREA(buf_pool),
 			buf_pool->curr_size / 16);
@@ -1473,6 +1208,14 @@ buf_flush_try_neighbors(
 		buf_pool_mutex_exit(buf_pool);
 	}
 
+	if (count > 0) {
+		MONITOR_INC_VALUE_CUMULATIVE(
+					MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+					MONITOR_FLUSH_NEIGHBOR_COUNT,
+					MONITOR_FLUSH_NEIGHBOR_PAGES,
+					(count - 1));
+	}
+
 	return(count);
 }
 
@@ -1480,7 +1223,7 @@ buf_flush_try_neighbors(
 Check if the block is modified and ready for flushing. If the the block
 is ready to flush then flush the page and try o flush its neighbors.
 
-@return	TRUE if buf_pool mutex was not released during this function.
+@return	TRUE if buf_pool mutex was released during this function.
 This does not guarantee that some pages were written as well.
 Number of pages written are incremented to the count. */
 static
@@ -1545,37 +1288,134 @@ buf_flush_page_and_try_neighbors(
 }
 
 /*******************************************************************//**
+This utility moves the uncompressed frames of pages to the free list.
+Note that this function does not actually flush any data to disk. It
+just detaches the uncompressed frames from the compressed pages at the
+tail of the unzip_LRU and puts those freed frames in the free list.
+Note that it is a best effort attempt and it is not guaranteed that
+after a call to this function there will be 'max' blocks in the free
+list.
+@return number of blocks moved to the free list. */
+static
+ulint
+buf_free_from_unzip_LRU_list_batch(
+/*===============================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		max)		/*!< in: desired number of
+					blocks in the free_list */
+{
+	buf_block_t*	block;
+	ulint		scanned = 0;
+	ulint		count = 0;
+	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
+	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+
+	ut_ad(buf_pool_mutex_own(buf_pool));
+
+	block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+	while (block != NULL && count < max
+	       && free_len < srv_LRU_scan_depth
+	       && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
+
+		++scanned;
+		if (buf_LRU_free_block(&block->page, FALSE)) {
+			/* Block was freed. buf_pool->mutex potentially
+			released and reacquired */
+			++count;
+			block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+
+		} else {
+
+			block = UT_LIST_GET_PREV(unzip_LRU, block);
+		}
+
+		free_len = UT_LIST_GET_LEN(buf_pool->free);
+		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+	}
+
+	ut_ad(buf_pool_mutex_own(buf_pool));
+
+	if (scanned) {
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_LRU_BATCH_SCANNED,
+			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+			scanned);
+	}
+
+	return(count);
+}
+
+/*******************************************************************//**
 This utility flushes dirty blocks from the end of the LRU list.
-In the case of an LRU flush the calling thread may own latches to
-pages: to avoid deadlocks, this function must be written so that it
-cannot end up waiting for these latches!
+The calling thread is not allowed to own any latches on pages!
+It attempts to make 'max' blocks available in the free list. Note that
+it is a best effort attempt and it is not guaranteed that after a call
+to this function there will be 'max' blocks in the free list.
 @return number of blocks for which the write request was queued. */
 static
 ulint
 buf_flush_LRU_list_batch(
 /*=====================*/
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		max)		/*!< in: max of blocks to flush */
+	ulint		max)		/*!< in: desired number of
+					blocks in the free_list */
 {
 	buf_page_t*	bpage;
+	ulint		scanned = 0;
 	ulint		count = 0;
+	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
+	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
 
 	ut_ad(buf_pool_mutex_own(buf_pool));
 
-	do {
-		/* Start from the end of the list looking for a
-		suitable block to be flushed. */
-		bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+	while (bpage != NULL && count < max
+	       && free_len < srv_LRU_scan_depth
+	       && lru_len > BUF_LRU_MIN_LEN) {
 
-		/* Iterate backwards over the flush list till we find
-		a page that isn't ready for flushing. */
-		while (bpage != NULL
-		       && !buf_flush_page_and_try_neighbors(
-				bpage, BUF_FLUSH_LRU, max, &count)) {
+		mutex_t* block_mutex = buf_page_get_mutex(bpage);
+		ibool	 evict;
 
+		mutex_enter(block_mutex);
+		evict = buf_flush_ready_for_replace(bpage);
+		mutex_exit(block_mutex);
+
+		++scanned;
+
+		/* If the block is ready to be replaced we try to
+		free it i.e.: put it on the free list.
+		Otherwise we try to flush the block and its
+		neighbors. In this case we'll put it on the
+		free list in the next pass. We do this extra work
+		of putting blocks to the free list instead of
+		just flushing them because after every flush
+		we have to restart the scan from the tail of
+		the LRU list and if we don't clear the tail
+		of the flushed pages then the scan becomes
+		O(n*n). */
+		if (evict) {
+			if (buf_LRU_free_block(bpage, TRUE)) {
+				/* buf_pool->mutex was potentially
+				released and reacquired. */
+				bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+			} else {
+				bpage = UT_LIST_GET_PREV(LRU, bpage);
+			}
+		} else if (buf_flush_page_and_try_neighbors(
+				bpage,
+				BUF_FLUSH_LRU, max, &count)) {
+
+			/* buf_pool->mutex was released.
+			Restart the scan. */
+			bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+		} else {
 			bpage = UT_LIST_GET_PREV(LRU, bpage);
 		}
-	} while (bpage != NULL && count < max);
+
+		free_len = UT_LIST_GET_LEN(buf_pool->free);
+		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+	}
 
 	/* We keep track of all flushes happening as part of LRU
 	flush. When estimating the desired rate at which flush_list
@@ -1584,6 +1424,41 @@ buf_flush_LRU_list_batch(
 
 	ut_ad(buf_pool_mutex_own(buf_pool));
 
+	if (scanned) {
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_LRU_BATCH_SCANNED,
+			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+			scanned);
+	}
+
+	return(count);
+}
+
+/*******************************************************************//**
+Flush and move pages from LRU or unzip_LRU list to the free list.
+Whether LRU or unzip_LRU is used depends on the state of the system.
+@return number of blocks for which either the write request was queued
+or in case of unzip_LRU the number of blocks actually moved to the
+free list */
+static
+ulint
+buf_do_LRU_batch(
+/*=============*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ulint		max)		/*!< in: desired number of
+					blocks in the free_list */
+{
+	ulint	count = 0;
+
+	if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
+		count += buf_free_from_unzip_LRU_list_batch(buf_pool, max);
+	}
+
+	if (max > count) {
+		count += buf_flush_LRU_list_batch(buf_pool, max - count);
+	}
+
 	return(count);
 }
 
@@ -1595,14 +1470,14 @@ ULINT_UNDEFINED if there was a flush of the same type already
 running */
 static
 ulint
-buf_flush_flush_list_batch(
-/*=======================*/
+buf_do_flush_list_batch(
+/*====================*/
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
 	ulint		min_n,		/*!< in: wished minimum mumber
 					of blocks flushed (it is not
 					guaranteed that the actual
 					number is that big, though) */
-	ib_uint64_t	lsn_limit)	/*!< all blocks whose
+	lsn_t		lsn_limit)	/*!< all blocks whose
 					oldest_modification is smaller
 					than this should be flushed (if
 					their number does not exceed
@@ -1611,6 +1486,7 @@ buf_flush_flush_list_batch(
 	ulint		len;
 	buf_page_t*	bpage;
 	ulint		count = 0;
+	ulint		scanned = 0;
 
 	ut_ad(buf_pool_mutex_own(buf_pool));
 
@@ -1654,6 +1530,7 @@ buf_flush_flush_list_batch(
 		       && !buf_flush_page_and_try_neighbors(
 				bpage, BUF_FLUSH_LIST, min_n, &count)) {
 
+			++scanned;
 			buf_flush_list_mutex_enter(buf_pool);
 
 			/* If we are here that means that buf_pool->mutex
@@ -1683,6 +1560,11 @@ buf_flush_flush_list_batch(
 
 	} while (count < min_n && bpage != NULL && len > 0);
 
+	MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
+				     MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+				     MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+				     scanned);
+
 	ut_ad(buf_pool_mutex_own(buf_pool));
 
 	return(count);
@@ -1708,7 +1590,7 @@ buf_flush_batch(
 	ulint		min_n,		/*!< in: wished minimum mumber of blocks
 					flushed (it is not guaranteed that the
 					actual number is that big, though) */
-	ib_uint64_t	lsn_limit)	/*!< in: in the case of BUF_FLUSH_LIST
+	lsn_t		lsn_limit)	/*!< in: in the case of BUF_FLUSH_LIST
 					all blocks whose oldest_modification is
 					smaller than this should be flushed
 					(if their number does not exceed
@@ -1726,12 +1608,12 @@ buf_flush_batch(
 
 	/* Note: The buffer pool mutex is released and reacquired within
 	the flush functions. */
-	switch(flush_type) {
+	switch (flush_type) {
 	case BUF_FLUSH_LRU:
-		count = buf_flush_LRU_list_batch(buf_pool, min_n);
+		count = buf_do_LRU_batch(buf_pool, min_n);
 		break;
 	case BUF_FLUSH_LIST:
-		count = buf_flush_flush_list_batch(buf_pool, min_n, lsn_limit);
+		count = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
 		break;
 	default:
 		ut_error;
@@ -1739,7 +1621,7 @@ buf_flush_batch(
 
 	buf_pool_mutex_exit(buf_pool);
 
-	buf_flush_buffered_writes();
+	buf_dblwr_flush_buffered_writes();
 
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints && count > 0) {
@@ -1750,6 +1632,8 @@ buf_flush_batch(
 	}
 #endif /* UNIV_DEBUG */
 
+	srv_buf_pool_flushed += count;
+
 	return(count);
 }
 
@@ -1762,7 +1646,7 @@ buf_flush_common(
 	enum buf_flush	flush_type,	/*!< in: type of flush */
 	ulint		page_count)	/*!< in: number of pages flushed */
 {
-	buf_flush_buffered_writes();
+	buf_dblwr_flush_buffered_writes();
 
 	ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
 
@@ -1776,6 +1660,13 @@ buf_flush_common(
 #endif /* UNIV_DEBUG */
 
 	srv_buf_pool_flushed += page_count;
+
+	if (flush_type == BUF_FLUSH_LRU) {
+		/* We keep track of all flushes happening as part of LRU
+		flush. When estimating the desired rate at which flush_list
+		should be flushed we factor in this value. */
+		buf_lru_flush_page_count += page_count;
+	}
 }
 
 /******************************************************************//**
@@ -1821,6 +1712,8 @@ buf_flush_end(
 
 	buf_pool->init_flush[flush_type] = FALSE;
 
+	buf_pool->try_LRU_scan = TRUE;
+
 	if (buf_pool->n_flush[flush_type] == 0) {
 
 		/* The running flush batch has ended */
@@ -1863,17 +1756,17 @@ buf_flush_wait_batch_end(
 }
 
 /*******************************************************************//**
-This utility flushes dirty blocks from the end of the LRU list.
-NOTE: The calling thread may own latches to pages: to avoid deadlocks,
-this function must be written so that it cannot end up waiting for these
-latches!
+This utility flushes dirty blocks from the end of the LRU list and also
+puts replaceable clean pages from the end of the LRU list to the free
+list.
+NOTE: The calling thread is not allowed to own any latches on pages!
 @return number of blocks for which the write request was queued;
 ULINT_UNDEFINED if there was a flush of the same type already running */
-UNIV_INTERN
+static
 ulint
 buf_flush_LRU(
 /*==========*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
 	ulint		min_n)		/*!< in: wished minimum mumber of blocks
 					flushed (it is not guaranteed that the
 					actual number is that big, though) */
@@ -1906,7 +1799,7 @@ buf_flush_list(
 	ulint		min_n,		/*!< in: wished minimum mumber of blocks
 					flushed (it is not guaranteed that the
 					actual number is that big, though) */
-	ib_uint64_t	lsn_limit)	/*!< in the case BUF_FLUSH_LIST all
+	lsn_t		lsn_limit)	/*!< in the case BUF_FLUSH_LIST all
 					blocks whose oldest_modification is
 					smaller than this should be flushed
 					(if their number does not exceed
@@ -1956,112 +1849,112 @@ buf_flush_list(
 		buf_flush_common(BUF_FLUSH_LIST, page_count);
 
 		total_page_count += page_count;
+
+		if (page_count) {
+			MONITOR_INC_VALUE_CUMULATIVE(
+				MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+				MONITOR_FLUSH_BATCH_COUNT,
+				MONITOR_FLUSH_BATCH_PAGES,
+				page_count);
+		}
 	}
 
-	return(lsn_limit != IB_ULONGLONG_MAX && skipped
+	return(lsn_limit != LSN_MAX && skipped
 	       ? ULINT_UNDEFINED : total_page_count);
 }
- 
+
 /******************************************************************//**
-Gives a recommendation of how many blocks should be flushed to establish
-a big enough margin of replaceable blocks near the end of the LRU list
-and in the free list.
-@return number of blocks which should be flushed from the end of the
-LRU list */
-static
-ulint
-buf_flush_LRU_recommendation(
-/*=========================*/
-	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
+This function picks up a single dirty page from the tail of the LRU
+list, flushes it, removes it from page_hash and LRU list and puts
+it on the free list. It is called from user threads when they are
+unable to find a replaceable page at the tail of the LRU list i.e.:
+when the background LRU flushing in the page_cleaner thread is not
+fast enough to keep pace with the workload.
+@return TRUE if success. */
+UNIV_INTERN
+ibool
+buf_flush_single_page_from_LRU(
+/*===========================*/
+	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
 {
+	ulint		scanned;
 	buf_page_t*	bpage;
-	ulint		n_replaceable;
-	ulint		distance	= 0;
+	mutex_t*	block_mutex;
+	ibool		freed;
+	ibool		evict_zip;
 
 	buf_pool_mutex_enter(buf_pool);
 
-	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
-
-	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
-
-	while ((bpage != NULL)
-	       && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
-		   + BUF_FLUSH_EXTRA_MARGIN(buf_pool))
-	       && (distance < BUF_LRU_FREE_SEARCH_LEN(buf_pool))) {
-
-		mutex_t* block_mutex = buf_page_get_mutex(bpage);
+	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1;
+	     bpage != NULL;
+	     bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) {
 
+		block_mutex = buf_page_get_mutex(bpage);
 		mutex_enter(block_mutex);
-
-		if (buf_flush_ready_for_replace(bpage)) {
-			n_replaceable++;
+		if (buf_flush_ready_for_flush(bpage,
+					      BUF_FLUSH_SINGLE_PAGE)) {
+			/* buf_flush_page() will release the block
+			mutex */
+			break;
 		}
-
 		mutex_exit(block_mutex);
-
-		distance++;
-
-		bpage = UT_LIST_GET_PREV(LRU, bpage);
 	}
 
-	buf_pool_mutex_exit(buf_pool);
-
-	if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)) {
+	MONITOR_INC_VALUE_CUMULATIVE(
+		MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+		MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+		MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
+		scanned);
 
-		return(0);
+	if (!bpage) {
+		/* Can't find a single flushable page. */
+		buf_pool_mutex_exit(buf_pool);
+		return(FALSE);
 	}
 
-	return(BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
-	       + BUF_FLUSH_EXTRA_MARGIN(buf_pool)
-	       - n_replaceable);
-}
-
-/*********************************************************************//**
-Flushes pages from the end of the LRU list if there is too small a margin
-of replaceable pages there or in the free list. VERY IMPORTANT: this function
-is called also by threads which have locks on pages. To avoid deadlocks, we
-flush only pages such that the s-lock required for flushing can be acquired
-immediately, without waiting. */
-UNIV_INTERN
-void
-buf_flush_free_margin(
-/*==================*/
-	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
-{
-	ulint	n_to_flush;
+	/* The following call will release the buffer pool and
+	block mutex. */
+	buf_flush_page(buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE);
 
-	n_to_flush = buf_flush_LRU_recommendation(buf_pool);
+	buf_flush_sync_datafiles();
 
-	if (n_to_flush > 0) {
-		ulint	n_flushed;
+	/* At this point the page has been written to the disk.
+	As we are not holding buffer pool or block mutex therefore
+	we cannot use the bpage safely. It may have been plucked out
+	of the LRU list by some other thread or it may even have
+	relocated in case of a compressed page. We need to start
+	the scan of LRU list again to remove the block from the LRU
+	list and put it on the free list. */
+	buf_pool_mutex_enter(buf_pool);
 
-		n_flushed = buf_flush_LRU(buf_pool, n_to_flush);
+	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+	     bpage != NULL;
+	     bpage = UT_LIST_GET_PREV(LRU, bpage)) {
 
-		if (n_flushed == ULINT_UNDEFINED) {
-			/* There was an LRU type flush batch already running;
-			let us wait for it to end */
+		ibool	ready;
 
-			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
+		block_mutex = buf_page_get_mutex(bpage);
+		mutex_enter(block_mutex);
+		ready = buf_flush_ready_for_replace(bpage);
+		mutex_exit(block_mutex);
+		if (ready) {
+			break;
 		}
+
 	}
-}
 
-/*********************************************************************//**
-Flushes pages from the end of all the LRU lists. */
-UNIV_INTERN
-void
-buf_flush_free_margins(void)
-/*========================*/
-{
-	ulint	i;
+	if (!bpage) {
+		/* Can't find a single replaceable page. */
+		buf_pool_mutex_exit(buf_pool);
+		return(FALSE);
+	}
 
-	for (i = 0; i < srv_buf_pool_instances; i++) {
-		buf_pool_t*	buf_pool;
+	evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);;
 
-		buf_pool = buf_pool_from_array(i);
+	freed = buf_LRU_free_block(bpage, evict_zip);
+	buf_pool_mutex_exit(buf_pool);
 
-		buf_flush_free_margin(buf_pool);
-	}
+	return(freed);
 }
 
 /*********************************************************************
@@ -2075,8 +1968,8 @@ buf_flush_stat_update(void)
 /*=======================*/
 {
 	buf_flush_stat_t*	item;
-	ib_uint64_t		lsn_diff;
-	ib_uint64_t		lsn;
+	lsn_t			lsn_diff;
+	lsn_t			lsn;
 	ulint			n_flushed;
 
 	lsn = log_get_lsn();
@@ -2119,19 +2012,18 @@ in the number of dirty pages (for example, an in-memory workload)
 it can cause IO bursts of flushing. This function implements heuristics
 to avoid this burstiness.
 @return	number of dirty pages to be flushed / second */
-UNIV_INTERN
+static
 ulint
 buf_flush_get_desired_flush_rate(void)
 /*==================================*/
 {
 	ulint		i;
-	lint		rate;
-	ulint		redo_avg;
+	lsn_t		redo_avg;
 	ulint		n_dirty = 0;
-	ulint		n_flush_req;
-	ulint		lru_flush_avg;
-	ib_uint64_t	lsn = log_get_lsn();
-	ulint		log_capacity = log_get_capacity();
+	ib_uint64_t	n_flush_req;
+	ib_uint64_t	lru_flush_avg;
+	lsn_t		lsn = log_get_lsn();
+	lsn_t		log_capacity = log_get_capacity();
 
 	/* log_capacity should never be zero after the initialization
 	of log subsystem. */
@@ -2154,9 +2046,8 @@ buf_flush_get_desired_flush_rate(void)
 	/* redo_avg below is average at which redo is generated in
 	past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
 	interval. */
-	redo_avg = (ulint) (buf_flush_stat_sum.redo
-			    / BUF_FLUSH_STAT_N_INTERVAL
-			    + (lsn - buf_flush_stat_cur.redo));
+	redo_avg = buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL
+		+ (lsn - buf_flush_stat_cur.redo);
 
 	/* An overflow can happen possibly if we flush more than 2^32
 	pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
@@ -2177,11 +2068,373 @@ buf_flush_get_desired_flush_rate(void)
 	list is the difference between the required rate and the
 	number of pages that we are historically flushing from the
 	LRU list */
-	rate = n_flush_req - lru_flush_avg;
-	return(rate > 0 ? (ulint) rate : 0);
+	if (n_flush_req <= lru_flush_avg) {
+		return(0);
+	} else {
+		ib_uint64_t	rate;
+
+		rate = n_flush_req - lru_flush_avg;
+
+		return((ulint) (rate < PCT_IO(100) ? rate : PCT_IO(100)));
+	}
+}
+
+/*********************************************************************//**
+Clears up tail of the LRU lists:
+* Put replaceable pages at the tail of LRU to the free list
+* Flush dirty pages at the tail of LRU to the disk
+The depth to which we scan each buffer pool is controlled by dynamic
+config parameter innodb_LRU_scan_depth.
+@return total pages flushed */
+UNIV_INLINE
+ulint
+page_cleaner_flush_LRU_tail(void)
+/*=============================*/
+{
+	ulint	i;
+	ulint	j;
+	ulint	total_flushed = 0;
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+
+		buf_pool_t*	buf_pool = buf_pool_from_array(i);
+
+		/* We divide LRU flush into smaller chunks because
+		there may be user threads waiting for the flush to
+		end in buf_LRU_get_free_block(). */
+		for (j = 0;
+		     j < srv_LRU_scan_depth;
+		     j += PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE) {
+
+			ulint	n_flushed = buf_flush_LRU(buf_pool,
+				PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE);
+
+			/* Currently page_cleaner is the only thread
+			that can trigger an LRU flush. It is possible
+			that a batch triggered during last iteration is
+			still running, */
+			if (n_flushed != ULINT_UNDEFINED) {
+				total_flushed += n_flushed;
+			}
+		}
+	}
+
+	if (total_flushed) {
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_LRU_BATCH_TOTAL_PAGE,
+			MONITOR_LRU_BATCH_COUNT,
+			MONITOR_LRU_BATCH_PAGES,
+			total_flushed);
+	}
+
+	return(total_flushed);
+}
+
+/*********************************************************************//**
+Wait for any possible LRU flushes that are in progress to end. */
+UNIV_INLINE
+void
+page_cleaner_wait_LRU_flush(void)
+/*=============================*/
+{
+	ulint	i;
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool;
+
+		buf_pool = buf_pool_from_array(i);
+
+		buf_pool_mutex_enter(buf_pool);
+
+		if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
+		   || buf_pool->init_flush[BUF_FLUSH_LRU]) {
+
+			buf_pool_mutex_exit(buf_pool);
+			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
+		} else {
+			buf_pool_mutex_exit(buf_pool);
+		}
+	}
+}
+
+/*********************************************************************//**
+Flush a batch of dirty pages from the flush list
+@return number of pages flushed, 0 if no page is flushed or if another
+flush_list type batch is running */
+static
+ulint
+page_cleaner_do_flush_batch(
+/*========================*/
+	ulint		n_to_flush,	/*!< in: number of pages that
+					we should attempt to flush. If
+					an lsn_limit is provided then
+					this value will have no affect */
+	lsn_t		lsn_limit)	/*!< in: LSN up to which flushing
+					must happen */
+{
+	ulint n_flushed;
+
+	ut_ad(n_to_flush == ULINT_MAX || lsn_limit == LSN_MAX);
+
+	n_flushed = buf_flush_list(n_to_flush, lsn_limit);
+	if (n_flushed == ULINT_UNDEFINED) {
+		n_flushed = 0;
+	}
+
+	return(n_flushed);
+}
+
+/*********************************************************************//**
+This function is called approximately once every second by the
+page_cleaner thread. Based on various factors it decides if there is a
+need to do flushing. If flushing is needed it is performed and the
+number of pages flushed is returned.
+@return number of pages flushed */
+static
+ulint
+page_cleaner_flush_pages_if_needed(void)
+/*====================================*/
+{
+	ulint	n_pages_flushed = 0;
+	lsn_t	lsn_limit = log_async_flush_lsn();
+
+	/* Currently we decide whether or not to flush and how much to
+	flush based on three factors.
+
+	1) If the amount of LSN for which pages are not flushed to disk
+	yet is greater than log_sys->max_modified_age_async. This is
+	the most urgent type of flush and we attempt to cleanup enough
+	of the tail of the flush_list to avoid flushing inside user
+	threads.
+
+	2) If modified page ratio is greater than the one specified by
+	the user. In that case we flush full 100% IO_CAPACITY of the
+	server. Note that 1 and 2 are not mutually exclusive. We can
+	end up executing both steps.
+
+	3) If adaptive_flushing is set by the user and neither of 1
+	or 2 has occurred above then we flush a batch based on our
+	heuristics. */
+
+	if (lsn_limit != LSN_MAX) {
+
+		/* async flushing is requested */
+		n_pages_flushed = page_cleaner_do_flush_batch(ULINT_MAX,
+							      lsn_limit);
+
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_FLUSH_ASYNC_TOTAL_PAGE,
+			MONITOR_FLUSH_ASYNC_COUNT,
+			MONITOR_FLUSH_ASYNC_PAGES,
+			n_pages_flushed);
+	}
+
+	if (UNIV_UNLIKELY(n_pages_flushed < PCT_IO(100)
+			  && buf_get_modified_ratio_pct()
+			     > srv_max_buf_pool_modified_pct)) {
+
+		/* Try to keep the number of modified pages in the
+		buffer pool under the limit wished by the user */
+
+		n_pages_flushed += page_cleaner_do_flush_batch(PCT_IO(100),
+							       LSN_MAX);
+
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_FLUSH_MAX_DIRTY_TOTAL_PAGE,
+			MONITOR_FLUSH_MAX_DIRTY_COUNT,
+			MONITOR_FLUSH_MAX_DIRTY_PAGES,
+			n_pages_flushed);
+	}
+
+	if (srv_adaptive_flushing && n_pages_flushed == 0) {
+
+		/* Try to keep the rate of flushing of dirty
+		pages such that redo log generation does not
+		produce bursts of IO at checkpoint time. */
+		ulint n_flush = buf_flush_get_desired_flush_rate();
+
+		ut_ad(n_flush <= PCT_IO(100));
+		if (n_flush) {
+			n_pages_flushed = page_cleaner_do_flush_batch(
+				n_flush, LSN_MAX);
+
+			MONITOR_INC_VALUE_CUMULATIVE(
+				MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+				MONITOR_FLUSH_ADAPTIVE_COUNT,
+				MONITOR_FLUSH_ADAPTIVE_PAGES,
+				n_pages_flushed);
+		}
+	}
+
+	return(n_pages_flushed);
+}
+
+/*********************************************************************//**
+Puts the page_cleaner thread to sleep if it has finished work in less
+than a second */
+static
+void
+page_cleaner_sleep_if_needed(
+/*=========================*/
+	ulint	next_loop_time)	/*!< in: time when next loop iteration
+				should start */
+{
+	ulint	cur_time = ut_time_ms();
+
+	if (next_loop_time > cur_time) {
+		/* Get sleep interval in micro seconds. We use
+		ut_min() to avoid long sleep in case of
+		wrap around. */
+		os_thread_sleep(ut_min(1000000,
+				(next_loop_time - cur_time)
+				 * 1000));
+	}
+}
+
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one instance of this thread.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_flush_page_cleaner_thread)(
+/*==========================================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	ulint	next_loop_time = ut_time_ms() + 1000;
+	ulint	n_flushed = 0;
+	ulint	last_activity = srv_get_activity_count();
+	ulint	i;
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(buf_page_cleaner_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+	buf_page_cleaner_is_active = TRUE;
+
+	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+		/* The page_cleaner skips sleep if the server is
+		idle and there are no pending IOs in the buffer pool
+		and there is work to do. */
+		if (srv_check_activity(last_activity)
+		    || buf_get_n_pending_read_ios()
+		    || n_flushed == 0) {
+			page_cleaner_sleep_if_needed(next_loop_time);
+		}
+
+		next_loop_time = ut_time_ms() + 1000;
+
+		if (srv_check_activity(last_activity)) {
+			last_activity = srv_get_activity_count();
+
+			/* Flush pages from end of LRU if required */
+			n_flushed = page_cleaner_flush_LRU_tail();
+
+			/* Flush pages from flush_list if required */
+			n_flushed += page_cleaner_flush_pages_if_needed();
+		} else {
+			n_flushed = page_cleaner_do_flush_batch(
+							PCT_IO(100),
+							LSN_MAX);
+
+			if (n_flushed) {
+				MONITOR_INC_VALUE_CUMULATIVE(
+					MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+					MONITOR_FLUSH_BACKGROUND_COUNT,
+					MONITOR_FLUSH_BACKGROUND_PAGES,
+					n_flushed);
+			}
+		}
+	}
+
+	ut_ad(srv_shutdown_state > 0);
+	if (srv_fast_shutdown == 2) {
+		/* In very fast shutdown we simulate a crash of
+		buffer pool. We are not required to do any flushing */
+		goto thread_exit;
+	}
+
+	/* In case of normal and slow shutdown the page_cleaner thread
+	must wait for all other activity in the server to die down.
+	Note that we can start flushing the buffer pool as soon as the
+	server enters shutdown phase but we must stay alive long enough
+	to ensure that any work done by the master or purge threads is
+	also flushed.
+	During shutdown we pass through two stages. In the first stage,
+	when SRV_SHUTDOWN_CLEANUP is set other threads like the master
+	and the purge threads may be working as well. We start flushing
+	the buffer pool but can't be sure that no new pages are being
+	dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
+
+	do {
+		n_flushed = page_cleaner_do_flush_batch(PCT_IO(100), LSN_MAX);
+
+		/* We sleep only if there are no pages to flush */
+		if (n_flushed == 0) {
+			os_thread_sleep(100000);
+		}
+	} while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
+
+	/* At this point all threads including the master and the purge
+	thread must have been suspended. */
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
+	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
+
+	/* We can now make a final sweep on flushing the buffer pool
+	and exit after we have cleaned the whole buffer pool.
+	It is important that we wait for any running batch that has
+	been triggered by us to finish. Otherwise we can end up
+	considering end of that batch as a finish of our final
+	sweep and we'll come out of the loop leaving behind dirty pages
+	in the flush_list */
+	buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+	page_cleaner_wait_LRU_flush();
+
+	do {
+
+		n_flushed = buf_flush_list(PCT_IO(100), LSN_MAX);
+		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+	} while (n_flushed > 0);
+
+	/* Some sanity checks */
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
+	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t* buf_pool = buf_pool_from_array(i);
+		ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
+	}
+
+	/* We have lived our life. Time to die. */
+
+thread_exit:
+	buf_page_cleaner_is_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
 }
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+
+/** Functor to validate the flush list. */
+struct	Check {
+	void	operator()(const buf_page_t* elem)
+	{
+		ut_a(elem->in_flush_list);
+	}
+};
+
 /******************************************************************//**
 Validates the flush list.
 @return	TRUE if ok */
@@ -2196,8 +2449,7 @@ buf_flush_validate_low(
 
 	ut_ad(buf_flush_list_mutex_own(buf_pool));
 
-	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
-			 ut_ad(ut_list_node_313->in_flush_list));
+	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, Check());
 
 	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
 
@@ -2209,7 +2461,7 @@ buf_flush_validate_low(
 	}
 
 	while (bpage != NULL) {
-		const ib_uint64_t om = bpage->oldest_modification;
+		const lsn_t	om = bpage->oldest_modification;
 
 		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
 
diff --git a/storage/innobase/buf/buf0lru.c b/storage/innobase/buf/buf0lru.cc
index 8e787fdba17..92883269d42 100644
--- a/storage/innobase/buf/buf0lru.c
+++ b/storage/innobase/buf/buf0lru.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file buf/buf0lru.c
+@file buf/buf0lru.cc
 The database buffer replacement algorithm
 
 Created 11/5/1995 Heikki Tuuri
@@ -25,6 +25,7 @@ Created 11/5/1995 Heikki Tuuri
 
 #include "buf0lru.h"
 
+#ifndef UNIV_HOTBACKUP
 #ifdef UNIV_NONINL
 #include "buf0lru.ic"
 #endif
@@ -40,6 +41,7 @@ Created 11/5/1995 Heikki Tuuri
 #include "btr0btr.h"
 #include "buf0buddy.h"
 #include "buf0buf.h"
+#include "buf0dblwr.h"
 #include "buf0flu.h"
 #include "buf0rea.h"
 #include "btr0sea.h"
@@ -48,6 +50,7 @@ Created 11/5/1995 Heikki Tuuri
 #include "page0zip.h"
 #include "log0recv.h"
 #include "srv0srv.h"
+#include "srv0mon.h"
 
 /** The number of blocks from the LRU_old pointer onward, including
 the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
@@ -125,7 +128,11 @@ UNIV_INTERN uint	buf_LRU_old_threshold_ms;
 /******************************************************************//**
 Takes a block out of the LRU list and page hash table.
 If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
-the object will be freed and buf_pool->zip_mutex will be released.
+the object will be freed.
+
+The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex
+and the appropriate hash_lock. This function will release the
+buf_page_get_mutex() and the hash_lock.
 
 If a compressed page or a compressed-only block descriptor is freed,
 other compressed pages or compressed-only block descriptors may be
@@ -154,7 +161,7 @@ buf_LRU_block_free_hashed_page(
 Determines if the unzip_LRU list should be used for evicting a victim
 instead of the general LRU list.
 @return	TRUE if should use unzip_LRU */
-UNIV_INLINE
+UNIV_INTERN
 ibool
 buf_LRU_evict_from_unzip_LRU(
 /*=========================*/
@@ -247,8 +254,8 @@ buf_LRU_drop_page_hash_for_tablespace(
 		return;
 	}
 
-	page_arr = ut_malloc(
-		sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE);
+	page_arr = static_cast<ulint*>(ut_malloc(
+		sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE));
 
 	buf_pool_mutex_enter(buf_pool);
 	num_entries = 0;
@@ -335,275 +342,39 @@ next_page:
 }
 
 /******************************************************************//**
-While flushing (or removing dirty) pages from a tablespace we don't
-want to hog the CPU and resources. Release the buffer pool and block
-mutex and try to force a context switch. Then reacquire the same mutexes.
-The current page is "fixed" before the release of the mutexes and then
-"unfixed" again once we have reacquired the mutexes. */
-static
-void
-buf_flush_yield(
-/*============*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	buf_page_t*	bpage)		/*!< in/out: current page */
-{
-	mutex_t*	block_mutex;
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_page_in_file(bpage));
-
-	block_mutex = buf_page_get_mutex(bpage);
-
-	mutex_enter(block_mutex);
-	/* "Fix" the block so that the position cannot be
-	changed after we release the buffer pool and
-	block mutexes. */
-	buf_page_set_sticky(bpage);
-
-	/* Now it is safe to release the buf_pool->mutex. */
-	buf_pool_mutex_exit(buf_pool);
-
-	mutex_exit(block_mutex);
-	/* Try and force a context switch. */
-	os_thread_yield();
-
-	buf_pool_mutex_enter(buf_pool);
-
-	mutex_enter(block_mutex);
-	/* "Unfix" the block now that we have both the
-	buffer pool and block mutex again. */
-	buf_page_unset_sticky(bpage);
-	mutex_exit(block_mutex);
-}
-
-/******************************************************************//**
-If we have hogged the resources for too long then release the buffer
-pool and flush list mutex and do a thread yield. Set the current page
-to "sticky" so that it is not relocated during the yield.
-@return TRUE if yielded */
-static
-ibool
-buf_flush_try_yield(
-/*================*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	buf_page_t*	bpage,		/*!< in/out: bpage to remove */
-	ulint		processed)	/*!< in: number of pages processed */
-{
-	/* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
-	loop we release buf_pool->mutex to let other threads
-	do their job but only if the block is not IO fixed. This
-	ensures that the block stays in its position in the
-	flush_list. */
-
-	if (bpage != NULL
-	    && processed >= BUF_LRU_DROP_SEARCH_SIZE
-	    && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
-
-		buf_flush_list_mutex_exit(buf_pool);
-
-		/* Release the buffer pool and block mutex
-		to give the other threads a go. */
-
-		buf_flush_yield(buf_pool, bpage);
-
-		buf_flush_list_mutex_enter(buf_pool);
-
-		/* Should not have been removed from the flush
-		list during the yield. However, this check is
-		not sufficient to catch a remove -> add. */
-
-		ut_ad(bpage->in_flush_list);
-
-		return(TRUE);
-	}
-
-	return(FALSE);
-}
-
-/******************************************************************//**
-Removes a single page from a given tablespace inside a specific
-buffer pool instance.
-@return TRUE if page was removed. */
-static
-ibool
-buf_flush_or_remove_page(
-/*=====================*/
-	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
-	buf_page_t*	bpage)		/*!< in/out: bpage to remove */
-{
-	mutex_t*	block_mutex;
-	ibool		processed = FALSE;
-
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(buf_flush_list_mutex_own(buf_pool));
-
-	block_mutex = buf_page_get_mutex(bpage);
-
-	/* bpage->space and bpage->io_fix are protected by
-	buf_pool->mutex and block_mutex. It is safe to check
-	them while holding buf_pool->mutex only. */
-
-	if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
-
-		/* We cannot remove this page during this scan
-		yet; maybe the system is currently reading it
-		in, or flushing the modifications to the file */
-
-	} else {
-
-		/* We have to release the flush_list_mutex to obey the
-		latching order. We are however guaranteed that the page
-		will stay in the flush_list because buf_flush_remove()
-		needs buf_pool->mutex as well (for the non-flush case). */
-
-		buf_flush_list_mutex_exit(buf_pool);
-
-		mutex_enter(block_mutex);
-
-		ut_ad(bpage->oldest_modification != 0);
-
-		if (bpage->buf_fix_count == 0) {
-
-			buf_flush_remove(bpage);
-
-			processed = TRUE;
-		}
-
-		mutex_exit(block_mutex);
-
-		buf_flush_list_mutex_enter(buf_pool);
-	}
-
-	ut_ad(!mutex_own(block_mutex));
-
-	return(processed);
-}
-
-/******************************************************************//**
 Remove all dirty pages belonging to a given tablespace inside a specific
 buffer pool instance when we are deleting the data file(s) of that
 tablespace. The pages still remain a part of LRU and are evicted from
-the list as they age towards the tail of the LRU.
-@return TRUE if all freed. */
-static
-ibool
-buf_flush_or_remove_pages(
-/*======================*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	ulint		id)		/*!< in: target space id for which
-					to remove or flush pages */
-{
-	buf_page_t*	prev;
-	buf_page_t*	bpage;
-	ulint		processed = 0;
-	ibool		all_freed = TRUE;
-
-	buf_flush_list_mutex_enter(buf_pool);
-
-	for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
-	     bpage != NULL;
-	     bpage = prev) {
-
-		ut_a(buf_page_in_file(bpage));
-		ut_ad(bpage->in_flush_list);
-
-		/* Save the previous link because once we free the
-		page we can't rely on the links. */
-
-		prev = UT_LIST_GET_PREV(list, bpage);
-
-		if (buf_page_get_space(bpage) != id) {
-
-			/* Skip this block, as it does not belong to
-			the target space. */
-
-		} else if (!buf_flush_or_remove_page(buf_pool, bpage)) {
-
-			/* Remove was unsuccessful, we have to try again
-			by scanning the entire list from the end. */
-
-			all_freed = FALSE;
-		}
-
-		++processed;
-
-		/* Yield if we have hogged the CPU and mutexes for too long. */
-		if (buf_flush_try_yield(buf_pool, prev, processed)) {
-
-			/* Reset the batch size counter if we had to yield. */
-
-			processed = 0;
-		}
-
-	}
-
-	buf_flush_list_mutex_exit(buf_pool);
-
-	return(all_freed);
-}
-
-/******************************************************************//**
-Remove or flush all the dirty pages that belong to a given tablespace
-inside a specific buffer pool instance. The pages will remain in the LRU
-list and will be evicted from the LRU list as they age and move towards
-the tail of the LRU list. */
-static
-void
-buf_flush_dirty_pages(
-/*==================*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	ulint		id)		/*!< in: space id */
-{
-	ibool	all_freed;
-
-	do {
-		buf_pool_mutex_enter(buf_pool);
-
-		all_freed = buf_flush_or_remove_pages(buf_pool, id);
-
-		buf_pool_mutex_exit(buf_pool);
-
-		ut_ad(buf_flush_validate(buf_pool));
-
-		if (!all_freed) {
-			os_thread_sleep(20000);
-		}
-
-	} while (!all_freed);
-}
-
-/******************************************************************//**
-Remove all pages that belong to a given tablespace inside a specific
-buffer pool instance when we are DISCARDing the tablespace. */
+the list as they age towards the tail of the LRU. */
 static
 void
-buf_LRU_remove_all_pages(
-/*=====================*/
+buf_LRU_remove_dirty_pages_for_tablespace(
+/*======================================*/
 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	ulint		id)		/*!< in: space id */
 {
 	buf_page_t*	bpage;
 	ibool		all_freed;
+	ulint		i;
 
 scan_again:
 	buf_pool_mutex_enter(buf_pool);
+	buf_flush_list_mutex_enter(buf_pool);
 
 	all_freed = TRUE;
 
-	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
-	     bpage != NULL;
-	     /* No op */) {
+	for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list), i = 0;
+	     bpage != NULL; ++i) {
 
 		buf_page_t*	prev_bpage;
 		mutex_t*	block_mutex = NULL;
 
 		ut_a(buf_page_in_file(bpage));
-		ut_ad(bpage->in_LRU_list);
 
-		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+		prev_bpage = UT_LIST_GET_PREV(list, bpage);
 
 		/* bpage->space and bpage->io_fix are protected by
-		buf_pool->mutex and the block_mutex. It is safe to check
+		buf_pool->mutex and block_mutex. It is safe to check
 		them while holding buf_pool->mutex only. */
 
 		if (buf_page_get_space(bpage) != id) {
@@ -617,87 +388,83 @@ scan_again:
 
 			all_freed = FALSE;
 			goto next_page;
-		} else {
-
-			block_mutex = buf_page_get_mutex(bpage);
-			mutex_enter(block_mutex);
-
-			if (bpage->buf_fix_count > 0) {
-
-				mutex_exit(block_mutex);
-
-				/* We cannot remove this page during
-				this scan yet; maybe the system is
-				currently reading it in, or flushing
-				the modifications to the file */
-
-				all_freed = FALSE;
-
-				goto next_page;
-			}
 		}
 
-		ut_ad(mutex_own(block_mutex));
+		/* We have to release the flush_list_mutex to obey the
+		latching order. We are however guaranteed that the page
+		will stay in the flush_list because buf_flush_remove()
+		needs buf_pool->mutex as well. */
+		buf_flush_list_mutex_exit(buf_pool);
+		block_mutex = buf_page_get_mutex(bpage);
+		mutex_enter(block_mutex);
 
-#ifdef UNIV_DEBUG
-		if (buf_debug_prints) {
-			fprintf(stderr,
-				"Dropping space %lu page %lu\n",
-				(ulong) buf_page_get_space(bpage),
-				(ulong) buf_page_get_page_no(bpage));
-		}
-#endif
-		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
-			/* Do nothing, because the adaptive hash index
-			covers uncompressed pages only. */
-		} else if (((buf_block_t*) bpage)->index) {
-			ulint	page_no;
-			ulint	zip_size;
+		if (bpage->buf_fix_count > 0) {
+			mutex_exit(block_mutex);
+			buf_flush_list_mutex_enter(buf_pool);
 
-			buf_pool_mutex_exit(buf_pool);
+			/* We cannot remove this page during
+			this scan yet; maybe the system is
+			currently reading it in, or flushing
+			the modifications to the file */
 
-			zip_size = buf_page_get_zip_size(bpage);
-			page_no = buf_page_get_page_no(bpage);
+			all_freed = FALSE;
+			goto next_page;
+		}
 
-			mutex_exit(block_mutex);
+		ut_ad(bpage->oldest_modification != 0);
 
-			/* Note that the following call will acquire
-			and release block->lock X-latch. */
+		buf_flush_remove(bpage);
 
-			btr_search_drop_page_hash_when_freed(
-				id, zip_size, page_no);
+		mutex_exit(block_mutex);
+		buf_flush_list_mutex_enter(buf_pool);
+next_page:
+		bpage = prev_bpage;
 
-			goto scan_again;
+		if (!bpage) {
+			break;
 		}
 
-		if (bpage->oldest_modification != 0) {
-			buf_flush_remove(bpage);
+		/* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
+		loop we release buf_pool->mutex to let other threads
+		do their job. */
+		if (i < BUF_LRU_DROP_SEARCH_SIZE) {
+			continue;
 		}
 
-		ut_ad(!bpage->in_flush_list);
-
-		/* Remove from the LRU list. */
+		/* We IO-fix the block to make sure that the block
+		stays in its position in the flush_list. */
+		if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+			/* Block is already IO-fixed. We don't
+			want to change the value. Lets leave
+			this block alone. */
+			continue;
+		}
 
-		if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
-		    != BUF_BLOCK_ZIP_FREE) {
+		buf_flush_list_mutex_exit(buf_pool);
+		block_mutex = buf_page_get_mutex(bpage);
+		mutex_enter(block_mutex);
+		buf_page_set_sticky(bpage);
+		mutex_exit(block_mutex);
 
-			buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
-			mutex_exit(block_mutex);
+		/* Now it is safe to release the buf_pool->mutex. */
+		buf_pool_mutex_exit(buf_pool);
+		os_thread_yield();
+		buf_pool_mutex_enter(buf_pool);
 
-		} else {
-			/* The block_mutex should have been released
-			by buf_LRU_block_remove_hashed_page() when it
-			returns BUF_BLOCK_ZIP_FREE. */
-			ut_ad(block_mutex == &buf_pool->zip_mutex);
-		}
+		mutex_enter(block_mutex);
+		buf_page_unset_sticky(bpage);
+		mutex_exit(block_mutex);
 
-		ut_ad(!mutex_own(block_mutex));
+		buf_flush_list_mutex_enter(buf_pool);
+		ut_ad(bpage->in_flush_list);
 
-next_page:
-		bpage = prev_bpage;
+		i = 0;
 	}
 
 	buf_pool_mutex_exit(buf_pool);
+	buf_flush_list_mutex_exit(buf_pool);
+
+	ut_ad(buf_flush_validate(buf_pool));
 
 	if (!all_freed) {
 		os_thread_sleep(20000);
@@ -707,46 +474,28 @@ next_page:
 }
 
 /******************************************************************//**
-Removes all pages belonging to a given tablespace. */
+Invalidates all pages belonging to a given tablespace when we are deleting
+the data file(s) of that tablespace. */
 UNIV_INTERN
 void
-buf_LRU_flush_or_remove_pages(
+buf_LRU_invalidate_tablespace(
 /*==========================*/
-	ulint			id,	/*!< in: space id */
-	enum buf_remove_t	buf_remove)/*!< in: remove or flush
-					strategy */
+	ulint	id)	/*!< in: space id */
 {
-	ulint		i;
+	ulint	i;
 
+	/* Before we attempt to drop pages one by one we first
+	attempt to drop page hash index entries in batches to make
+	it more efficient. The batching attempt is a best effort
+	attempt and does not guarantee that all pages hash entries
+	will be dropped. We get rid of remaining page hash entries
+	one by one below. */
 	for (i = 0; i < srv_buf_pool_instances; i++) {
 		buf_pool_t*	buf_pool;
 
 		buf_pool = buf_pool_from_array(i);
-
-		switch (buf_remove) {
-		case BUF_REMOVE_ALL_NO_WRITE:
-			/* A DISCARD tablespace case. Remove AHI entries
-			and evict all pages from LRU. */
-
-			/* Before we attempt to drop pages hash entries
-			one by one we first attempt to drop page hash
-			index entries in batches to make it more
-			efficient. The batching attempt is a best effort
-			attempt and does not guarantee that all pages
-			hash entries will be dropped. We get rid of
-			remaining page hash entries one by one below. */
-			buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
-			buf_LRU_remove_all_pages(buf_pool, id);
-			break;
-
-		case BUF_REMOVE_FLUSH_NO_WRITE:
-			/* A DROP table case. AHI entries are already
-			removed. No need to evict all pages from LRU
-			list. Just evict pages from flush list without
-			writing. */
-			buf_flush_dirty_pages(buf_pool, id);
-			break;
-		}
+		buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
+		buf_LRU_remove_dirty_pages_for_tablespace(buf_pool, id);
 	}
 }
 
@@ -794,55 +543,44 @@ ibool
 buf_LRU_free_from_unzip_LRU_list(
 /*=============================*/
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		n_iterations)	/*!< in: how many times this has
-					been called repeatedly without
-					result: a high value means that
-					we should search farther; we will
-					search n_iterations / 5 of the
-					unzip_LRU list, or nothing if
-					n_iterations >= 5 */
+	ibool		scan_all)	/*!< in: scan whole LRU list
+					if TRUE, otherwise scan only
+					srv_LRU_scan_depth / 2 blocks. */
 {
 	buf_block_t*	block;
-	ulint		distance;
+	ibool 		freed;
+	ulint		scanned;
 
 	ut_ad(buf_pool_mutex_own(buf_pool));
 
-	/* Theoratically it should be much easier to find a victim
-	from unzip_LRU as we can choose even a dirty block (as we'll
-	be evicting only the uncompressed frame).  In a very unlikely
-	eventuality that we are unable to find a victim from
-	unzip_LRU, we fall back to the regular LRU list.  We do this
-	if we have done five iterations so far. */
-
-	if (UNIV_UNLIKELY(n_iterations >= 5)
-	    || !buf_LRU_evict_from_unzip_LRU(buf_pool)) {
-
+	if (!buf_LRU_evict_from_unzip_LRU(buf_pool)) {
 		return(FALSE);
 	}
 
-	distance = 100 + (n_iterations
-			  * UT_LIST_GET_LEN(buf_pool->unzip_LRU)) / 5;
+	for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU),
+	     scanned = 1, freed = FALSE;
+	     block != NULL && !freed
+	     && (scan_all || scanned < srv_LRU_scan_depth);
+	     ++scanned) {
 
-	for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
-	     UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0);
-	     block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) {
-
-		ibool freed;
+		buf_block_t*	prev_block = UT_LIST_GET_PREV(unzip_LRU,
+						block);
 
 		ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
 		ut_ad(block->in_unzip_LRU_list);
 		ut_ad(block->page.in_LRU_list);
 
-		mutex_enter(&block->mutex);
 		freed = buf_LRU_free_block(&block->page, FALSE);
-		mutex_exit(&block->mutex);
 
-		if (freed) {
-			return(TRUE);
-		}
+		block = prev_block;
 	}
 
-	return(FALSE);
+	MONITOR_INC_VALUE_CUMULATIVE(
+		MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+		MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+		MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+		scanned);
+	return(freed);
 }
 
 /******************************************************************//**
@@ -852,129 +590,68 @@ UNIV_INLINE
 ibool
 buf_LRU_free_from_common_LRU_list(
 /*==============================*/
-	buf_pool_t*	buf_pool,
-	ulint		n_iterations)
-				/*!< in: how many times this has been called
-				repeatedly without result: a high value means
-				that we should search farther; if
-				n_iterations < 10, then we search
-				n_iterations / 10 * buf_pool->curr_size
-				pages from the end of the LRU list */
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ibool		scan_all)	/*!< in: scan whole LRU list
+					if TRUE, otherwise scan only
+					srv_LRU_scan_depth / 2 blocks. */
 {
 	buf_page_t*	bpage;
-	ulint		distance;
+	ibool		freed;
+	ulint		scanned;
 
 	ut_ad(buf_pool_mutex_own(buf_pool));
 
-	distance = 100 + (n_iterations * buf_pool->curr_size) / 10;
-
-	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
-	     UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0);
-	     bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) {
+	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU),
+	     scanned = 1, freed = FALSE;
+	     bpage != NULL && !freed
+	     && (scan_all || scanned < srv_LRU_scan_depth);
+	     ++scanned) {
 
-		ibool		freed;
 		unsigned	accessed;
-		mutex_t*	block_mutex = buf_page_get_mutex(bpage);
+		buf_page_t*	prev_bpage = UT_LIST_GET_PREV(LRU,
+						bpage);
 
 		ut_ad(buf_page_in_file(bpage));
 		ut_ad(bpage->in_LRU_list);
 
-		mutex_enter(block_mutex);
 		accessed = buf_page_is_accessed(bpage);
 		freed = buf_LRU_free_block(bpage, TRUE);
-		mutex_exit(block_mutex);
-
-		if (freed) {
+		if (freed && !accessed) {
 			/* Keep track of pages that are evicted without
 			ever being accessed. This gives us a measure of
 			the effectiveness of readahead */
-			if (!accessed) {
-				++buf_pool->stat.n_ra_pages_evicted;
-			}
-			return(TRUE);
+			++buf_pool->stat.n_ra_pages_evicted;
 		}
-	}
-
-	return(FALSE);
-}
-
-/******************************************************************//**
-Try to free a replaceable block.
-@return	TRUE if found and freed */
-UNIV_INTERN
-ibool
-buf_LRU_search_and_free_block(
-/*==========================*/
-	buf_pool_t*	buf_pool,
-				/*!< in: buffer pool instance */
-	ulint		n_iterations)
-				/*!< in: how many times this has been called
-				repeatedly without result: a high value means
-				that we should search farther; if
-				n_iterations < 10, then we search
-				n_iterations / 10 * buf_pool->curr_size
-				pages from the end of the LRU list; if
-				n_iterations < 5, then we will also search
-				n_iterations / 5 of the unzip_LRU list. */
-{
-	ibool	freed = FALSE;
 
-	buf_pool_mutex_enter(buf_pool);
-
-	freed = buf_LRU_free_from_unzip_LRU_list(buf_pool, n_iterations);
-
-	if (!freed) {
-		freed = buf_LRU_free_from_common_LRU_list(
-			buf_pool, n_iterations);
-	}
-
-	if (!freed) {
-		buf_pool->LRU_flush_ended = 0;
-	} else if (buf_pool->LRU_flush_ended > 0) {
-		buf_pool->LRU_flush_ended--;
+		bpage = prev_bpage;
 	}
 
-	buf_pool_mutex_exit(buf_pool);
+	MONITOR_INC_VALUE_CUMULATIVE(
+		MONITOR_LRU_SEARCH_SCANNED,
+		MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+		MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+		scanned);
 
 	return(freed);
 }
 
 /******************************************************************//**
-Tries to remove LRU flushed blocks from the end of the LRU list and put them
-to the free list. This is beneficial for the efficiency of the insert buffer
-operation, as flushed pages from non-unique non-clustered indexes are here
-taken out of the buffer pool, and their inserts redirected to the insert
-buffer. Otherwise, the flushed blocks could get modified again before read
-operations need new buffer blocks, and the i/o work done in flushing would be
-wasted. */
+Try to free a replaceable block.
+@return	TRUE if found and freed */
 UNIV_INTERN
-void
-buf_LRU_try_free_flushed_blocks(
-/*============================*/
-	buf_pool_t*	buf_pool)		/*!< in: buffer pool instance */
+ibool
+buf_LRU_scan_and_free_block(
+/*========================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+	ibool		scan_all)	/*!< in: scan whole LRU list
+					if TRUE, otherwise scan only
+					'old' blocks. */
 {
+	ut_ad(buf_pool_mutex_own(buf_pool));
 
-	if (buf_pool == NULL) {
-		ulint	i;
-
-		for (i = 0; i < srv_buf_pool_instances; i++) {
-			buf_pool = buf_pool_from_array(i);
-			buf_LRU_try_free_flushed_blocks(buf_pool);
-		}
-	} else {
-		buf_pool_mutex_enter(buf_pool);
-
-		while (buf_pool->LRU_flush_ended > 0) {
-
-			buf_pool_mutex_exit(buf_pool);
-
-			buf_LRU_search_and_free_block(buf_pool, 1);
-
-			buf_pool_mutex_enter(buf_pool);
-		}
-
-		buf_pool_mutex_exit(buf_pool);
-	}
+	return(buf_LRU_free_from_unzip_LRU_list(buf_pool, scan_all)
+	       || buf_LRU_free_from_common_LRU_list(
+			buf_pool, scan_all));
 }
 
 /******************************************************************//**
@@ -1050,23 +727,17 @@ buf_LRU_get_free_only(
 }
 
 /******************************************************************//**
-Returns a free block from the buf_pool. The block is taken off the
-free list. If it is empty, blocks are moved from the end of the
-LRU list to the free list.
-@return	the free control block, in state BUF_BLOCK_READY_FOR_USE */
-UNIV_INTERN
-buf_block_t*
-buf_LRU_get_free_block(
-/*===================*/
-	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
+Checks how much of buf_pool is occupied by non-data objects like
+AHI, lock heaps etc. Depending on the size of non-data objects this
+function will either assert or issue a warning and switch on the
+status monitor. */
+static
+void
+buf_LRU_check_size_of_non_data_objects(
+/*===================================*/
+	const buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
 {
-	buf_block_t*	block		= NULL;
-	ibool		freed;
-	ulint		n_iterations	= 1;
-	ibool		mon_value_was	= FALSE;
-	ibool		started_monitor	= FALSE;
-loop:
-	buf_pool_mutex_enter(buf_pool);
+	ut_ad(buf_pool_mutex_own(buf_pool));
 
 	if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
 	    + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) {
@@ -1119,7 +790,7 @@ loop:
 
 			buf_lru_switched_on_innodb_mon = TRUE;
 			srv_print_innodb_monitor = TRUE;
-			os_event_set(srv_lock_timeout_thread_event);
+			os_event_set(srv_timeout_event);
 		}
 	} else if (buf_lru_switched_on_innodb_mon) {
 
@@ -1131,12 +802,59 @@ loop:
 		buf_lru_switched_on_innodb_mon = FALSE;
 		srv_print_innodb_monitor = FALSE;
 	}
+}
+
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If free list is empty, blocks are moved from the end of the
+LRU list to the free list.
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in LRU scan
+we put it to free list to be used.
+* iteration 0:
+  * get a block from free list, success:done
+  * if there is an LRU flush batch in progress:
+    * wait for batch to end: retry free list
+  * if buf_pool->try_LRU_scan is set
+    * scan LRU up to srv_LRU_scan_depth to find a clean block
+    * the above will put the block on free list
+    * success:retry the free list
+  * flush one dirty page from tail of LRU to disk
+    * the above will put the block on free list
+    * success: retry the free list
+* iteration 1:
+  * same as iteration 0 except:
+    * scan whole LRU list
+    * scan LRU list even if buf_pool->try_LRU_scan is not set
+* iteration > 1:
+  * same as iteration 1 but sleep 100ms
+@return	the free control block, in state BUF_BLOCK_READY_FOR_USE */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_block(
+/*===================*/
+	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
+{
+	buf_block_t*	block		= NULL;
+	ibool		freed		= FALSE;
+	ulint		n_iterations	= 0;
+	ulint		flush_failures	= 0;
+	ibool		mon_value_was	= FALSE;
+	ibool		started_monitor	= FALSE;
+
+	MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
+loop:
+	buf_pool_mutex_enter(buf_pool);
+
+	buf_LRU_check_size_of_non_data_objects(buf_pool);
 
 	/* If there is a block in the free list, take it */
 	block = buf_LRU_get_free_only(buf_pool);
-	buf_pool_mutex_exit(buf_pool);
 
 	if (block) {
+
+		buf_pool_mutex_exit(buf_pool);
 		ut_ad(buf_pool_from_block(block) == buf_pool);
 		memset(&block->page.zip, 0, sizeof block->page.zip);
 
@@ -1147,20 +865,52 @@ loop:
 		return(block);
 	}
 
-	/* If no block was in the free list, search from the end of the LRU
-	list and try to free a block there */
+	if (buf_pool->init_flush[BUF_FLUSH_LRU]
+	    && srv_use_doublewrite_buf
+	    && buf_dblwr != NULL) {
+
+		/* If there is an LRU flush happening in the background
+		then we wait for it to end instead of trying a single
+		page flush. If, however, we are not using doublewrite
+		buffer then it is better to do our own single page
+		flush instead of waiting for LRU flush to end. */
+		buf_pool_mutex_exit(buf_pool);
+		buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
+		goto loop;
+	}
+
+	freed = FALSE;
+	if (buf_pool->try_LRU_scan || n_iterations > 0) {
+		/* If no block was in the free list, search from the
+		end of the LRU list and try to free a block there.
+		If we are doing for the first time we'll scan only
+		tail of the LRU list otherwise we scan the whole LRU
+		list. */
+		freed = buf_LRU_scan_and_free_block(buf_pool,
+						    n_iterations > 0);
+
+		if (!freed && n_iterations == 0) {
+			/* Tell other threads that there is no point
+			in scanning the LRU list. This flag is set to
+			TRUE again when we flush a batch from this
+			buffer pool. */
+			buf_pool->try_LRU_scan = FALSE;
+		}
+	}
 
-	freed = buf_LRU_search_and_free_block(buf_pool, n_iterations);
+	buf_pool_mutex_exit(buf_pool);
 
-	if (freed > 0) {
+	if (freed) {
 		goto loop;
+
 	}
 
-	if (n_iterations > 30) {
+	if (n_iterations > 20) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
 			"  InnoDB: Warning: difficult to find free blocks in\n"
-			"InnoDB: the buffer pool (%lu search iterations)!"
+			"InnoDB: the buffer pool (%lu search iterations)!\n"
+			"InnoDB: %lu failed attempts to flush a page!"
 			" Consider\n"
 			"InnoDB: increasing the buffer pool size.\n"
 			"InnoDB: It is also possible that"
@@ -1179,6 +929,7 @@ loop:
 			"InnoDB: Starting InnoDB Monitor to print further\n"
 			"InnoDB: diagnostics to the standard output.\n",
 			(ulong) n_iterations,
+			(ulong)	flush_failures,
 			(ulong) fil_n_pending_log_flushes,
 			(ulong) fil_n_pending_tablespace_flushes,
 			(ulong) os_n_file_reads, (ulong) os_n_file_writes,
@@ -1187,35 +938,35 @@ loop:
 		mon_value_was = srv_print_innodb_monitor;
 		started_monitor = TRUE;
 		srv_print_innodb_monitor = TRUE;
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(srv_timeout_event);
 	}
 
-	/* No free block was found: try to flush the LRU list */
+	/* If we have scanned the whole LRU and still are unable to
+	find a free block then we should sleep here to let the
+	page_cleaner do an LRU batch for us.
+	TODO: It'd be better if we can signal the page_cleaner. Perhaps
+	we should use timed wait for page_cleaner. */
+	if (n_iterations > 1) {
 
-	buf_flush_free_margin(buf_pool);
-	++srv_buf_pool_wait_free;
-
-	os_aio_simulated_wake_handler_threads();
-
-	buf_pool_mutex_enter(buf_pool);
-
-	if (buf_pool->LRU_flush_ended > 0) {
-		/* We have written pages in an LRU flush. To make the insert
-		buffer more efficient, we try to move these pages to the free
-		list. */
-
-		buf_pool_mutex_exit(buf_pool);
-
-		buf_LRU_try_free_flushed_blocks(buf_pool);
-	} else {
-		buf_pool_mutex_exit(buf_pool);
+		os_thread_sleep(100000);
 	}
 
-	if (n_iterations > 10) {
-
-		os_thread_sleep(500000);
+	/* No free block was found: try to flush the LRU list.
+	This call will flush one page from the LRU and put it on the
+	free list. That means that the free block is up for grabs for
+	all user threads.
+	TODO: A more elegant way would have been to return the freed
+	up block to the caller here but the code that deals with
+	removing the block from page_hash and LRU_list is fairly
+	involved (particularly in case of compressed pages). We
+	can do that in a separate patch sometime in future. */
+	if (!buf_flush_single_page_from_LRU(buf_pool)) {
+		MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
+		++flush_failures;
 	}
 
+	++srv_buf_pool_wait_free;
+
 	n_iterations++;
 
 	goto loop;
@@ -1622,9 +1373,8 @@ NOTE: If this function returns TRUE, it will temporarily
 release buf_pool->mutex.  Furthermore, the page frame will no longer be
 accessible via bpage.
 
-The caller must hold buf_pool->mutex and buf_page_get_mutex(bpage) and
-release these two mutexes after the call.  No other
-buf_page_get_mutex() may be held when calling this function.
+The caller must hold buf_pool->mutex and must not hold any
+buf_page_get_mutex() when calling this function.
 @return TRUE if freed, FALSE otherwise. */
 UNIV_INTERN
 ibool
@@ -1636,13 +1386,20 @@ buf_LRU_free_block(
 {
 	buf_page_t*	b = NULL;
 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+	enum buf_page_state		page_state;
+	const ulint	fold = buf_page_address_fold(bpage->space,
+						     bpage->offset);
+	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
 	mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 
 	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(mutex_own(block_mutex));
 	ut_ad(buf_page_in_file(bpage));
 	ut_ad(bpage->in_LRU_list);
-	ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
+
+	rw_lock_x_lock(hash_lock);
+	mutex_enter(block_mutex);
+
 #if UNIV_WORD_SIZE == 4
 	/* On 32-bit systems, there is no padding in buf_page_t.  On
 	other systems, Valgrind could complain about uninitialized pad
@@ -1653,7 +1410,7 @@ buf_LRU_free_block(
 	if (!buf_page_can_relocate(bpage)) {
 
 		/* Do not free buffer-fixed or I/O-fixed blocks. */
-		return(FALSE);
+		goto func_exit;
 	}
 
 #ifdef UNIV_IBUF_COUNT_DEBUG
@@ -1665,28 +1422,37 @@ buf_LRU_free_block(
 		/* Do not completely free dirty blocks. */
 
 		if (bpage->oldest_modification) {
-			return(FALSE);
+			goto func_exit;
 		}
-	} else if (bpage->oldest_modification) {
-		/* Do not completely free dirty blocks. */
+	} else if ((bpage->oldest_modification)
+		   && (buf_page_get_state(bpage)
+		       != BUF_BLOCK_FILE_PAGE)) {
 
-		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
-			ut_ad(buf_page_get_state(bpage)
-			      == BUF_BLOCK_ZIP_DIRTY);
-			return(FALSE);
-		}
+		ut_ad(buf_page_get_state(bpage)
+		      == BUF_BLOCK_ZIP_DIRTY);
+
+func_exit:
+		rw_lock_x_unlock(hash_lock);
+		mutex_exit(block_mutex);
+		return(FALSE);
 
-		goto alloc;
 	} else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
-		/* Allocate the control block for the compressed page.
-		If it cannot be allocated (without freeing a block
-		from the LRU list), refuse to free bpage. */
-alloc:
 		b = buf_page_alloc_descriptor();
 		ut_a(b);
 		memcpy(b, bpage, sizeof *b);
 	}
 
+	ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(buf_page_in_file(bpage));
+	ut_ad(bpage->in_LRU_list);
+	ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
+#if UNIV_WORD_SIZE == 4
+	/* On 32-bit systems, there is no padding in buf_page_t.  On
+	other systems, Valgrind could complain about uninitialized pad
+	bytes. */
+	UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif
+
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints) {
 		fprintf(stderr, "Putting space %lu page %lu to free list\n",
@@ -1695,167 +1461,197 @@ alloc:
 	}
 #endif /* UNIV_DEBUG */
 
-	if (buf_LRU_block_remove_hashed_page(bpage, zip)
-	    != BUF_BLOCK_ZIP_FREE) {
-		ut_a(bpage->buf_fix_count == 0);
+#ifdef UNIV_SYNC_DEBUG
+        ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(buf_page_can_relocate(bpage));
+
+	page_state = buf_LRU_block_remove_hashed_page(bpage, zip);
 
-		if (b) {
-			buf_page_t*	hash_b;
-			buf_page_t*	prev_b	= UT_LIST_GET_PREV(LRU, b);
+#ifdef UNIV_SYNC_DEBUG
+	/* buf_LRU_block_remove_hashed_page() releases the hash_lock */
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
+	      && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
 
-			const ulint	fold = buf_page_address_fold(
-				bpage->space, bpage->offset);
+	if (page_state == BUF_BLOCK_ZIP_FREE) {
+		return(TRUE);
+	}
 
-			hash_b	= buf_page_hash_get_low(
-				buf_pool, bpage->space, bpage->offset, fold);
+	ut_ad(page_state == BUF_BLOCK_REMOVE_HASH);
 
-			ut_a(!hash_b);
+	/* We have just freed a BUF_BLOCK_FILE_PAGE. If b != NULL
+	then it was a compressed page with an uncompressed frame and
+	we are interested in freeing only the uncompressed frame.
+	Therefore we have to reinsert the compressed page descriptor
+	into the LRU and page_hash (and possibly flush_list).
+	if b == NULL then it was a regular page that has been freed */
 
-			b->state = b->oldest_modification
-				? BUF_BLOCK_ZIP_DIRTY
-				: BUF_BLOCK_ZIP_PAGE;
-			UNIV_MEM_DESC(b->zip.data,
-				      page_zip_get_size(&b->zip), b);
+	if (b) {
+		buf_page_t*	prev_b	= UT_LIST_GET_PREV(LRU, b);
 
-			/* The fields in_page_hash and in_LRU_list of
-			the to-be-freed block descriptor should have
-			been cleared in
-			buf_LRU_block_remove_hashed_page(), which
-			invokes buf_LRU_remove_block(). */
-			ut_ad(!bpage->in_page_hash);
-			ut_ad(!bpage->in_LRU_list);
-			/* bpage->state was BUF_BLOCK_FILE_PAGE because
-			b != NULL. The type cast below is thus valid. */
-			ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
+		rw_lock_x_lock(hash_lock);
+		mutex_enter(block_mutex);
+
+		ut_a(!buf_page_hash_get_low(buf_pool,
+					    bpage->space,
+					    bpage->offset,
+					    fold));
+
+		b->state = b->oldest_modification
+			? BUF_BLOCK_ZIP_DIRTY
+			: BUF_BLOCK_ZIP_PAGE;
+		UNIV_MEM_DESC(b->zip.data,
+			      page_zip_get_size(&b->zip));
+
+		/* The fields in_page_hash and in_LRU_list of
+		the to-be-freed block descriptor should have
+		been cleared in
+		buf_LRU_block_remove_hashed_page(), which
+		invokes buf_LRU_remove_block(). */
+		ut_ad(!bpage->in_page_hash);
+		ut_ad(!bpage->in_LRU_list);
+		/* bpage->state was BUF_BLOCK_FILE_PAGE because
+		b != NULL. The type cast below is thus valid. */
+		ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
 
-			/* The fields of bpage were copied to b before
-			buf_LRU_block_remove_hashed_page() was invoked. */
-			ut_ad(!b->in_zip_hash);
-			ut_ad(b->in_page_hash);
-			ut_ad(b->in_LRU_list);
+		/* The fields of bpage were copied to b before
+		buf_LRU_block_remove_hashed_page() was invoked. */
+		ut_ad(!b->in_zip_hash);
+		ut_ad(b->in_page_hash);
+		ut_ad(b->in_LRU_list);
 
-			HASH_INSERT(buf_page_t, hash,
-				    buf_pool->page_hash, fold, b);
+		HASH_INSERT(buf_page_t, hash,
+			    buf_pool->page_hash, fold, b);
 
-			/* Insert b where bpage was in the LRU list. */
-			if (UNIV_LIKELY(prev_b != NULL)) {
-				ulint	lru_len;
+		/* Insert b where bpage was in the LRU list. */
+		if (UNIV_LIKELY(prev_b != NULL)) {
+			ulint	lru_len;
 
-				ut_ad(prev_b->in_LRU_list);
-				ut_ad(buf_page_in_file(prev_b));
+			ut_ad(prev_b->in_LRU_list);
+			ut_ad(buf_page_in_file(prev_b));
 #if UNIV_WORD_SIZE == 4
-				/* On 32-bit systems, there is no
-				padding in buf_page_t.  On other
-				systems, Valgrind could complain about
-				uninitialized pad bytes. */
-				UNIV_MEM_ASSERT_RW(prev_b, sizeof *prev_b);
+			/* On 32-bit systems, there is no
+			padding in buf_page_t.  On other
+			systems, Valgrind could complain about
+			uninitialized pad bytes. */
+			UNIV_MEM_ASSERT_RW(prev_b, sizeof *prev_b);
 #endif
-				UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
-						     prev_b, b);
+			UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
+					     prev_b, b);
 
-				if (buf_page_is_old(b)) {
-					buf_pool->LRU_old_len++;
-					if (UNIV_UNLIKELY
-					    (buf_pool->LRU_old
-					     == UT_LIST_GET_NEXT(LRU, b))) {
+			if (buf_page_is_old(b)) {
+				buf_pool->LRU_old_len++;
+				if (UNIV_UNLIKELY
+				    (buf_pool->LRU_old
+				     == UT_LIST_GET_NEXT(LRU, b))) {
 
-						buf_pool->LRU_old = b;
-					}
+					buf_pool->LRU_old = b;
 				}
+			}
 
-				lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
-
-				if (lru_len > BUF_LRU_OLD_MIN_LEN) {
-					ut_ad(buf_pool->LRU_old);
-					/* Adjust the length of the
-					old block list if necessary */
-					buf_LRU_old_adjust_len(buf_pool);
-				} else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
-					/* The LRU list is now long
-					enough for LRU_old to become
-					defined: init it */
-					buf_LRU_old_init(buf_pool);
-				}
+			lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+			if (lru_len > BUF_LRU_OLD_MIN_LEN) {
+				ut_ad(buf_pool->LRU_old);
+				/* Adjust the length of the
+				old block list if necessary */
+				buf_LRU_old_adjust_len(buf_pool);
+			} else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+				/* The LRU list is now long
+				enough for LRU_old to become
+				defined: init it */
+				buf_LRU_old_init(buf_pool);
+			}
 #ifdef UNIV_LRU_DEBUG
-				/* Check that the "old" flag is consistent
-				in the block and its neighbours. */
-				buf_page_set_old(b, buf_page_is_old(b));
+			/* Check that the "old" flag is consistent
+			in the block and its neighbours. */
+			buf_page_set_old(b, buf_page_is_old(b));
 #endif /* UNIV_LRU_DEBUG */
-			} else {
-				ut_d(b->in_LRU_list = FALSE);
-				buf_LRU_add_block_low(b, buf_page_is_old(b));
-			}
+		} else {
+			ut_d(b->in_LRU_list = FALSE);
+			buf_LRU_add_block_low(b, buf_page_is_old(b));
+		}
 
-			if (b->state == BUF_BLOCK_ZIP_PAGE) {
+		if (b->state == BUF_BLOCK_ZIP_PAGE) {
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
-				buf_LRU_insert_zip_clean(b);
+			buf_LRU_insert_zip_clean(b);
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-			} else {
-				/* Relocate on buf_pool->flush_list. */
-				buf_flush_relocate_on_flush_list(bpage, b);
-			}
-
-			bpage->zip.data = NULL;
-			page_zip_set_size(&bpage->zip, 0);
-
-			/* Prevent buf_page_get_gen() from
-			decompressing the block while we release
-			buf_pool->mutex and block_mutex. */
-			mutex_enter(&buf_pool->zip_mutex);
-			buf_page_set_sticky(b);
-			mutex_exit(&buf_pool->zip_mutex);
+		} else {
+			/* Relocate on buf_pool->flush_list. */
+			buf_flush_relocate_on_flush_list(bpage, b);
 		}
 
-		buf_pool_mutex_exit(buf_pool);
+		bpage->zip.data = NULL;
+		page_zip_set_size(&bpage->zip, 0);
 		mutex_exit(block_mutex);
 
-		/* Remove possible adaptive hash index on the page.
-		The page was declared uninitialized by
-		buf_LRU_block_remove_hashed_page().  We need to flag
-		the contents of the page valid (which it still is) in
-		order to avoid bogus Valgrind warnings.*/
+		/* Prevent buf_page_get_gen() from
+		decompressing the block while we release
+		buf_pool->mutex and block_mutex. */
+		block_mutex = buf_page_get_mutex(b);
+		mutex_enter(block_mutex);
+		buf_page_set_sticky(b);
+		mutex_exit(block_mutex);
 
-		UNIV_MEM_VALID(((buf_block_t*) bpage)->frame,
-			       UNIV_PAGE_SIZE);
-		btr_search_drop_page_hash_index((buf_block_t*) bpage);
-		UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
-				 UNIV_PAGE_SIZE);
+		rw_lock_x_unlock(hash_lock);
 
-		if (b) {
-			/* Compute and stamp the compressed page
-			checksum while not holding any mutex.  The
-			block is already half-freed
-			(BUF_BLOCK_REMOVE_HASH) and removed from
-			buf_pool->page_hash, thus inaccessible by any
-			other thread. */
-
-			mach_write_to_4(
-				b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM,
-				UNIV_LIKELY(srv_use_checksums)
-				? page_zip_calc_checksum(
-					b->zip.data,
-					page_zip_get_size(&b->zip))
-				: BUF_NO_CHECKSUM_MAGIC);
-		}
+	} else {
 
-		buf_pool_mutex_enter(buf_pool);
+		/* There can be multiple threads doing an LRU scan to
+		free a block. The page_cleaner thread can be doing an
+		LRU batch whereas user threads can potentially be doing
+		multiple single page flushes. As we release
+		buf_pool->mutex below we need to make sure that no one
+		else considers this block as a victim for page
+		replacement. This block is already out of page_hash
+		and we are about to remove it from the LRU list and put
+		it on the free list. */
 		mutex_enter(block_mutex);
+		buf_page_set_sticky(bpage);
+		mutex_exit(block_mutex);
+	}
 
-		if (b) {
-			mutex_enter(&buf_pool->zip_mutex);
-			buf_page_unset_sticky(b);
-			mutex_exit(&buf_pool->zip_mutex);
-		}
+	buf_pool_mutex_exit(buf_pool);
 
-		buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
-	} else {
-		/* The block_mutex should have been released by
-		buf_LRU_block_remove_hashed_page() when it returns
-		BUF_BLOCK_ZIP_FREE. */
-		ut_ad(block_mutex == &buf_pool->zip_mutex);
-		mutex_enter(block_mutex);
+	/* Remove possible adaptive hash index on the page.
+	The page was declared uninitialized by
+	buf_LRU_block_remove_hashed_page().  We need to flag
+	the contents of the page valid (which it still is) in
+	order to avoid bogus Valgrind warnings.*/
+
+	UNIV_MEM_VALID(((buf_block_t*) bpage)->frame,
+		       UNIV_PAGE_SIZE);
+	btr_search_drop_page_hash_index((buf_block_t*) bpage);
+	UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
+			 UNIV_PAGE_SIZE);
+
+	if (b) {
+		ib_uint32_t	checksum;
+		/* Compute and stamp the compressed page
+		checksum while not holding any mutex.  The
+		block is already half-freed
+		(BUF_BLOCK_REMOVE_HASH) and removed from
+		buf_pool->page_hash, thus inaccessible by any
+		other thread. */
+
+		checksum = page_zip_calc_checksum(
+			b->zip.data,
+			page_zip_get_size(&b->zip),
+			static_cast<srv_checksum_algorithm_t>(
+				srv_checksum_algorithm));
+
+		mach_write_to_4(b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM,
+				checksum);
 	}
 
+	buf_pool_mutex_enter(buf_pool);
+
+	mutex_enter(block_mutex);
+	buf_page_unset_sticky(b != NULL ? b : bpage);
+	mutex_exit(block_mutex);
+
+	buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
 	return(TRUE);
 }
 
@@ -1924,7 +1720,11 @@ buf_LRU_block_free_non_file_page(
 /******************************************************************//**
 Takes a block out of the LRU list and page hash table.
 If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
-the object will be freed and buf_pool->zip_mutex will be released.
+the object will be freed.
+
+The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex
+and the appropriate hash_lock. This function will release the
+buf_page_get_mutex() and the hash_lock.
 
 If a compressed page or a compressed-only block descriptor is freed,
 other compressed pages or compressed-only block descriptors may be
@@ -1944,11 +1744,18 @@ buf_LRU_block_remove_hashed_page(
 	ulint			fold;
 	const buf_page_t*	hashed_bpage;
 	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
+	rw_lock_t*		hash_lock;
 
 	ut_ad(bpage);
 	ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
 
+	fold = buf_page_address_fold(bpage->space, bpage->offset);
+	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+#ifdef UNIV_SYNC_DEBUG
+        ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
 	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
 	ut_a(bpage->buf_fix_count == 0);
 
@@ -2032,9 +1839,8 @@ buf_LRU_block_remove_hashed_page(
 		break;
 	}
 
-	fold = buf_page_address_fold(bpage->space, bpage->offset);
-	hashed_bpage = buf_page_hash_get_low(
-		buf_pool, bpage->space, bpage->offset, fold);
+	hashed_bpage = buf_page_hash_get_low(buf_pool, bpage->space,
+					     bpage->offset, fold);
 
 	if (UNIV_UNLIKELY(bpage != hashed_bpage)) {
 		fprintf(stderr,
@@ -2054,6 +1860,7 @@ buf_LRU_block_remove_hashed_page(
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 		mutex_exit(buf_page_get_mutex(bpage));
+		rw_lock_x_unlock(hash_lock);
 		buf_pool_mutex_exit(buf_pool);
 		buf_print();
 		buf_LRU_print();
@@ -2080,6 +1887,7 @@ buf_LRU_block_remove_hashed_page(
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 
 		mutex_exit(&buf_pool->zip_mutex);
+		rw_lock_x_unlock(hash_lock);
 		buf_pool_mutex_exit_forbid(buf_pool);
 
 		buf_buddy_free(
@@ -2099,6 +1907,28 @@ buf_LRU_block_remove_hashed_page(
 				 UNIV_PAGE_SIZE);
 		buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH);
 
+		/* Question: If we release bpage and hash mutex here
+		then what protects us against:
+		1) Some other thread buffer fixing this page
+		2) Some other thread trying to read this page and
+		not finding it in buffer pool attempting to read it
+		from the disk.
+		Answer:
+		1) Cannot happen because the page is no longer in the
+		page_hash. Only possibility is when while invalidating
+		a tablespace we buffer fix the prev_page in LRU to
+		avoid relocation during the scan. But that is not
+		possible because we are holding buf_pool mutex.
+
+		2) Not possible because in buf_page_init_for_read()
+		we do a look up of page_hash while holding buf_pool
+		mutex and since we are holding buf_pool mutex here
+		and by the time we'll release it in the caller we'd
+		have inserted the compressed only descriptor in the
+		page_hash. */
+		rw_lock_x_unlock(hash_lock);
+		mutex_exit(&((buf_block_t*) bpage)->mutex);
+
 		if (zip && bpage->zip.data) {
 			/* Free the compressed page. */
 			void*	data = bpage->zip.data;
@@ -2107,7 +1937,6 @@ buf_LRU_block_remove_hashed_page(
 			ut_ad(!bpage->in_free_list);
 			ut_ad(!bpage->in_flush_list);
 			ut_ad(!bpage->in_LRU_list);
-			mutex_exit(&((buf_block_t*) bpage)->mutex);
 			buf_pool_mutex_exit_forbid(buf_pool);
 
 			buf_buddy_free(
@@ -2115,7 +1944,6 @@ buf_LRU_block_remove_hashed_page(
 				page_zip_get_size(&bpage->zip));
 
 			buf_pool_mutex_exit_allow(buf_pool);
-			mutex_enter(&((buf_block_t*) bpage)->mutex);
 			page_zip_set_size(&bpage->zip, 0);
 		}
 
@@ -2147,11 +1975,12 @@ buf_LRU_block_free_hashed_page(
 	buf_pool_t*	buf_pool = buf_pool_from_block(block);
 	ut_ad(buf_pool_mutex_own(buf_pool));
 #endif
-	ut_ad(mutex_own(&block->mutex));
 
+	mutex_enter(&block->mutex);
 	buf_block_set_state(block, BUF_BLOCK_MEMORY);
 
 	buf_LRU_block_free_non_file_page(block);
+	mutex_exit(&block->mutex);
 }
 
 /******************************************************************//**
@@ -2210,7 +2039,7 @@ buf_LRU_old_ratio_update_instance(
 	} else {
 		buf_pool->LRU_old_ratio = ratio;
 	}
-	/* the reverse of 
+	/* the reverse of
 	ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */
 	return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5));
 }
@@ -2327,14 +2156,13 @@ buf_LRU_validate_instance(
 		ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
 	}
 
-	UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU,
-			 ut_ad(ut_list_node_313->in_LRU_list));
-
-	bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+	UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU, CheckInLRUList());
 
 	old_len = 0;
 
-	while (bpage != NULL) {
+	for (bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+	     bpage != NULL;
+             bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
 
 		switch (buf_page_get_state(bpage)) {
 		case BUF_BLOCK_ZIP_FREE:
@@ -2366,14 +2194,11 @@ buf_LRU_validate_instance(
 
 			ut_a(!next || buf_page_is_old(next));
 		}
-
-		bpage = UT_LIST_GET_NEXT(LRU, bpage);
 	}
 
 	ut_a(buf_pool->LRU_old_len == old_len);
 
-	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->free,
-			 ut_ad(ut_list_node_313->in_free_list));
+	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->free, CheckInFreeList());
 
 	for (bpage = UT_LIST_GET_FIRST(buf_pool->free);
 	     bpage != NULL;
@@ -2382,9 +2207,9 @@ buf_LRU_validate_instance(
 		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED);
 	}
 
-	UT_LIST_VALIDATE(unzip_LRU, buf_block_t, buf_pool->unzip_LRU,
-			 ut_ad(ut_list_node_313->in_unzip_LRU_list
-			       && ut_list_node_313->page.in_LRU_list));
+	UT_LIST_VALIDATE(
+                unzip_LRU, buf_block_t, buf_pool->unzip_LRU,
+                CheckUnzipLRUAndLRUList());
 
 	for (block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU);
 	     block;
@@ -2507,3 +2332,4 @@ buf_LRU_print(void)
 	}
 }
 #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/buf/buf0rea.c b/storage/innobase/buf/buf0rea.cc
index da804a66b29..1b3e5deed05 100644
--- a/storage/innobase/buf/buf0rea.c
+++ b/storage/innobase/buf/buf0rea.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file buf/buf0rea.c
+@file buf/buf0rea.cc
 The database buffer read
 
 Created 11/5/1995 Heikki Tuuri
@@ -31,6 +31,7 @@ Created 11/5/1995 Heikki Tuuri
 #include "buf0buf.h"
 #include "buf0flu.h"
 #include "buf0lru.h"
+#include "buf0dblwr.h"
 #include "ibuf0ibuf.h"
 #include "log0recv.h"
 #include "trx0sys.h"
@@ -58,7 +59,7 @@ flag is cleared and the x-lock released by an i/o-handler thread.
 @return 1 if a read request was queued, 0 if the page already resided
 in buf_pool, or if the page is in the doublewrite buffer blocks in
 which case it is never read into the pool, or if the tablespace does
-not exist or is being dropped 
+not exist or is being dropped
 @return 1 if read request is issued. 0 if it is not */
 static
 ulint
@@ -83,19 +84,17 @@ buf_read_page_low(
 {
 	buf_page_t*	bpage;
 	ulint		wake_later;
+	ibool		ignore_nonexistent_pages;
 
 	*err = DB_SUCCESS;
 
 	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
 	mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
 
-	if (trx_doublewrite && space == TRX_SYS_SPACE
-	    && (   (offset >= trx_doublewrite->block1
-		    && offset < trx_doublewrite->block1
-		    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
-		   || (offset >= trx_doublewrite->block2
-		       && offset < trx_doublewrite->block2
-		       + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
+	ignore_nonexistent_pages = mode & BUF_READ_IGNORE_NONEXISTENT_PAGES;
+	mode &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES;
+
+	if (space == TRX_SYS_SPACE && buf_dblwr_page_inside(offset)) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
 			"  InnoDB: Warning: trying to read"
@@ -141,18 +140,27 @@ buf_read_page_low(
 
 	thd_wait_begin(NULL, THD_WAIT_DISKIO);
 	if (zip_size) {
-		*err = fil_io(OS_FILE_READ | wake_later,
+		*err = fil_io(OS_FILE_READ | wake_later
+			      | ignore_nonexistent_pages,
 			      sync, space, zip_size, offset, 0, zip_size,
 			      bpage->zip.data, bpage);
 	} else {
 		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
 
-		*err = fil_io(OS_FILE_READ | wake_later,
+		*err = fil_io(OS_FILE_READ | wake_later
+			      | ignore_nonexistent_pages,
 			      sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
 			      ((buf_block_t*) bpage)->frame, bpage);
 	}
 	thd_wait_end(NULL);
-	ut_a(*err == DB_SUCCESS);
+
+	if (*err != DB_SUCCESS) {
+		if (ignore_nonexistent_pages) {
+			return(0);
+		}
+		/* else */
+		ut_error;
+	}
 
 	if (sync) {
 		/* The i/o is already completed when we arrive from
@@ -342,7 +350,6 @@ buf_read_page(
 	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
 	ulint	offset)	/*!< in: page number */
 {
-	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
 	ib_int64_t	tablespace_version;
 	ulint		count;
 	ulint		err;
@@ -366,9 +373,6 @@ buf_read_page(
 			(ulong) space, (ulong) offset);
 	}
 
-	/* Flush pages from the end of the LRU list if necessary */
-	buf_flush_free_margin(buf_pool);
-
 	/* Increment number of I/O operations used for LRU policy. */
 	buf_LRU_stat_inc_io();
 
@@ -376,6 +380,49 @@ buf_read_page(
 }
 
 /********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page_async(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: page number */
+{
+	ulint		zip_size;
+	ib_int64_t	tablespace_version;
+	ulint		count;
+	ulint		err;
+
+	zip_size = fil_space_get_zip_size(space);
+
+	if (zip_size == ULINT_UNDEFINED) {
+		return(FALSE);
+	}
+
+	tablespace_version = fil_space_get_version(space);
+
+	count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE
+				  | OS_AIO_SIMULATED_WAKE_LATER
+				  | BUF_READ_IGNORE_NONEXISTENT_PAGES,
+				  space, zip_size, FALSE,
+				  tablespace_version, offset);
+	srv_buf_pool_reads += count;
+
+	/* We do not increment number of I/O operations used for LRU policy
+	here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
+	about evicting uncompressed version of compressed pages from the
+	buffer pool. Since this function is called from buffer pool load
+	these IOs are deliberate and are not part of normal workload we can
+	ignore these in our heuristics. */
+
+	return(count > 0);
+}
+
+/********************************************************************//**
 Applies linear read-ahead if in the buf_pool the page is a border page of
 a linear read-ahead area and all the pages in the area have been accessed.
 Does not read any page if the read-ahead mechanism is not activated. Note
@@ -427,6 +474,11 @@ buf_read_ahead_linear(
 		= BUF_READ_AHEAD_AREA(buf_pool);
 	ulint		threshold;
 
+	/* check if readahead is disabled */
+	if (!srv_read_ahead_threshold) {
+		return(0);
+	}
+
 	if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
 		/* No read-ahead to avoid thread deadlocks */
 		return(0);
@@ -636,9 +688,6 @@ buf_read_ahead_linear(
 
 	os_aio_simulated_wake_handler_threads();
 
-	/* Flush pages from the end of the LRU list if necessary */
-	buf_flush_free_margin(buf_pool);
-
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints && (count > 0)) {
 		fprintf(stderr,
@@ -724,9 +773,6 @@ tablespace_deleted:
 
 	os_aio_simulated_wake_handler_threads();
 
-	/* Flush pages from the end of all the LRU lists if necessary */
-	buf_flush_free_margins();
-
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints) {
 		fprintf(stderr,
@@ -796,7 +842,7 @@ buf_read_recv_pages(
 					"InnoDB: Number of pending reads %lu,"
 					" pending pread calls %lu\n",
 					(ulong) buf_pool->n_pend_reads,
-					(ulong)os_file_n_pending_preads);
+					(ulong) os_file_n_pending_preads);
 
 				os_aio_print_debug = TRUE;
 			}
@@ -818,9 +864,6 @@ buf_read_recv_pages(
 
 	os_aio_simulated_wake_handler_threads();
 
-	/* Flush pages from the end of all the LRU lists if necessary */
-	buf_flush_free_margins();
-
 #ifdef UNIV_DEBUG
 	if (buf_debug_prints) {
 		fprintf(stderr,
diff --git a/storage/innobase/data/data0data.c b/storage/innobase/data/data0data.cc
index 6d07fc249fa..179de79b69f 100644
--- a/storage/innobase/data/data0data.c
+++ b/storage/innobase/data/data0data.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file data/data0data.c
+@file data/data0data.cc
 SQL data field and tuple
 
 Created 5/30/1994 Heikki Tuuri
@@ -53,35 +53,6 @@ UNIV_INTERN ulint	data_dummy;
 #endif /* UNIV_DEBUG */
 
 #ifndef UNIV_HOTBACKUP
-/*********************************************************************//**
-Tests if dfield data length and content is equal to the given.
-@return	TRUE if equal */
-UNIV_INTERN
-ibool
-dfield_data_is_binary_equal(
-/*========================*/
-	const dfield_t*	field,	/*!< in: field */
-	ulint		len,	/*!< in: data length or UNIV_SQL_NULL */
-	const byte*	data)	/*!< in: data */
-{
-	if (len != dfield_get_len(field)) {
-
-		return(FALSE);
-	}
-
-	if (len == UNIV_SQL_NULL) {
-
-		return(TRUE);
-	}
-
-	if (0 != memcmp(dfield_get_data(field), data, len)) {
-
-		return(FALSE);
-	}
-
-	return(TRUE);
-}
-
 /************************************************************//**
 Compare two data tuples, respecting the collation of character fields.
 @return 1, 0 , -1 if tuple1 is greater, equal, less, respectively,
@@ -274,7 +245,9 @@ dtuple_validate(
 
 		if (!dfield_is_null(field)) {
 
-			const byte*	data = dfield_get_data(field);
+			const byte*	data;
+
+			data = static_cast<const byte*>(dfield_get_data(field));
 #ifndef UNIV_DEBUG_VALGRIND
 			ulint		j;
 
@@ -311,7 +284,7 @@ dfield_print(
 	ulint		i;
 
 	len = dfield_get_len(dfield);
-	data = dfield_get_data(dfield);
+	data = static_cast<const byte*>(dfield_get_data(dfield));
 
 	if (dfield_is_null(dfield)) {
 		fputs("NULL", stderr);
@@ -333,7 +306,7 @@ dfield_print(
 		break;
 	case DATA_INT:
 		ut_a(len == 4); /* only works for 32-bit integers */
-		fprintf(stderr, "%d", (int)mach_read_from_4(data));
+		fprintf(stderr, "%d", (int) mach_read_from_4(data));
 		break;
 	default:
 		ut_error;
@@ -356,7 +329,7 @@ dfield_print_also_hex(
 	ibool		print_also_hex;
 
 	len = dfield_get_len(dfield);
-	data = dfield_get_data(dfield);
+	data = static_cast<const byte*>(dfield_get_data(dfield));
 
 	if (dfield_is_null(dfield)) {
 		fputs("NULL", stderr);
@@ -438,25 +411,25 @@ dfield_print_also_hex(
 		case DATA_TRX_ID:
 			id = mach_read_from_6(data);
 
-			fprintf(stderr, "trx_id " TRX_ID_FMT, (ullint) id);
+			fprintf(stderr, "trx_id " TRX_ID_FMT, id);
 			break;
 
 		case DATA_ROLL_PTR:
 			id = mach_read_from_7(data);
 
-			fprintf(stderr, "roll_ptr " TRX_ID_FMT, (ullint) id);
+			fprintf(stderr, "roll_ptr " TRX_ID_FMT, id);
 			break;
 
 		case DATA_ROW_ID:
 			id = mach_read_from_6(data);
 
-			fprintf(stderr, "row_id " TRX_ID_FMT, (ullint) id);
+			fprintf(stderr, "row_id " TRX_ID_FMT, id);
 			break;
 
 		default:
 			id = mach_ull_read_compressed(data);
 
-			fprintf(stderr, "mix_id " TRX_ID_FMT, (ullint) id);
+			fprintf(stderr, "mix_id " TRX_ID_FMT, id);
 		}
 		break;
 
@@ -484,7 +457,7 @@ dfield_print_also_hex(
 			break;
 		}
 
-		data = dfield_get_data(dfield);
+		data = static_cast<byte*>(dfield_get_data(dfield));
 		/* fall through */
 
 	case DATA_BINARY:
@@ -579,11 +552,11 @@ dtuple_convert_big_rec(
 	ulint		local_len;
 	ulint		local_prefix_len;
 
-	if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
+	if (!dict_index_is_clust(index)) {
 		return(NULL);
 	}
 
-	if (dict_table_get_format(index->table) < DICT_TF_FORMAT_ZIP) {
+	if (dict_table_get_format(index->table) < UNIV_FORMAT_B) {
 		/* up to MySQL 5.1: store a 768-byte prefix locally */
 		local_len = BTR_EXTERN_FIELD_REF_SIZE
 			+ DICT_ANTELOPE_MAX_INDEX_COL_LEN;
@@ -608,11 +581,15 @@ dtuple_convert_big_rec(
 	heap = mem_heap_create(size + dtuple_get_n_fields(entry)
 			       * sizeof(big_rec_field_t) + 1000);
 
-	vector = mem_heap_alloc(heap, sizeof(big_rec_t));
+	vector = static_cast<big_rec_t*>(
+		mem_heap_alloc(heap, sizeof(big_rec_t)));
 
 	vector->heap = heap;
-	vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry)
-					* sizeof(big_rec_field_t));
+
+	vector->fields = static_cast<big_rec_field_t*>(
+		mem_heap_alloc(
+			heap,
+			dtuple_get_n_fields(entry) * sizeof(big_rec_field_t)));
 
 	/* Decide which fields to shorten: the algorithm is to look for
 	a variable-length field that yields the biggest savings when
@@ -703,7 +680,7 @@ skip_field:
 		b->data = (char*) dfield_get_data(dfield) + local_prefix_len;
 
 		/* Allocate the locally stored part of the column. */
-		data = mem_heap_alloc(heap, local_len);
+		data = static_cast<byte*>(mem_heap_alloc(heap, local_len));
 
 		/* Copy the local prefix. */
 		memcpy(data, dfield_get_data(dfield), local_prefix_len);
diff --git a/storage/innobase/data/data0type.c b/storage/innobase/data/data0type.cc
index 20d1f5db8d3..0b9e08544a5 100644
--- a/storage/innobase/data/data0type.c
+++ b/storage/innobase/data/data0type.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file data/data0type.c
+@file data/data0type.cc
 Data types
 
 Created 1/16/1996 Heikki Tuuri
@@ -158,7 +158,7 @@ dtype_form_prtype(
 	ulint	charset_coll)	/*!< in: MySQL charset-collation code */
 {
 	ut_a(old_prtype < 256 * 256);
-	ut_a(charset_coll < 256);
+	ut_a(charset_coll <= MAX_CHAR_COLL_NUM);
 
 	return(old_prtype + (charset_coll << 16));
 }
diff --git a/storage/innobase/dict/dict0boot.c b/storage/innobase/dict/dict0boot.cc
index 20d676e6129..8e305364ac8 100644
--- a/storage/innobase/dict/dict0boot.c
+++ b/storage/innobase/dict/dict0boot.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file dict/dict0boot.c
+@file dict/dict0boot.cc
 Data dictionary creation and booting
 
 Created 4/18/1996 Heikki Tuuri
@@ -254,6 +254,24 @@ dict_boot(void)
 	mtr_t		mtr;
 	ulint		error;
 
+	/* Be sure these constants do not ever change.  To avoid bloat,
+	only check the *NUM_FIELDS* in each table */
+
+	ut_ad(DICT_NUM_COLS__SYS_TABLES == 8);
+	ut_ad(DICT_NUM_FIELDS__SYS_TABLES == 10);
+	ut_ad(DICT_NUM_FIELDS__SYS_TABLE_IDS == 2);
+	ut_ad(DICT_NUM_COLS__SYS_COLUMNS == 7);
+	ut_ad(DICT_NUM_FIELDS__SYS_COLUMNS == 9);
+	ut_ad(DICT_NUM_COLS__SYS_INDEXES == 7);
+	ut_ad(DICT_NUM_FIELDS__SYS_INDEXES == 9);
+	ut_ad(DICT_NUM_COLS__SYS_FIELDS == 3);
+	ut_ad(DICT_NUM_FIELDS__SYS_FIELDS == 5);
+	ut_ad(DICT_NUM_COLS__SYS_FOREIGN == 4);
+	ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN == 6);
+	ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME == 2);
+	ut_ad(DICT_NUM_COLS__SYS_FOREIGN_COLS == 4);
+	ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_COLS == 6);
+
 	mtr_start(&mtr);
 
 	/* Create the hash tables etc. */
@@ -283,14 +301,16 @@ dict_boot(void)
 	/* Insert into the dictionary cache the descriptions of the basic
 	system tables */
 	/*-------------------------*/
-	table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0);
+	table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0, 0);
 
 	dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
 	dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
 	/* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */
 	dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4);
-	/* TYPE is either DICT_TABLE_ORDINARY, or (TYPE & DICT_TF_COMPACT)
-	and (TYPE & DICT_TF_FORMAT_MASK) are nonzero and TYPE = table->flags */
+	/* If the format is UNIV_FORMAT_A, table->flags == 0, and
+	TYPE == 1, which is defined as SYS_TABLE_TYPE_ANTELOPE.
+	The low order bit of TYPE is always set to 1.  If the format
+	is UNIV_FORMAT_B or higher, this field matches table->flags. */
 	dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
 	dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0);
 	/* MIX_LEN may contain additional table flags when
@@ -302,7 +322,7 @@ dict_boot(void)
 
 	table->id = DICT_TABLES_ID;
 
-	dict_table_add_to_cache(table, heap);
+	dict_table_add_to_cache(table, FALSE, heap);
 	dict_sys->sys_tables = table;
 	mem_heap_empty(heap);
 
@@ -335,7 +355,7 @@ dict_boot(void)
 	ut_a(error == DB_SUCCESS);
 
 	/*-------------------------*/
-	table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0);
+	table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0, 0);
 
 	dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
 	dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
@@ -347,7 +367,7 @@ dict_boot(void)
 
 	table->id = DICT_COLUMNS_ID;
 
-	dict_table_add_to_cache(table, heap);
+	dict_table_add_to_cache(table, FALSE, heap);
 	dict_sys->sys_columns = table;
 	mem_heap_empty(heap);
 
@@ -367,7 +387,7 @@ dict_boot(void)
 	ut_a(error == DB_SUCCESS);
 
 	/*-------------------------*/
-	table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0);
+	table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0, 0);
 
 	dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
 	dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
@@ -377,22 +397,9 @@ dict_boot(void)
 	dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
 	dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4);
 
-	/* The '+ 2' below comes from the fields DB_TRX_ID, DB_ROLL_PTR */
-#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2
-#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2"
-#endif
-#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2
-#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2"
-#endif
-#if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2
-#error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2"
-#endif
-#if DICT_SYS_INDEXES_NAME_FIELD != 2 + 2
-#error "DICT_SYS_INDEXES_NAME_FIELD != 2 + 2"
-#endif
-
 	table->id = DICT_INDEXES_ID;
-	dict_table_add_to_cache(table, heap);
+
+	dict_table_add_to_cache(table, FALSE, heap);
 	dict_sys->sys_indexes = table;
 	mem_heap_empty(heap);
 
@@ -412,14 +419,15 @@ dict_boot(void)
 	ut_a(error == DB_SUCCESS);
 
 	/*-------------------------*/
-	table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0);
+	table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0, 0);
 
 	dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0);
 	dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
 	dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0);
 
 	table->id = DICT_FIELDS_ID;
-	dict_table_add_to_cache(table, heap);
+
+	dict_table_add_to_cache(table, FALSE, heap);
 	dict_sys->sys_fields = table;
 	mem_heap_free(heap);
 
@@ -439,6 +447,7 @@ dict_boot(void)
 	ut_a(error == DB_SUCCESS);
 
 	mtr_commit(&mtr);
+
 	/*-------------------------*/
 
 	/* Initialize the insert buffer table and index for each tablespace */
diff --git a/storage/innobase/dict/dict0crea.c b/storage/innobase/dict/dict0crea.cc
index d7373a4b8ef..d58b304ab92 100644
--- a/storage/innobase/dict/dict0crea.c
+++ b/storage/innobase/dict/dict0crea.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file dict/dict0crea.c
+@file dict/dict0crea.cc
 Database object creation
 
 Created 1/8/1996 Heikki Tuuri
@@ -42,6 +42,7 @@ Created 1/8/1996 Heikki Tuuri
 #include "trx0roll.h"
 #include "usr0sess.h"
 #include "ut0vec.h"
+#include "dict0priv.h"
 
 /*****************************************************************//**
 Based on a table object, this function builds the entry to be inserted
@@ -60,6 +61,7 @@ dict_create_sys_tables_tuple(
 	dtuple_t*	entry;
 	dfield_t*	dfield;
 	byte*		ptr;
+	ulint		type;
 
 	ut_ad(table);
 	ut_ad(heap);
@@ -71,65 +73,74 @@ dict_create_sys_tables_tuple(
 	dict_table_copy_types(entry, sys_tables);
 
 	/* 0: NAME -----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 0/*NAME*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__NAME);
 
 	dfield_set_data(dfield, table->name, ut_strlen(table->name));
+
+	/* 1: DB_TRX_ID added later */
+	/* 2: DB_ROLL_PTR added later */
 	/* 3: ID -------------------------------*/
-	dfield = dtuple_get_nth_field(entry, 1/*ID*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__ID);
 
-	ptr = mem_heap_alloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(ptr, table->id);
 
 	dfield_set_data(dfield, ptr, 8);
-	/* 4: N_COLS ---------------------------*/
-	dfield = dtuple_get_nth_field(entry, 2/*N_COLS*/);
 
-#if DICT_TF_COMPACT != 1
-#error
-#endif
+	/* 4: N_COLS ---------------------------*/
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__N_COLS);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, table->n_def
 			| ((table->flags & DICT_TF_COMPACT) << 31));
 	dfield_set_data(dfield, ptr, 4);
-	/* 5: TYPE -----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 3/*TYPE*/);
-
-	ptr = mem_heap_alloc(heap, 4);
-	if (table->flags & (~DICT_TF_COMPACT & ~(~0 << DICT_TF_BITS))) {
-		ut_a(table->flags & DICT_TF_COMPACT);
-		ut_a(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
-		ut_a((table->flags & DICT_TF_ZSSIZE_MASK)
-		     <= (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT));
-		ut_a(!(table->flags & (~0 << DICT_TF2_BITS)));
-		mach_write_to_4(ptr, table->flags & ~(~0 << DICT_TF_BITS));
-	} else {
-		mach_write_to_4(ptr, DICT_TABLE_ORDINARY);
-	}
+
+	/* 5: TYPE (table flags) -----------------------------*/
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__TYPE);
+
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+	/* Validate the table flags and convert them to what is saved in
+	SYS_TABLES.TYPE.  Table flag values 0 and 1 are both written to
+	SYS_TABLES.TYPE as 1. */
+	type = dict_tf_to_sys_tables_type(table->flags);
+	mach_write_to_4(ptr, type);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 6: MIX_ID (obsolete) ---------------------------*/
-	dfield = dtuple_get_nth_field(entry, 4/*MIX_ID*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__MIX_ID);
 
-	ptr = mem_heap_zalloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_zalloc(heap, 8));
 
 	dfield_set_data(dfield, ptr, 8);
-	/* 7: MIX_LEN (additional flags) --------------------------*/
 
-	dfield = dtuple_get_nth_field(entry, 5/*MIX_LEN*/);
+	/* 7: MIX_LEN (additional flags) --------------------------*/
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__MIX_LEN);
 
-	ptr = mem_heap_alloc(heap, 4);
-	mach_write_to_4(ptr, table->flags >> DICT_TF2_SHIFT);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+	/* Be sure all non-used bits are zero. */
+	ut_a(!(table->flags2 & ~DICT_TF2_BIT_MASK));
+	mach_write_to_4(ptr, table->flags2);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 8: CLUSTER_NAME ---------------------*/
-	dfield = dtuple_get_nth_field(entry, 6/*CLUSTER_NAME*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__CLUSTER_ID);
 	dfield_set_null(dfield); /* not supported */
 
 	/* 9: SPACE ----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 7/*SPACE*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_TABLES__SPACE);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, table->space);
 
 	dfield_set_data(dfield, ptr, 4);
@@ -171,49 +182,57 @@ dict_create_sys_columns_tuple(
 	dict_table_copy_types(entry, sys_columns);
 
 	/* 0: TABLE_ID -----------------------*/
-	dfield = dtuple_get_nth_field(entry, 0/*TABLE_ID*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__TABLE_ID);
 
-	ptr = mem_heap_alloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(ptr, table->id);
 
 	dfield_set_data(dfield, ptr, 8);
+
 	/* 1: POS ----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 1/*POS*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__POS);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, i);
 
 	dfield_set_data(dfield, ptr, 4);
+
+	/* 2: DB_TRX_ID added later */
+	/* 3: DB_ROLL_PTR added later */
 	/* 4: NAME ---------------------------*/
-	dfield = dtuple_get_nth_field(entry, 2/*NAME*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__NAME);
 
 	col_name = dict_table_get_col_name(table, i);
 	dfield_set_data(dfield, col_name, ut_strlen(col_name));
+
 	/* 5: MTYPE --------------------------*/
-	dfield = dtuple_get_nth_field(entry, 3/*MTYPE*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__MTYPE);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, column->mtype);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 6: PRTYPE -------------------------*/
-	dfield = dtuple_get_nth_field(entry, 4/*PRTYPE*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PRTYPE);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, column->prtype);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 7: LEN ----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 5/*LEN*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__LEN);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, column->len);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 8: PREC ---------------------------*/
-	dfield = dtuple_get_nth_field(entry, 6/*PREC*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PREC);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, 0/* unused */);
 
 	dfield_set_data(dfield, ptr, 4);
@@ -235,32 +254,24 @@ dict_build_table_def_step(
 	dict_table_t*	table;
 	dtuple_t*	row;
 	ulint		error;
-	ulint		flags;
 	const char*	path_or_name;
 	ibool		is_path;
 	mtr_t		mtr;
 	ulint		space = 0;
-	ibool		file_per_table;
+	bool		use_tablespace;
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
 	table = node->table;
-
-	/* Cache the global variable "srv_file_per_table" to
-	a local variable before using it. Please note
-	"srv_file_per_table" is not under dict_sys mutex
-	protection, and could be changed while executing
-	this function. So better to cache the current value
-	to a local variable, and all future reference to
-	"srv_file_per_table" should use this local variable. */
-	file_per_table = srv_file_per_table;
+	use_tablespace = !!(table->flags2 & DICT_TF2_USE_TABLESPACE);
 
 	dict_hdr_get_new_id(&table->id, NULL, NULL);
 
 	thr_get_trx(thr)->table_id = table->id;
 
-	if (file_per_table) {
-		/* Get a new space id if srv_file_per_table is set */
+	if (use_tablespace) {
+		/* This table will not use the system tablespace.
+		Get a new space id. */
 		dict_hdr_get_new_id(NULL, NULL, &space);
 
 		if (UNIV_UNLIKELY(space == ULINT_UNDEFINED)) {
@@ -286,14 +297,14 @@ dict_build_table_def_step(
 			is_path = FALSE;
 		}
 
-		ut_ad(dict_table_get_format(table) <= DICT_TF_FORMAT_MAX);
+		ut_ad(dict_table_get_format(table) <= UNIV_FORMAT_MAX);
 		ut_ad(!dict_table_zip_size(table)
-		      || dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
+		      || dict_table_get_format(table) >= UNIV_FORMAT_B);
 
-		flags = table->flags & ~(~0 << DICT_TF_BITS);
 		error = fil_create_new_single_table_tablespace(
 			space, path_or_name, is_path,
-			flags == DICT_TF_COMPACT ? 0 : flags,
+			dict_tf_to_fsp_flags(table->flags),
+			table->flags2,
 			FIL_IBD_FILE_INITIAL_SIZE);
 		table->space = (unsigned int) space;
 
@@ -308,8 +319,10 @@ dict_build_table_def_step(
 
 		mtr_commit(&mtr);
 	} else {
-		/* Create in the system tablespace: disallow new features */
-		table->flags &= (~0 << DICT_TF_BITS) | DICT_TF_COMPACT;
+		/* Create in the system tablespace: disallow Barracuda
+		features by keeping only the first bit which says whether
+		the row format is redundant or compact */
+		table->flags &= DICT_TF_COMPACT;
 	}
 
 	row = dict_create_sys_tables_tuple(table, node->heap);
@@ -369,61 +382,69 @@ dict_create_sys_indexes_tuple(
 	dict_table_copy_types(entry, sys_indexes);
 
 	/* 0: TABLE_ID -----------------------*/
-	dfield = dtuple_get_nth_field(entry, 0/*TABLE_ID*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__TABLE_ID);
 
-	ptr = mem_heap_alloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(ptr, table->id);
 
 	dfield_set_data(dfield, ptr, 8);
+
 	/* 1: ID ----------------------------*/
-	dfield = dtuple_get_nth_field(entry, 1/*ID*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__ID);
 
-	ptr = mem_heap_alloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(ptr, index->id);
 
 	dfield_set_data(dfield, ptr, 8);
+
+	/* 2: DB_TRX_ID added later */
+	/* 3: DB_ROLL_PTR added later */
 	/* 4: NAME --------------------------*/
-	dfield = dtuple_get_nth_field(entry, 2/*NAME*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__NAME);
 
 	dfield_set_data(dfield, index->name, ut_strlen(index->name));
+
 	/* 5: N_FIELDS ----------------------*/
-	dfield = dtuple_get_nth_field(entry, 3/*N_FIELDS*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__N_FIELDS);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, index->n_fields);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/* 6: TYPE --------------------------*/
-	dfield = dtuple_get_nth_field(entry, 4/*TYPE*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__TYPE);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, index->type);
 
 	dfield_set_data(dfield, ptr, 4);
-	/* 7: SPACE --------------------------*/
 
-#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 7
-#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 7"
-#endif
+	/* 7: SPACE --------------------------*/
 
-	dfield = dtuple_get_nth_field(entry, 5/*SPACE*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__SPACE);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, index->space);
 
 	dfield_set_data(dfield, ptr, 4);
-	/* 8: PAGE_NO --------------------------*/
 
-#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 8
-#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 8"
-#endif
+	/* 8: PAGE_NO --------------------------*/
 
-	dfield = dtuple_get_nth_field(entry, 6/*PAGE_NO*/);
+	dfield = dtuple_get_nth_field(
+		entry, DICT_COL__SYS_INDEXES__PAGE_NO);
 
-	ptr = mem_heap_alloc(heap, 4);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 	mach_write_to_4(ptr, FIL_NULL);
 
 	dfield_set_data(dfield, ptr, 4);
+
 	/*--------------------------------*/
 
 	return(entry);
@@ -438,7 +459,7 @@ dtuple_t*
 dict_create_sys_fields_tuple(
 /*=========================*/
 	const dict_index_t*	index,	/*!< in: index */
-	ulint			i,	/*!< in: field number */
+	ulint			fld_no,	/*!< in: field number */
 	mem_heap_t*		heap)	/*!< in: memory heap from
 					which the memory for the built
 					tuple is allocated */
@@ -461,7 +482,7 @@ dict_create_sys_fields_tuple(
 		}
 	}
 
-	field = dict_index_get_nth_field(index, i);
+	field = dict_index_get_nth_field(index, fld_no);
 
 	sys_fields = dict_sys->sys_fields;
 
@@ -470,35 +491,39 @@ dict_create_sys_fields_tuple(
 	dict_table_copy_types(entry, sys_fields);
 
 	/* 0: INDEX_ID -----------------------*/
-	dfield = dtuple_get_nth_field(entry, 0/*INDEX_ID*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__INDEX_ID);
 
-	ptr = mem_heap_alloc(heap, 8);
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(ptr, index->id);
 
 	dfield_set_data(dfield, ptr, 8);
-	/* 1: POS + PREFIX LENGTH ----------------------------*/
 
-	dfield = dtuple_get_nth_field(entry, 1/*POS*/);
+	/* 1: POS; FIELD NUMBER & PREFIX LENGTH -----------------------*/
 
-	ptr = mem_heap_alloc(heap, 4);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__POS);
+
+	ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	if (index_contains_column_prefix_field) {
 		/* If there are column prefix fields in the index, then
 		we store the number of the field to the 2 HIGH bytes
 		and the prefix length to the 2 low bytes, */
 
-		mach_write_to_4(ptr, (i << 16) + field->prefix_len);
+		mach_write_to_4(ptr, (fld_no << 16) + field->prefix_len);
 	} else {
 		/* Else we store the number of the field to the 2 LOW bytes.
 		This is to keep the storage format compatible with
 		InnoDB versions < 4.0.14. */
 
-		mach_write_to_4(ptr, i);
+		mach_write_to_4(ptr, fld_no);
 	}
 
 	dfield_set_data(dfield, ptr, 4);
+
+	/* 2: DB_TRX_ID added later */
+	/* 3: DB_ROLL_PTR added later */
 	/* 4: COL_NAME -------------------------*/
-	dfield = dtuple_get_nth_field(entry, 2/*COL_NAME*/);
+	dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__COL_NAME);
 
 	dfield_set_data(dfield, field->name,
 			ut_strlen(field->name));
@@ -638,6 +663,11 @@ dict_create_index_tree_step(
 
 	sys_indexes = dict_sys->sys_indexes;
 
+	if (index->type == DICT_FTS) {
+		/* FTS index does not need an index tree */
+		return(DB_SUCCESS);
+	}
+
 	/* Run a mini-transaction in which the index tree is allocated for
 	the index and its root address is written to the index entry in
 	sys_indexes */
@@ -657,10 +687,10 @@ dict_create_index_tree_step(
 	node->page_no = btr_create(index->type, index->space, zip_size,
 				   index->id, index, &mtr);
 	/* printf("Created a new index tree in space %lu root page %lu\n",
-	index->space, index->page_no); */
+	index->space, node->page_no); */
 
 	page_rec_write_field(btr_pcur_get_rec(&pcur),
-			     DICT_SYS_INDEXES_PAGE_NO_FIELD,
+			     DICT_FLD__SYS_INDEXES__PAGE_NO,
 			     node->page_no, &mtr);
 	btr_pcur_close(&pcur);
 	mtr_commit(&mtr);
@@ -691,7 +721,8 @@ dict_drop_index_tree(
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
-	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);
+	ptr = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
 
 	ut_ad(len == 4);
 
@@ -703,8 +734,8 @@ dict_drop_index_tree(
 		return;
 	}
 
-	ptr = rec_get_nth_field_old(rec,
-				    DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);
+	ptr = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
 
 	ut_ad(len == 4);
 
@@ -731,7 +762,7 @@ dict_drop_index_tree(
 	root_page_no); */
 	btr_free_root(space, zip_size, root_page_no, mtr);
 
-	page_rec_write_field(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+	page_rec_write_field(rec, DICT_FLD__SYS_INDEXES__PAGE_NO,
 			     FIL_NULL, mtr);
 }
 
@@ -767,7 +798,8 @@ dict_truncate_index_tree(
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
 	rec = btr_pcur_get_rec(pcur);
-	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);
+	ptr = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
 
 	ut_ad(len == 4);
 
@@ -782,8 +814,8 @@ dict_truncate_index_tree(
 		drop = FALSE;
 	}
 
-	ptr = rec_get_nth_field_old(rec,
-				    DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);
+	ptr = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
 
 	ut_ad(len == 4);
 
@@ -803,12 +835,12 @@ dict_truncate_index_tree(
 		return(FIL_NULL);
 	}
 
-	ptr = rec_get_nth_field_old(rec,
-				    DICT_SYS_INDEXES_TYPE_FIELD, &len);
+	ptr = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__TYPE, &len);
 	ut_ad(len == 4);
 	type = mach_read_from_4(ptr);
 
-	ptr = rec_get_nth_field_old(rec, 1, &len);
+	ptr = rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__ID, &len);
 	ut_ad(len == 8);
 	index_id = mach_read_from_8(ptr);
 
@@ -835,7 +867,7 @@ create:
 	in SYS_INDEXES, so that the database will not get into an
 	inconsistent state in case it crashes between the mtr_commit()
 	below and the following mtr_commit() call. */
-	page_rec_write_field(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+	page_rec_write_field(rec, DICT_FLD__SYS_INDEXES__PAGE_NO,
 			     FIL_NULL, mtr);
 
 	/* We will need to commit the mini-transaction in order to avoid
@@ -882,7 +914,8 @@ tab_create_graph_create(
 {
 	tab_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(tab_node_t));
+	node = static_cast<tab_node_t*>(
+		mem_heap_alloc(heap, sizeof(tab_node_t)));
 
 	node->common.type = QUE_NODE_CREATE_TABLE;
 
@@ -899,7 +932,7 @@ tab_create_graph_create(
 					heap);
 	node->col_def->common.parent = node;
 
-	node->commit_node = commit_node_create(heap);
+	node->commit_node = trx_commit_node_create(heap);
 	node->commit_node->common.parent = node;
 
 	return(node);
@@ -918,7 +951,8 @@ ind_create_graph_create(
 {
 	ind_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(ind_node_t));
+	node = static_cast<ind_node_t*>(
+		mem_heap_alloc(heap, sizeof(ind_node_t)));
 
 	node->common.type = QUE_NODE_CREATE_INDEX;
 
@@ -936,7 +970,7 @@ ind_create_graph_create(
 					  dict_sys->sys_fields, heap);
 	node->field_def->common.parent = node;
 
-	node->commit_node = commit_node_create(heap);
+	node->commit_node = trx_commit_node_create(heap);
 	node->commit_node->common.parent = node;
 
 	return(node);
@@ -960,7 +994,7 @@ dict_create_table_step(
 
 	trx = thr_get_trx(thr);
 
-	node = thr->run_node;
+	node = static_cast<tab_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_TABLE);
 
@@ -1023,13 +1057,13 @@ dict_create_table_step(
 
 	if (node->state == TABLE_ADD_TO_CACHE) {
 
-		dict_table_add_to_cache(node->table, node->heap);
+		dict_table_add_to_cache(node->table, TRUE, node->heap);
 
 		err = DB_SUCCESS;
 	}
 
 function_exit:
-	trx->error_state = err;
+	trx->error_state = (enum db_err) err;
 
 	if (err == DB_SUCCESS) {
 		/* Ok: do nothing */
@@ -1067,7 +1101,7 @@ dict_create_index_step(
 
 	trx = thr_get_trx(thr);
 
-	node = thr->run_node;
+	node = static_cast<ind_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX);
 
@@ -1121,7 +1155,7 @@ dict_create_index_step(
 			node->table, node->index, FIL_NULL,
 			trx_is_strict(trx)
 			|| dict_table_get_format(node->table)
-			>= DICT_TF_FORMAT_ZIP);
+			>= UNIV_FORMAT_B);
 
 		node->index = dict_index_get_if_in_cache_low(index_id);
 		ut_a(!node->index == (err != DB_SUCCESS));
@@ -1163,7 +1197,7 @@ dict_create_index_step(
 	}
 
 function_exit:
-	trx->error_state = err;
+	trx->error_state = static_cast<enum db_err>(err);
 
 	if (err == DB_SUCCESS) {
 		/* Ok: do nothing */
@@ -1183,6 +1217,46 @@ function_exit:
 }
 
 /****************************************************************//**
+Check whether the system foreign key tables exist. Additionally, If
+they exist then move them to non-LRU end of the table LRU list.
+@return TRUE if they exist. */
+static
+ibool
+dict_check_sys_foreign_tables_exist(void)
+/*=====================================*/
+{
+	dict_table_t*	sys_foreign;
+	ibool		exists = FALSE;
+	dict_table_t*	sys_foreign_cols;
+
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
+
+	mutex_enter(&dict_sys->mutex);
+
+	sys_foreign = dict_table_get_low("SYS_FOREIGN");
+	sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS");
+
+	if (sys_foreign != NULL
+	    && sys_foreign_cols != NULL
+	    && UT_LIST_GET_LEN(sys_foreign->indexes) == 3
+	    && UT_LIST_GET_LEN(sys_foreign_cols->indexes) == 1) {
+
+		/* Foreign constraint system tables have already been
+		created, and they are ok. Ensure that they can't be
+		evicted from the table LRU cache.  */
+
+		dict_table_move_from_lru_to_non_lru(sys_foreign);
+		dict_table_move_from_lru_to_non_lru(sys_foreign_cols);
+
+		exists = TRUE;
+	}
+
+	mutex_exit(&dict_sys->mutex);
+
+	return(exists);
+}
+
+/****************************************************************//**
 Creates the foreign key constraints system tables inside InnoDB
 at database creation or database start if they are not found or are
 not of the right form.
@@ -1192,47 +1266,39 @@ ulint
 dict_create_or_check_foreign_constraint_tables(void)
 /*================================================*/
 {
-	dict_table_t*	table1;
-	dict_table_t*	table2;
-	ulint		error;
 	trx_t*		trx;
+	ulint		error;
+	ibool		success;
+	ibool		srv_file_per_table_backup;
 
-	mutex_enter(&(dict_sys->mutex));
-
-	table1 = dict_table_get_low("SYS_FOREIGN");
-	table2 = dict_table_get_low("SYS_FOREIGN_COLS");
-
-	if (table1 && table2
-	    && UT_LIST_GET_LEN(table1->indexes) == 3
-	    && UT_LIST_GET_LEN(table2->indexes) == 1) {
-
-		/* Foreign constraint system tables have already been
-		created, and they are ok */
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
 
-		mutex_exit(&(dict_sys->mutex));
+	/* Note: The master thread has not been started at this point. */
 
+	if (dict_check_sys_foreign_tables_exist()) {
 		return(DB_SUCCESS);
 	}
 
-	mutex_exit(&(dict_sys->mutex));
-
 	trx = trx_allocate_for_mysql();
 
 	trx->op_info = "creating foreign key sys tables";
 
 	row_mysql_lock_data_dictionary(trx);
 
-	if (table1) {
+	/* Check which incomplete table definition to drop. */
+
+	if (dict_table_get_low("SYS_FOREIGN") != NULL) {
 		fprintf(stderr,
 			"InnoDB: dropping incompletely created"
 			" SYS_FOREIGN table\n");
 		row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
 	}
 
-	if (table2) {
+	if (dict_table_get_low("SYS_FOREIGN_COLS") != NULL) {
 		fprintf(stderr,
 			"InnoDB: dropping incompletely created"
 			" SYS_FOREIGN_COLS table\n");
+
 		row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);
 	}
 
@@ -1249,6 +1315,13 @@ dict_create_or_check_foreign_constraint_tables(void)
 	VARBINARY, like in other InnoDB system tables, to get a clean
 	design. */
 
+	srv_file_per_table_backup = (ibool) srv_file_per_table;
+
+	/* We always want SYSTEM tables to be created inside the system
+	tablespace. */
+
+	srv_file_per_table = 0;
+
 	error = que_eval_sql(NULL,
 			     "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n"
 			     "BEGIN\n"
@@ -1300,6 +1373,14 @@ dict_create_or_check_foreign_constraint_tables(void)
 			" created\n");
 	}
 
+	/* Note: The master thread has not been started at this point. */
+	/* Confirm and move to the non-LRU part of the table LRU list. */
+
+	success = dict_check_sys_foreign_tables_exist();
+	ut_a(success);
+
+	srv_file_per_table = (my_bool) srv_file_per_table_backup;
+
 	return(error);
 }
 
@@ -1428,8 +1509,12 @@ dict_create_add_foreign_to_dictionary(
 
 	if (foreign->id == NULL) {
 		/* Generate a new constraint id */
+		char*	id;
 		ulint	namelen	= strlen(table->name);
-		char*	id	= mem_heap_alloc(foreign->heap, namelen + 20);
+
+		id = static_cast<char*>(mem_heap_alloc(
+				foreign->heap, namelen + 20));
+
 		/* no overflow if number < 1e13 */
 		sprintf(id, "%s_ibfk_%lu", table->name, (ulong) (*id_nr)++);
 		foreign->id = id;
@@ -1468,12 +1553,11 @@ dict_create_add_foreign_to_dictionary(
 		}
 	}
 
-	error = dict_foreign_eval_sql(NULL,
-				      "PROCEDURE P () IS\n"
-				      "BEGIN\n"
-				      "COMMIT WORK;\n"
-				      "END;\n"
-				      , table, foreign, trx);
+	trx->op_info = "committing foreign key definitions";
+
+	trx_commit(trx);
+
+	trx->op_info = "";
 
 	return(error);
 }
diff --git a/storage/innobase/dict/dict0dict.c b/storage/innobase/dict/dict0dict.cc
index 5be94a10374..c97207c92be 100644
--- a/storage/innobase/dict/dict0dict.c
+++ b/storage/innobase/dict/dict0dict.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,16 +17,18 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /******************************************************************//**
-@file dict/dict0dict.c
+@file dict/dict0dict.cc
 Data dictionary system
 
 Created 1/8/1996 Heikki Tuuri
 ***********************************************************************/
 
 #include "dict0dict.h"
+#include "fts0fts.h"
 
 #ifdef UNIV_NONINL
 #include "dict0dict.ic"
+#include "dict0priv.ic"
 #endif
 
 /** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */
@@ -41,6 +43,7 @@ UNIV_INTERN dict_index_t*	dict_ind_compact;
 #include "dict0boot.h"
 #include "dict0mem.h"
 #include "dict0crea.h"
+#include "dict0stats.h"
 #include "trx0undo.h"
 #include "btr0btr.h"
 #include "btr0cur.h"
@@ -51,12 +54,16 @@ UNIV_INTERN dict_index_t*	dict_ind_compact;
 #include "pars0sym.h"
 #include "que0que.h"
 #include "rem0cmp.h"
+#include "fts0fts.h"
+#include "fts0types.h"
 #include "row0merge.h"
 #include "m_ctype.h" /* my_isspace() */
-#include "ha_prototypes.h" /* innobase_strcasecmp(), innobase_casedn_str()*/
+#include "ha_prototypes.h" /* innobase_strcasecmp(), innobase_casedn_str() */
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "lock0lock.h"
+#include "dict0priv.h"
 #include "row0upd.h"
-#include "m_string.h"
-#include "my_sys.h"
 
 #include <ctype.h>
 
@@ -141,6 +148,15 @@ dict_index_build_internal_non_clust(
 	dict_index_t*		index);	/*!< in: user representation of
 					a non-clustered index */
 /**********************************************************************//**
+Builds the internal dictionary cache representation for an FTS index.
+@return	own: the internal representation of the FTS index */
+static
+dict_index_t*
+dict_index_build_internal_fts(
+/*==========================*/
+	dict_table_t*	table,	/*!< in: table */
+	dict_index_t*	index);	/*!< in: user representation of an FTS index */
+/**********************************************************************//**
 Removes a foreign constraint struct from the dictionary cache. */
 static
 void
@@ -177,6 +193,50 @@ dict_foreign_free(
 /*==============*/
 	dict_foreign_t*	foreign);	/*!< in, own: foreign key struct */
 
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+static
+void
+dict_index_remove_from_cache_low(
+/*=============================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	dict_index_t*	index,		/*!< in, own: index */
+	ibool		lru_evict);	/*!< in: TRUE if page being evicted
+					to make room in the table LRU list */
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+static
+void
+dict_table_remove_from_cache_low(
+/*=============================*/
+	dict_table_t*	table,		/*!< in, own: table */
+	ibool		lru_evict);	/*!< in: TRUE if evicting from LRU */
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate the dictionary table LRU list.
+@return TRUE if validate OK */
+UNIV_INTERN
+ibool
+dict_lru_validate(void);
+/*===================*/
+/**********************************************************************//**
+Check if table is in the dictionary table LRU list.
+@return TRUE if table found */
+UNIV_INTERN
+ibool
+dict_lru_find_table(
+/*================*/
+	const dict_table_t*	find_table);	/*!< in: table to find */
+/**********************************************************************//**
+Check if a table exists in the dict table non-LRU list.
+@return TRUE if table found */
+static
+ibool
+dict_non_lru_find_table(
+/*====================*/
+	const dict_table_t*	find_table);	/*!< in: table to find */
+#endif /* UNIV_DEBUG */
+
 /* Stream for storing detailed information about the latest foreign key
 and unique key errors */
 UNIV_INTERN FILE*	dict_foreign_err_file		= NULL;
@@ -329,11 +389,11 @@ dict_table_stats_unlock(
 }
 
 /********************************************************************//**
-Decrements the count of open MySQL handles to a table. */
+Decrements the count of open handles to a table. */
 UNIV_INTERN
 void
-dict_table_decrement_handle_count(
-/*==============================*/
+dict_table_close(
+/*=============*/
 	dict_table_t*	table,		/*!< in/out: table */
 	ibool		dict_locked)	/*!< in: TRUE=data dictionary locked */
 {
@@ -342,9 +402,21 @@ dict_table_decrement_handle_count(
 	}
 
 	ut_ad(mutex_own(&dict_sys->mutex));
-	ut_a(table->n_mysql_handles_opened > 0);
+	ut_a(table->n_ref_count > 0);
+
+	--table->n_ref_count;
 
-	table->n_mysql_handles_opened--;
+	MONITOR_DEC(MONITOR_TABLE_REFERENCE);
+
+	ut_ad(dict_lru_validate());
+
+#ifdef UNIV_DEBUG
+	if (table->can_be_evicted) {
+		ut_ad(dict_lru_find_table(table));
+	} else {
+		ut_ad(dict_non_lru_find_table(table));
+	}
+#endif /* UNIV_DEBUG */
 
 	if (!dict_locked) {
 		mutex_exit(&dict_sys->mutex);
@@ -406,6 +478,33 @@ dict_table_autoinc_initialize(
 	table->autoinc = value;
 }
 
+/************************************************************************
+Get all the FTS indexes on a table.
+@return	number of FTS indexes */
+UNIV_INTERN
+ulint
+dict_table_get_all_fts_indexes(
+/*===========================*/
+	dict_table_t*   table,          /*!< in: table */
+	ib_vector_t*    indexes)        /*!< out: all FTS indexes on this
+					table */
+{
+	dict_index_t* index;
+
+	ut_a(ib_vector_size(indexes) == 0);
+
+	for (index = dict_table_get_first_index(table);
+	     index;
+	     index = dict_table_get_next_index(index)) {
+
+		if (index->type == DICT_FTS) {
+			ib_vector_push(indexes, &index);
+		}
+	}
+
+	return(ib_vector_size(indexes));
+}
+
 /********************************************************************//**
 Reads the next autoinc value (== autoinc counter value), 0 if not yet
 initialized.
@@ -453,8 +552,8 @@ dict_table_autoinc_unlock(
 
 /**********************************************************************//**
 Looks for an index with the given table and index id.
-NOTE that we do not reserve the dictionary mutex.
-@return	index or NULL if not found from cache */
+Note: Does not reserve the dictionary mutex.
+@return	index or NULL if not found in cache */
 UNIV_INTERN
 dict_index_t*
 dict_index_get_on_id_low(
@@ -464,16 +563,15 @@ dict_index_get_on_id_low(
 {
 	dict_index_t*	index;
 
-	index = dict_table_get_first_index(table);
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
 
-	while (index) {
 		if (id == index->id) {
 			/* Found */
 
 			return(index);
 		}
-
-		index = dict_table_get_next_index(index);
 	}
 
 	return(NULL);
@@ -486,10 +584,12 @@ Looks for column n in an index.
 ULINT_UNDEFINED if not contained */
 UNIV_INTERN
 ulint
-dict_index_get_nth_col_pos(
-/*=======================*/
-	const dict_index_t*	index,	/*!< in: index */
-	ulint			n)	/*!< in: column number */
+dict_index_get_nth_col_or_prefix_pos(
+/*=================================*/
+	const dict_index_t*	index,		/*!< in: index */
+	ulint			n,		/*!< in: column number */
+	ibool			inc_prefix)	/*!< in: TRUE=consider
+						column prefixes too */
 {
 	const dict_field_t*	field;
 	const dict_col_t*	col;
@@ -511,7 +611,8 @@ dict_index_get_nth_col_pos(
 	for (pos = 0; pos < n_fields; pos++) {
 		field = dict_index_get_nth_field(index, pos);
 
-		if (col == field->col && field->prefix_len == 0) {
+		if (col == field->col
+		    && (inc_prefix || field->prefix_len == 0)) {
 
 			return(pos);
 		}
@@ -607,28 +708,35 @@ Returns a table object based on table id.
 @return	table, NULL if does not exist */
 UNIV_INTERN
 dict_table_t*
-dict_table_get_on_id(
-/*=================*/
+dict_table_open_on_id(
+/*==================*/
 	table_id_t	table_id,	/*!< in: table id */
-	trx_t*		trx)		/*!< in: transaction handle */
+	ibool		dict_locked)	/*!< in: TRUE=data dictionary locked */
 {
 	dict_table_t*	table;
 
-	if (trx->dict_operation_lock_mode == RW_X_LATCH) {
+	if (!dict_locked) {
+		mutex_enter(&dict_sys->mutex);
+	}
 
-		/* Note: An X latch implies that the transaction
-		already owns the dictionary mutex. */
+	ut_ad(mutex_own(&dict_sys->mutex));
 
-		ut_ad(mutex_own(&dict_sys->mutex));
+	table = dict_table_open_on_id_low(table_id);
 
-		return(dict_table_get_on_id_low(table_id));
-	}
+	if (table != NULL) {
 
-	mutex_enter(&(dict_sys->mutex));
+		if (table->can_be_evicted) {
+			dict_move_to_mru(table);
+		}
 
-	table = dict_table_get_on_id_low(table_id);
+		++table->n_ref_count;
 
-	mutex_exit(&(dict_sys->mutex));
+		MONITOR_INC(MONITOR_TABLE_REFERENCE);
+	}
+
+	if (!dict_locked) {
+		mutex_exit(&dict_sys->mutex);
+	}
 
 	return(table);
 }
@@ -693,7 +801,7 @@ dict_init(void)
 {
 	int	i;
 
-	dict_sys = mem_alloc(sizeof(dict_sys_t));
+	dict_sys = static_cast<dict_sys_t*>(mem_zalloc(sizeof(*dict_sys)));
 
 	mutex_create(dict_sys_mutex_key, &dict_sys->mutex, SYNC_DICT);
 
@@ -703,10 +811,6 @@ dict_init(void)
 	dict_sys->table_id_hash = hash_create(buf_pool_get_curr_size()
 					      / (DICT_POOL_PER_TABLE_HASH
 						 * UNIV_WORD_SIZE));
-	dict_sys->size = 0;
-
-	UT_LIST_INIT(dict_sys->table_LRU);
-
 	rw_lock_create(dict_operation_lock_key,
 		       &dict_operation_lock, SYNC_DICT_OPERATION);
 
@@ -714,7 +818,7 @@ dict_init(void)
 	ut_a(dict_foreign_err_file);
 
 	mutex_create(dict_foreign_err_mutex_key,
-		     &dict_foreign_err_mutex, SYNC_ANY_LATCH);
+		     &dict_foreign_err_mutex, SYNC_NO_ORDER_CHECK);
 
 	for (i = 0; i < DICT_TABLE_STATS_LATCHES_SIZE; i++) {
 		rw_lock_create(dict_table_stats_latch_key,
@@ -723,41 +827,148 @@ dict_init(void)
 }
 
 /**********************************************************************//**
-Returns a table object and optionally increment its MySQL open handle count.
-NOTE! This is a high-level function to be used mainly from outside the
-'dict' directory. Inside this directory dict_table_get_low is usually the
-appropriate function.
-@return	table, NULL if does not exist */
+Move to the most recently used segment of the LRU list. */
 UNIV_INTERN
+void
+dict_move_to_mru(
+/*=============*/
+	dict_table_t*	table)		/*!< in: table to move to MRU */
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(dict_lru_validate());
+	ut_ad(dict_lru_find_table(table));
+
+	ut_a(table->can_be_evicted);
+
+	UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+
+	UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+
+	ut_ad(dict_lru_validate());
+}
+
+/**********************************************************************//**
+Returns a table object and increments its open handle count.
+@return	table, NULL if does not exist */
+static
 dict_table_t*
-dict_table_get(
-/*===========*/
+dict_table_open_on_name_low(
+/*========================*/
 	const char*	table_name,	/*!< in: table name */
-	ibool		inc_mysql_count)/*!< in: whether to increment the open
-					handle count on the table */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
+	dict_err_ignore_t
+			ignore_err)	/*!< in: error to be ignored when
+					loading a table definition */
 {
 	dict_table_t*	table;
 
-	mutex_enter(&(dict_sys->mutex));
+	if (!dict_locked) {
+		mutex_enter(&(dict_sys->mutex));
+	}
 
-	table = dict_table_get_low(table_name);
+	ut_ad(table_name);
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	table = dict_table_check_if_in_cache_low(table_name);
 
-	if (inc_mysql_count && table) {
-		table->n_mysql_handles_opened++;
+	if (table == NULL) {
+		table = dict_load_table(table_name, TRUE, ignore_err);
 	}
 
-	mutex_exit(&(dict_sys->mutex));
+	ut_ad(!table || table->cached);
+
+	if (table != NULL) {
+
+		/* If table is corrupted, return NULL */
+		if (ignore_err == DICT_ERR_IGNORE_NONE
+		    && table->corrupted) {
+
+			/* Make life easy for drop table. */
+			if (table->can_be_evicted) {
+				dict_table_move_from_lru_to_non_lru(table);
+			}
+
+			if (!dict_locked) {
+				mutex_exit(&dict_sys->mutex);
+			}
+
+			ut_print_timestamp(stderr);
+
+			fprintf(stderr, "  InnoDB: table ");
+			ut_print_name(stderr, NULL, TRUE, table->name);
+			fprintf(stderr, "is corrupted. Please drop the table "
+				"and recreate\n");
+
+			return(NULL);
+		}
+
+		if (table->can_be_evicted) {
+			dict_move_to_mru(table);
+		}
+
+		++table->n_ref_count;
+
+		MONITOR_INC(MONITOR_TABLE_REFERENCE);
+	}
+
+	ut_ad(dict_lru_validate());
+
+	if (!dict_locked) {
+		mutex_exit(&(dict_sys->mutex));
+	}
+
+	return(table);
+}
+
+/**********************************************************************//**
+Returns a table object and increment its open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low
+is usually the appropriate function.
+@return	table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_open_on_name(
+/*====================*/
+	const char*	table_name,	/*!< in: table name */
+	ibool		dict_locked)	/*!< in: TRUE=data dictionary locked */
+{
+	dict_table_t*	table;
+
+	table = dict_table_open_on_name_low(table_name, dict_locked,
+					    DICT_ERR_IGNORE_NONE);
 
 	if (table != NULL) {
 		/* If table->ibd_file_missing == TRUE, this will
 		print an error message and return without doing
 		anything. */
-		dict_update_statistics(table, TRUE /* only update stats
-				       if they have not been initialized */);
+		dict_stats_update(table,
+				  DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY,
+				  dict_locked);
 	}
 
 	return(table);
 }
+
+/**********************************************************************//**
+Returns a table object and increment its open handle count. Table
+statistics will not be updated if they are not initialized.
+Call this function when dropping a table.
+@return	table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_open_on_name_no_stats(
+/*=============================*/
+	const char*	table_name,	/*!< in: table name */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
+	dict_err_ignore_t
+			ignore_err)	/*!< in: error to be ignored during
+					table open */
+{
+	return(dict_table_open_on_name_low(table_name, dict_locked,
+					   ignore_err));
+}
+
 #endif /* !UNIV_HOTBACKUP */
 
 /**********************************************************************//**
@@ -813,14 +1024,17 @@ UNIV_INTERN
 void
 dict_table_add_to_cache(
 /*====================*/
-	dict_table_t*	table,	/*!< in: table */
-	mem_heap_t*	heap)	/*!< in: temporary heap */
+	dict_table_t*	table,		/*!< in: table */
+	ibool		can_be_evicted,	/*!< in: TRUE if can be evicted */
+	mem_heap_t*	heap)		/*!< in: temporary heap */
 {
 	ulint	fold;
 	ulint	id_fold;
 	ulint	i;
 	ulint	row_len;
 
+	ut_ad(dict_lru_validate());
+
 	/* The lower limit for what we consider a "big" row */
 #define BIG_ROW_SIZE 1024
 
@@ -892,18 +1106,215 @@ dict_table_add_to_cache(
 	/* Add table to hash table of tables based on table id */
 	HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, id_fold,
 		    table);
-	/* Add table to LRU list of tables */
-	UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+
+	table->can_be_evicted = can_be_evicted;
+
+	if (table->can_be_evicted) {
+		UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+	} else {
+		UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_non_LRU, table);
+	}
+
+	ut_ad(dict_lru_validate());
 
 	dict_sys->size += mem_heap_get_size(table->heap)
 		+ strlen(table->name) + 1;
 }
 
 /**********************************************************************//**
+Test whether a table can be evicted from the LRU cache.
+@return TRUE if table can be evicted. */
+static
+ibool
+dict_table_can_be_evicted(
+/*======================*/
+	const dict_table_t*	table)		/*!< in: table to test */
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_a(table->can_be_evicted);
+	ut_a(UT_LIST_GET_LEN(table->foreign_list) == 0);
+	ut_a(UT_LIST_GET_LEN(table->referenced_list) == 0);
+
+	if (table->n_ref_count == 0) {
+		dict_index_t*	index;
+
+		/* The transaction commit and rollback are called from
+		outside the handler interface. This means that there is
+		a window where the table->n_ref_count can be zero but
+		the table instance is in "use". */
+
+		if (lock_table_has_locks(table)) {
+			return(FALSE);
+		}
+
+		for (index = dict_table_get_first_index(table);
+		     index != NULL;
+		     index = dict_table_get_next_index(index)) {
+
+			btr_search_t*	info = index->search_info;
+
+			/* We are not allowed to free the in-memory index
+			struct dict_index_t until all entries in the adaptive
+			hash index that point to any of the page belonging to
+			his b-tree index are dropped. This is so because
+			dropping of these entries require access to
+			dict_index_t struct. To avoid such scenario we keep
+			a count of number of such pages in the search_info and
+			only free the dict_index_t struct when this count
+			drops to zero.
+
+			See also: dict_index_remove_from_cache_low() */
+
+			if (btr_search_info_get_ref_count(info) > 0) {
+				return(FALSE);
+			}
+		}
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Make room in the table cache by evicting an unused table. The unused table
+should not be part of FK relationship and currently not used in any user
+transaction. There is no guarantee that it will remove a table.
+@return number of tables evicted. If the number of tables in the dict_LRU
+is less than max_tables it will not do anything. */
+UNIV_INTERN
+ulint
+dict_make_room_in_cache(
+/*====================*/
+	ulint		max_tables,	/*!< in: max tables allowed in cache */
+	ulint		pct_check)	/*!< in: max percent to check */
+{
+	ulint		i;
+	ulint		len;
+	dict_table_t*	table;
+	ulint		check_up_to;
+	ulint		n_evicted = 0;
+
+	ut_a(pct_check > 0);
+	ut_a(pct_check <= 100);
+	ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(dict_lru_validate());
+
+	i = len = UT_LIST_GET_LEN(dict_sys->table_LRU);
+
+	if (len < max_tables) {
+		return(0);
+	}
+
+	check_up_to = len - ((len * pct_check) / 100);
+
+	/* Check for overflow */
+	ut_a(i == 0 || check_up_to <= i);
+
+	/* Find a suitable candidate to evict from the cache. Don't scan the
+	entire LRU list. Only scan pct_check list entries. */
+
+	for (table = UT_LIST_GET_LAST(dict_sys->table_LRU);
+	     table != NULL
+	     && i > check_up_to
+	     && (len - n_evicted) > max_tables;
+	     --i) {
+
+		dict_table_t*	prev_table;
+
+	        prev_table = UT_LIST_GET_PREV(table_LRU, table);
+
+		if (dict_table_can_be_evicted(table)) {
+
+			dict_table_remove_from_cache_low(table, TRUE);
+
+			++n_evicted;
+		}
+
+		table = prev_table;
+	}
+
+	return(n_evicted);
+}
+
+/**********************************************************************//**
+Move a table to the non-LRU list from the LRU list. */
+UNIV_INTERN
+void
+dict_table_move_from_lru_to_non_lru(
+/*================================*/
+	dict_table_t*	table)	/*!< in: table to move from LRU to non-LRU */
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(dict_lru_find_table(table));
+
+	ut_a(table->can_be_evicted);
+
+	UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+
+	UT_LIST_ADD_LAST(table_LRU, dict_sys->table_non_LRU, table);
+
+	table->can_be_evicted = FALSE;
+}
+
+/**********************************************************************//**
+Move a table to the LRU list from the non-LRU list. */
+UNIV_INTERN
+void
+dict_table_move_from_non_lru_to_lru(
+/*================================*/
+	dict_table_t*	table)	/*!< in: table to move from non-LRU to LRU */
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(dict_non_lru_find_table(table));
+
+	ut_a(!table->can_be_evicted);
+
+	UT_LIST_REMOVE(table_LRU, dict_sys->table_non_LRU, table);
+
+	UT_LIST_ADD_LAST(table_LRU, dict_sys->table_LRU, table);
+
+	table->can_be_evicted = TRUE;
+}
+
+/**********************************************************************//**
+Looks for an index with the given id given a table instance.
+@return	index or NULL */
+static
+dict_index_t*
+dict_table_find_index_on_id(
+/*========================*/
+	const dict_table_t*	table,	/*!< in: table instance */
+	index_id_t		id)	/*!< in: index id */
+{
+	dict_index_t*	index;
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (id == index->id) {
+			/* Found */
+
+			return(index);
+		}
+	}
+
+	return(NULL);
+}
+
+/**********************************************************************//**
 Looks for an index with the given id. NOTE that we do not reserve
 the dictionary mutex: this function is for emergency purposes like
 printing info of a corrupt database page!
-@return	index or NULL if not found from cache */
+@return	index or NULL if not found in cache */
 UNIV_INTERN
 dict_index_t*
 dict_index_find_on_id_low(
@@ -911,29 +1322,32 @@ dict_index_find_on_id_low(
 	index_id_t	id)	/*!< in: index id */
 {
 	dict_table_t*	table;
-	dict_index_t*	index;
 
 	/* This can happen if the system tablespace is the wrong page size */
 	if (dict_sys == NULL) {
 		return(NULL);
 	}
 
-	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-	while (table) {
-		index = dict_table_get_first_index(table);
+		dict_index_t*	index = dict_table_find_index_on_id(table, id);
 
-		while (index) {
-			if (id == index->id) {
-				/* Found */
+		if (index != NULL) {
+			return(index);
+		}
+	}
 
-				return(index);
-			}
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-			index = dict_table_get_next_index(index);
-		}
+		dict_index_t*	index = dict_table_find_index_on_id(table, id);
 
-		table = UT_LIST_GET_NEXT(table_LRU, table);
+		if (index != NULL) {
+			return(index);
+		}
 	}
 
 	return(NULL);
@@ -1021,7 +1435,9 @@ dict_table_rename_in_cache(
 		memory fragmentation, we assume a repeated calls of
 		ut_realloc() with the same size do not cause fragmentation */
 		ut_a(strlen(new_name) <= MAX_FULL_NAME_LEN);
-		table->name = ut_realloc(table->name, MAX_FULL_NAME_LEN + 1);
+
+		table->name = static_cast<char*>(
+			ut_realloc(table->name, MAX_FULL_NAME_LEN + 1));
 	}
 	memcpy(table->name, new_name, strlen(new_name) + 1);
 
@@ -1113,10 +1529,11 @@ dict_table_rename_in_cache(
 				/* This is a generated >= 4.0.18 format id */
 
 				if (strlen(table->name) > strlen(old_name)) {
-					foreign->id = mem_heap_alloc(
+					foreign->id = static_cast<char*>(
+						mem_heap_alloc(
 						foreign->heap,
 						strlen(table->name)
-						+ strlen(old_id) + 1);
+						+ strlen(old_id) + 1));
 				}
 
 				/* Replace the prefix 'databasename/tablename'
@@ -1132,9 +1549,10 @@ dict_table_rename_in_cache(
 				if (dict_get_db_name_len(table->name)
 				    > dict_get_db_name_len(foreign->id)) {
 
-					foreign->id = mem_heap_alloc(
+					foreign->id = static_cast<char*>(
+						mem_heap_alloc(
 						foreign->heap,
-						db_len + strlen(old_id) + 1);
+						db_len + strlen(old_id) + 1));
 				}
 
 				/* Replace the database prefix in id with the
@@ -1201,17 +1619,22 @@ dict_table_change_id_in_cache(
 
 /**********************************************************************//**
 Removes a table object from the dictionary cache. */
-UNIV_INTERN
+static
 void
-dict_table_remove_from_cache(
-/*=========================*/
-	dict_table_t*	table)	/*!< in, own: table */
+dict_table_remove_from_cache_low(
+/*=============================*/
+	dict_table_t*	table,		/*!< in, own: table */
+	ibool		lru_evict)	/*!< in: TRUE if table being evicted
+					to make room in the table LRU list */
 {
 	dict_foreign_t*	foreign;
 	dict_index_t*	index;
 	ulint		size;
 
 	ut_ad(table);
+	ut_ad(dict_lru_validate());
+	ut_a(table->n_ref_count == 0);
+	ut_a(table->n_rec_locks == 0);
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
 
@@ -1222,40 +1645,51 @@ dict_table_remove_from_cache(
 #endif
 
 	/* Remove the foreign constraints from the cache */
-	foreign = UT_LIST_GET_LAST(table->foreign_list);
 
-	while (foreign != NULL) {
+	for (foreign = UT_LIST_GET_LAST(table->foreign_list);
+	     foreign != NULL;
+	     foreign = UT_LIST_GET_LAST(table->foreign_list)) {
+
 		dict_foreign_remove_from_cache(foreign);
-		foreign = UT_LIST_GET_LAST(table->foreign_list);
 	}
 
 	/* Reset table field in referencing constraints */
 
-	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	for (foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	     foreign != NULL;
+	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
 
-	while (foreign != NULL) {
 		foreign->referenced_table = NULL;
 		foreign->referenced_index = NULL;
-
-		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
 	}
 
 	/* Remove the indexes from the cache */
-	index = UT_LIST_GET_LAST(table->indexes);
 
-	while (index != NULL) {
-		dict_index_remove_from_cache(table, index);
-		index = UT_LIST_GET_LAST(table->indexes);
+	for (index = UT_LIST_GET_LAST(table->indexes);
+	     index != NULL;
+	     index = UT_LIST_GET_LAST(table->indexes)) {
+
+		dict_index_remove_from_cache_low(table, index, lru_evict);
 	}
 
 	/* Remove table from the hash tables of tables */
+
 	HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash,
 		    ut_fold_string(table->name), table);
+
 	HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash,
 		    ut_fold_ull(table->id), table);
 
-	/* Remove table from LRU list of tables */
-	UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+	/* Remove table from LRU or non-LRU list. */
+	if (table->can_be_evicted) {
+		ut_ad(dict_lru_find_table(table));
+		UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+	} else {
+		ut_ad(dict_non_lru_find_table(table));
+		UT_LIST_REMOVE(table_LRU, dict_sys->table_non_LRU, table);
+	}
+
+	ut_ad(dict_lru_validate());
 
 	size = mem_heap_get_size(table->heap) + strlen(table->name) + 1;
 
@@ -1266,6 +1700,17 @@ dict_table_remove_from_cache(
 	dict_mem_table_free(table);
 }
 
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table)	/*!< in, own: table */
+{
+	dict_table_remove_from_cache_low(table, FALSE);
+}
+
 /****************************************************************//**
 If the given column name is reserved for InnoDB system columns, return
 TRUE.
@@ -1298,6 +1743,11 @@ dict_col_name_is_reserved(
 	return(FALSE);
 }
 
+#if 1	/* This function is not very accurate at determining
+	whether an UNDO record will be too big. See innodb_4k.test,
+	Bug 13336585, for a testcase that shows an index that can
+	be created but cannot be updated. */
+
 /****************************************************************//**
 If an undo log record for this table might not fit on a single page,
 return TRUE.
@@ -1326,7 +1776,7 @@ dict_index_too_big_for_undo(
 		+ 10 + FIL_PAGE_DATA_END /* trx_undo_left() */
 		+ 2/* pointer to previous undo log record */;
 
-	if (UNIV_UNLIKELY(!clust_index)) {
+	if (!clust_index) {
 		ut_a(dict_index_is_clust(new_index));
 		clust_index = new_index;
 	}
@@ -1413,7 +1863,7 @@ is_ord_part:
 			/* We only store the needed prefix length in undo log */
 			if (max_prefix) {
 			     ut_ad(dict_table_get_format(table)
-				   >= DICT_TF_FORMAT_ZIP);
+				   >= UNIV_FORMAT_B);
 
 				max_size = ut_min(max_prefix, max_size);
 			}
@@ -1426,6 +1876,7 @@ is_ord_part:
 
 	return(undo_page_len >= UNIV_PAGE_SIZE);
 }
+#endif
 
 /****************************************************************//**
 If a record of this index might not fit on a single B-tree page,
@@ -1613,7 +2064,9 @@ dict_index_add_to_cache(
 	/* Build the cache internal representation of the index,
 	containing also the added system fields */
 
-	if (dict_index_is_clust(index)) {
+	if (index->type == DICT_FTS) {
+		new_index = dict_index_build_internal_fts(table, index);
+	} else if (dict_index_is_clust(index)) {
 		new_index = dict_index_build_internal_clust(table, index);
 	} else {
 		new_index = dict_index_build_internal_non_clust(table, index);
@@ -1631,21 +2084,27 @@ too_big:
 		return(DB_TOO_BIG_RECORD);
 	}
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		n_ord = new_index->n_fields;
 	} else {
 		n_ord = new_index->n_uniq;
 	}
 
+#if 1	/* The following code predetermines whether to call
+	dict_index_too_big_for_undo().  This function is not
+	accurate. See innodb_4k.test, Bug 13336585, for a
+	testcase that shows an index that can be created but
+	cannot be updated. */
+
 	switch (dict_table_get_format(table)) {
-	case DICT_TF_FORMAT_51:
+	case UNIV_FORMAT_A:
 		/* ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT store
 		prefixes of externally stored columns locally within
 		the record.  There are no special considerations for
 		the undo log record size. */
 		goto undo_size_ok;
 
-	case DICT_TF_FORMAT_ZIP:
+	case UNIV_FORMAT_B:
 		/* In ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED,
 		column prefix indexes require that prefixes of
 		externally stored columns are written to the undo log.
@@ -1655,8 +2114,8 @@ too_big:
 		checked for below. */
 		break;
 
-#if DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX
-# error "DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX"
+#if UNIV_FORMAT_B != UNIV_FORMAT_MAX
+# error "UNIV_FORMAT_B != UNIV_FORMAT_MAX"
 #endif
 	}
 
@@ -1695,6 +2154,7 @@ too_big:
 	}
 
 undo_size_ok:
+#endif
 	/* Flag the ordering columns and also set column max_prefix */
 
 	for (i = 0; i < n_ord; i++) {
@@ -1724,17 +2184,25 @@ undo_size_ok:
 		       dict_index_is_ibuf(index)
 		       ? SYNC_IBUF_INDEX_TREE : SYNC_INDEX_TREE);
 
-	if (!UNIV_UNLIKELY(new_index->type & DICT_UNIVERSAL)) {
+	if (!dict_index_is_univ(new_index)) {
+
+		new_index->stat_n_diff_key_vals =
+			static_cast<ib_uint64_t*>(mem_heap_alloc(
+			new_index->heap,
+			(1 + dict_index_get_n_unique(new_index))
+			* sizeof(*new_index->stat_n_diff_key_vals)));
 
-		new_index->stat_n_diff_key_vals = mem_heap_alloc(
+		new_index->stat_n_sample_sizes =
+			static_cast<ib_uint64_t*>(mem_heap_alloc(
 			new_index->heap,
 			(1 + dict_index_get_n_unique(new_index))
-			* sizeof(ib_int64_t));
+			* sizeof(*new_index->stat_n_sample_sizes)));
 
-		new_index->stat_n_non_null_key_vals = mem_heap_zalloc(
+		new_index->stat_n_non_null_key_vals =
+			static_cast<ib_uint64_t*>(mem_heap_zalloc(
 			new_index->heap,
 			(1 + dict_index_get_n_unique(new_index))
-			* sizeof(*new_index->stat_n_non_null_key_vals));
+			* sizeof(*new_index->stat_n_non_null_key_vals)));
 
 		/* Give some sensible values to stat_n_... in case we do
 		not calculate statistics quickly enough */
@@ -1742,6 +2210,7 @@ undo_size_ok:
 		for (i = 0; i <= dict_index_get_n_unique(new_index); i++) {
 
 			new_index->stat_n_diff_key_vals[i] = 100;
+			new_index->stat_n_sample_sizes[i] = 0;
 		}
 	}
 
@@ -1754,12 +2223,14 @@ undo_size_ok:
 
 /**********************************************************************//**
 Removes an index from the dictionary cache. */
-UNIV_INTERN
+static
 void
-dict_index_remove_from_cache(
-/*=========================*/
-	dict_table_t*	table,	/*!< in/out: table */
-	dict_index_t*	index)	/*!< in, own: index */
+dict_index_remove_from_cache_low(
+/*=============================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	dict_index_t*	index,		/*!< in, own: index */
+	ibool		lru_evict)	/*!< in: TRUE if index being evicted
+					to make room in the table LRU list */
 {
 	ulint		size;
 	ulint		retries = 0;
@@ -1776,16 +2247,17 @@ dict_index_remove_from_cache(
 	ut_ad(info);
 
 	/* We are not allowed to free the in-memory index struct
- 	dict_index_t until all entries in the adaptive hash index
+	dict_index_t until all entries in the adaptive hash index
 	that point to any of the page belonging to his b-tree index
 	are dropped. This is so because dropping of these entries
 	require access to dict_index_t struct. To avoid such scenario
 	We keep a count of number of such pages in the search_info and
 	only free the dict_index_t struct when this count drops to
-	zero. */
+	zero. See also: dict_table_can_be_evicted() */
 
-	for (;;) {
+	do {
 		ulint ref_count = btr_search_info_get_ref_count(info);
+
 		if (ref_count == 0) {
 			break;
 		}
@@ -1813,7 +2285,7 @@ dict_index_remove_from_cache(
 		if (retries >= 60000) {
 			ut_error;
 		}
-	}
+	} while (srv_shutdown_state == SRV_SHUTDOWN_NONE || !lru_evict);
 
 	rw_lock_free(&index->lock);
 
@@ -1829,6 +2301,18 @@ dict_index_remove_from_cache(
 	dict_mem_index_free(index);
 }
 
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_index_t*	index)	/*!< in, own: index */
+{
+	dict_index_remove_from_cache_low(table, index, FALSE);
+}
+
 /*******************************************************************//**
 Tries to find column names for the index and sets the col field of the
 index.
@@ -1963,7 +2447,7 @@ dict_index_copy_types(
 {
 	ulint		i;
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		dtuple_set_types_binary(tuple, n_fields);
 
 		return;
@@ -2002,6 +2486,33 @@ dict_table_copy_types(
 	}
 }
 
+/********************************************************************
+Wait until all the background threads of the given table have exited, i.e.,
+bg_threads == 0. Note: bg_threads_mutex must be reserved when
+calling this. */
+UNIV_INTERN
+void
+dict_table_wait_for_bg_threads_to_exit(
+/*===================================*/
+	dict_table_t*	table,	/*< in: table */
+	ulint		delay)	/*< in: time in microseconds to wait between
+				checks of bg_threads. */
+{
+	fts_t*		fts = table->fts;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&fts->bg_threads_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+	while (fts->bg_threads > 0) {
+		mutex_exit(&fts->bg_threads_mutex);
+
+		os_thread_sleep(delay);
+
+		mutex_enter(&fts->bg_threads_mutex);
+	}
+}
+
 /*******************************************************************//**
 Builds the internal dictionary cache representation for a clustered
 index, containing also system fields not defined by the user.
@@ -2042,7 +2553,7 @@ dict_index_build_internal_clust(
 	/* Copy the fields of index */
 	dict_index_copy(new_index, index, table, 0, index->n_fields);
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		/* No fixed number of fields determines an entry uniquely */
 
 		new_index->n_uniq = REC_MAX_N_FIELDS;
@@ -2116,7 +2627,8 @@ dict_index_build_internal_clust(
 	}
 
 	/* Remember the table columns already contained in new_index */
-	indexed = mem_zalloc(table->n_cols * sizeof *indexed);
+	indexed = static_cast<ibool*>(
+		mem_zalloc(table->n_cols * sizeof *indexed));
 
 	/* Mark the table columns already contained in new_index */
 	for (i = 0; i < new_index->n_def; i++) {
@@ -2182,7 +2694,7 @@ dict_index_build_internal_non_clust(
 
 	ut_ad(clust_index);
 	ut_ad(dict_index_is_clust(clust_index));
-	ut_ad(!(clust_index->type & DICT_UNIVERSAL));
+	ut_ad(!dict_index_is_univ(clust_index));
 
 	/* Create a new index */
 	new_index = dict_mem_index_create(
@@ -2200,7 +2712,8 @@ dict_index_build_internal_non_clust(
 	dict_index_copy(new_index, index, table, 0, index->n_fields);
 
 	/* Remember the table columns already contained in new_index */
-	indexed = mem_zalloc(table->n_cols * sizeof *indexed);
+	indexed = static_cast<ibool*>(
+		mem_zalloc(table->n_cols * sizeof *indexed));
 
 	/* Mark the table columns already contained in new_index */
 	for (i = 0; i < new_index->n_def; i++) {
@@ -2247,6 +2760,54 @@ dict_index_build_internal_non_clust(
 	return(new_index);
 }
 
+/***********************************************************************
+Builds the internal dictionary cache representation for an FTS index.
+@return	own: the internal representation of the FTS index */
+static
+dict_index_t*
+dict_index_build_internal_fts(
+/*==========================*/
+	dict_table_t*	table,	/*!< in: table */
+	dict_index_t*	index)	/*!< in: user representation of an FTS index */
+{
+	dict_index_t*	new_index;
+
+	ut_ad(table && index);
+	ut_ad(index->type == DICT_FTS);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	/* Create a new index */
+	new_index = dict_mem_index_create(
+		table->name, index->name, index->space, index->type,
+		index->n_fields);
+
+	/* Copy other relevant data from the old index struct to the new
+	struct: it inherits the values */
+
+	new_index->n_user_defined_cols = index->n_fields;
+
+	new_index->id = index->id;
+
+	/* Copy fields from index to new_index */
+	dict_index_copy(new_index, index, table, 0, index->n_fields);
+
+	new_index->n_uniq = 0;
+	new_index->cached = TRUE;
+
+	if (table->fts->cache == NULL) {
+		table->fts->cache = fts_cache_create(table);
+	}
+
+	rw_lock_x_lock(&table->fts->cache->init_lock);
+	/* Notify the FTS cache about this index. */
+	fts_cache_index_cache_create(table, new_index);
+	rw_lock_x_unlock(&table->fts->cache->init_lock);
+
+	return(new_index);
+}
 /*====================== FOREIGN KEY PROCESSING ========================*/
 
 /*********************************************************************//**
@@ -2331,8 +2892,6 @@ dict_foreign_free(
 /*==============*/
 	dict_foreign_t*	foreign)	/*!< in, own: foreign key struct */
 {
-	ut_a(foreign->foreign_table->n_foreign_key_checks_running == 0);
-
 	mem_heap_free(foreign->heap);
 }
 
@@ -2430,7 +2989,8 @@ dict_foreign_find_index(
 	while (index != NULL) {
 		/* Ignore matches that refer to the same instance
 		or the index is to be dropped */
-		if (index->to_be_dropped || types_idx == index) {
+		if (index->to_be_dropped || types_idx == index
+		    || index->type & DICT_FTS) {
 
 			goto next_rec;
 
@@ -2732,6 +3292,19 @@ dict_foreign_add_to_cache(
 				 for_in_cache);
 	}
 
+	/* We need to move the table to the non-LRU end of the table LRU
+	list. Otherwise it will be evicted from the cache. */
+
+	if (ref_table != NULL && ref_table->can_be_evicted) {
+		dict_table_move_from_lru_to_non_lru(ref_table);
+	}
+
+	if (for_table != NULL && for_table->can_be_evicted) {
+		dict_table_move_from_lru_to_non_lru(for_table);
+	}
+
+	ut_ad(dict_lru_validate());
+
 	return(DB_SUCCESS);
 }
 
@@ -2893,7 +3466,10 @@ dict_scan_id(
 
 	if (quote) {
 		char*	d;
-		str = d = mem_heap_alloc(heap, len + 1);
+
+		str = d = static_cast<char*>(
+			mem_heap_alloc(heap, len + 1));
+
 		while (len--) {
 			if ((*d++ = *s++) == quote) {
 				s++;
@@ -2912,21 +3488,21 @@ convert_id:
 		/* Convert the identifier from connection character set
 		to UTF-8. */
 		len = 3 * len + 1;
-		*id = dst = mem_heap_alloc(heap, len);
+		*id = dst = static_cast<char*>(mem_heap_alloc(heap, len));
 
 		innobase_convert_from_id(cs, dst, str, len);
 	} else if (!strncmp(str, srv_mysql50_table_name_prefix,
-			    sizeof srv_mysql50_table_name_prefix)) {
+			    sizeof(srv_mysql50_table_name_prefix) - 1)) {
 		/* This is a pre-5.1 table name
 		containing chars other than [A-Za-z0-9].
 		Discard the prefix and use raw UTF-8 encoding. */
-		str += sizeof srv_mysql50_table_name_prefix;
-		len -= sizeof srv_mysql50_table_name_prefix;
+		str += sizeof(srv_mysql50_table_name_prefix) - 1;
+		len -= sizeof(srv_mysql50_table_name_prefix) - 1;
 		goto convert_id;
 	} else {
 		/* Encode using filename-safe characters. */
 		len = 5 * len + 1;
-		*id = dst = mem_heap_alloc(heap, len);
+		*id = dst = static_cast<char*>(mem_heap_alloc(heap, len));
 
 		innobase_convert_from_table_id(cs, dst, str, len);
 	}
@@ -3064,7 +3640,9 @@ dict_scan_table_name(
 	table_name_len = strlen(table_name);
 
 	/* Copy database_name, '/', table_name, '\0' */
-	ref = mem_heap_alloc(heap, database_name_len + table_name_len + 2);
+	ref = static_cast<char*>(
+		mem_heap_alloc(heap, database_name_len + table_name_len + 2));
+
 	memcpy(ref, database_name, database_name_len);
 	ref[database_name_len] = '/';
 	memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
@@ -3142,7 +3720,7 @@ dict_strip_comments(
 	/* unclosed quote character (0 if none) */
 	char		quote	= 0;
 
-	str = mem_alloc(sql_length + 1);
+	str = static_cast<char*>(mem_alloc(sql_length + 1));
 
 	sptr = sql_string;
 	ptr = str;
@@ -3571,8 +4149,8 @@ col_loop1:
 
 		db_len = dict_get_db_name_len(table->name);
 
-		foreign->id = mem_heap_alloc(
-			foreign->heap, db_len + strlen(constraint_name) + 2);
+		foreign->id = static_cast<char*>(mem_heap_alloc(
+			foreign->heap, db_len + strlen(constraint_name) + 2));
 
 		ut_memcpy(foreign->id, table->name, db_len);
 		foreign->id[db_len] = '/';
@@ -3586,8 +4164,10 @@ col_loop1:
 
 	foreign->foreign_index = index;
 	foreign->n_fields = (unsigned int) i;
-	foreign->foreign_col_names = mem_heap_alloc(foreign->heap,
-						    i * sizeof(void*));
+
+	foreign->foreign_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap, i * sizeof(void*)));
+
 	for (i = 0; i < foreign->n_fields; i++) {
 		foreign->foreign_col_names[i] = mem_heap_strdup(
 			foreign->heap,
@@ -3844,8 +4424,9 @@ try_find_index:
 		foreign->heap, referenced_table_name);
 	dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
 
-	foreign->referenced_col_names = mem_heap_alloc(foreign->heap,
-						       i * sizeof(void*));
+	foreign->referenced_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap, i * sizeof(void*)));
+
 	for (i = 0; i < foreign->n_fields; i++) {
 		foreign->referenced_col_names[i]
 			= mem_heap_strdup(foreign->heap, column_names[i]);
@@ -3863,6 +4444,23 @@ try_find_index:
 
 	goto loop;
 }
+/**************************************************************************
+Determines whether a string starts with the specified keyword.
+@return	TRUE if str starts with keyword */
+UNIV_INTERN
+ibool
+dict_str_starts_with_keyword(
+/*=========================*/
+	void*		mysql_thd,	/*!< in: MySQL thread handle */
+	const char*	str,		/*!< in: string to scan for keyword */
+	const char*	keyword)	/*!< in: keyword to look for */
+{
+	struct charset_info_st*	cs = innobase_get_charset(mysql_thd);
+	ibool			success;
+
+	dict_accept(cs, str, keyword, &success);
+	return(success);
+}
 
 /*********************************************************************//**
 Scans a table create SQL string and adds to the data dictionary the foreign
@@ -3945,7 +4543,8 @@ dict_foreign_parse_drop_constraints(
 
 	*n = 0;
 
-	*constraints_to_drop = mem_heap_alloc(heap, 1000 * sizeof(char*));
+	*constraints_to_drop = static_cast<const char**>(
+		mem_heap_alloc(heap, 1000 * sizeof(char*)));
 
 	ptr = innobase_get_stmt(trx->mysql_thd, &len);
 
@@ -4133,7 +4732,7 @@ dict_index_build_node_ptr(
 	byte*		buf;
 	ulint		n_unique;
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		/* In a universal index tree, we take the whole record as
 		the node pointer if the record is on the leaf level,
 		on non-leaf levels we remove the last field, which
@@ -4162,7 +4761,7 @@ dict_index_build_node_ptr(
 
 	dict_index_copy_types(tuple, index, n_unique);
 
-	buf = mem_heap_alloc(heap, 4);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	mach_write_to_4(buf, page_no);
 
@@ -4200,7 +4799,7 @@ dict_index_copy_rec_order_prefix(
 
 	UNIV_PREFETCH_R(rec);
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		ut_a(!dict_table_is_comp(index->table));
 		n = rec_get_n_fields_old(rec);
 	} else {
@@ -4290,127 +4889,6 @@ dict_index_calc_min_rec_len(
 	return(sum);
 }
 
-/*********************************************************************//**
-Calculates new estimates for table and index statistics. The statistics
-are used in query optimization. */
-UNIV_INTERN
-void
-dict_update_statistics(
-/*===================*/
-	dict_table_t*	table,		/*!< in/out: table */
-	ibool		only_calc_if_missing_stats)/*!< in: only
-					update/recalc the stats if they have
-					not been initialized yet, otherwise
-					do nothing */
-{
-	dict_index_t*	index;
-	ulint		sum_of_index_sizes	= 0;
-
-	if (table->ibd_file_missing) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: cannot calculate statistics for table %s\n"
-			"InnoDB: because the .ibd file is missing.  For help,"
-			" please refer to\n"
-			"InnoDB: " REFMAN "innodb-troubleshooting.html\n",
-			table->name);
-
-		return;
-	}
-
-	/* Find out the sizes of the indexes and how many different values
-	for the key they approximately have */
-
-	index = dict_table_get_first_index(table);
-
-	if (index == NULL) {
-		/* Table definition is corrupt */
-
-		return;
-	}
-
-	dict_table_stats_lock(table, RW_X_LATCH);
-
-	if (only_calc_if_missing_stats && table->stat_initialized) {
-		dict_table_stats_unlock(table, RW_X_LATCH);
-		return;
-	}
-
-	do {
-		if (UNIV_LIKELY
-		    (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE
-		     || (srv_force_recovery < SRV_FORCE_NO_LOG_REDO
-			 && dict_index_is_clust(index)))) {
-			mtr_t	mtr;
-			ulint	size;
-
-			mtr_start(&mtr);
-			mtr_s_lock(dict_index_get_lock(index), &mtr);
-
-			size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
-
-			if (size != ULINT_UNDEFINED) {
-				sum_of_index_sizes += size;
-				index->stat_index_size = size;
-				size = btr_get_size(
-					index, BTR_N_LEAF_PAGES, &mtr);
-			}
-
-			mtr_commit(&mtr);
-
-			switch (size) {
-			case ULINT_UNDEFINED:
-				goto fake_statistics;
-			case 0:
-				/* The root node of the tree is a leaf */
-				size = 1;
-			}
-
-			index->stat_n_leaf_pages = size;
-
-			btr_estimate_number_of_different_key_vals(index);
-		} else {
-			/* If we have set a high innodb_force_recovery
-			level, do not calculate statistics, as a badly
-			corrupted index can cause a crash in it.
-			Initialize some bogus index cardinality
-			statistics, so that the data can be queried in
-			various means, also via secondary indexes. */
-			ulint	i;
-
-fake_statistics:
-			sum_of_index_sizes++;
-			index->stat_index_size = index->stat_n_leaf_pages = 1;
-
-			for (i = dict_index_get_n_unique(index); i; ) {
-				index->stat_n_diff_key_vals[i--] = 1;
-			}
-
-			memset(index->stat_n_non_null_key_vals, 0,
-			       (1 + dict_index_get_n_unique(index))
-                               * sizeof(*index->stat_n_non_null_key_vals));
-		}
-
-		index = dict_table_get_next_index(index);
-	} while (index);
-
-	index = dict_table_get_first_index(table);
-
-	table->stat_n_rows = index->stat_n_diff_key_vals[
-		dict_index_get_n_unique(index)];
-
-	table->stat_clustered_index_size = index->stat_index_size;
-
-	table->stat_sum_of_other_index_sizes = sum_of_index_sizes
-		- index->stat_index_size;
-
-	table->stat_initialized = TRUE;
-
-	table->stat_modified_counter = 0;
-
-	dict_table_stats_unlock(table, RW_X_LATCH);
-}
-
 /**********************************************************************//**
 Prints info of a foreign key constraint. */
 static
@@ -4488,7 +4966,7 @@ dict_table_print_low(
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
-	dict_update_statistics(table, FALSE /* update even if initialized */);
+	dict_stats_update(table, DICT_STATS_FETCH, TRUE);
 
 	dict_table_stats_lock(table, RW_S_LATCH);
 
@@ -4825,6 +5303,189 @@ dict_index_name_print(
 	fputs(" of table ", file);
 	ut_print_name(file, trx, TRUE, index->table_name);
 }
+
+/**********************************************************************//**
+Find a table in dict_sys->table_LRU list with specified space id
+@return table if found, NULL if not */
+static
+dict_table_t*
+dict_find_table_by_space(
+/*=====================*/
+	ulint	space_id)		/*!< in: space ID */
+{
+	dict_table_t*   table;
+	ulint		num_item;
+	ulint		count = 0;
+
+	ut_ad(space_id > 0);
+
+	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	num_item =  UT_LIST_GET_LEN(dict_sys->table_LRU);
+
+	/* This function intentionally does not acquire mutex as it is used
+	by error handling code in deep call stack as last means to avoid
+	killing the server, so it worth to risk some consequencies for
+	the action. */
+	while (table && count < num_item) {
+		if (table->space == space_id) {
+			return(table);
+		}
+
+		table = UT_LIST_GET_NEXT(table_LRU, table);
+		count++;
+	}
+
+	return(NULL);
+}
+
+/**********************************************************************//**
+Flags a table with specified space_id corrupted in the data dictionary
+cache
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+dict_set_corrupted_by_space(
+/*========================*/
+	ulint	space_id)		/*!< in: space ID */
+{
+	dict_table_t*   table;
+
+	table = dict_find_table_by_space(space_id);
+
+	if (!table) {
+		return(FALSE);
+	}
+
+	/* mark the table->corrupted bit only, since the caller
+	could be too deep in the stack for SYS_INDEXES update */
+	table->corrupted = TRUE;
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Flags an index corrupted both in the data dictionary cache
+and in the SYS_INDEXES */
+UNIV_INTERN
+void
+dict_set_corrupted(
+/*===============*/
+	dict_index_t*	index)		/*!< in/out: index */
+{
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+	dict_index_t*	sys_index;
+	dtuple_t*	tuple;
+	dfield_t*	dfield;
+	byte*		buf;
+	const char*	status;
+	btr_cur_t	cursor;
+
+	ut_ad(index);
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
+
+#ifdef UNIV_SYNC_DEBUG
+        ut_ad(sync_thread_levels_empty_except_dict());
+#endif
+
+	/* Mark the table as corrupted only if the clustered index
+	is corrupted */
+	if (dict_index_is_clust(index)) {
+		index->table->corrupted = TRUE;
+	}
+
+	if (index->type & DICT_CORRUPT) {
+		/* The index was already flagged corrupted. */
+		ut_ad(!dict_index_is_clust(index) || index->table->corrupted);
+		return;
+	}
+
+	heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t)
+			       + sizeof(que_fork_t) + sizeof(upd_node_t)
+			       + sizeof(upd_t) + 12));
+	mtr_start(&mtr);
+	index->type |= DICT_CORRUPT;
+
+	sys_index = UT_LIST_GET_FIRST(dict_sys->sys_indexes->indexes);
+
+	/* Find the index row in SYS_INDEXES */
+	tuple = dtuple_create(heap, 2);
+
+	dfield = dtuple_get_nth_field(tuple, 0);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+	mach_write_to_8(buf, index->table->id);
+	dfield_set_data(dfield, buf, 8);
+
+	dfield = dtuple_get_nth_field(tuple, 1);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+	mach_write_to_8(buf, index->id);
+	dfield_set_data(dfield, buf, 8);
+
+	dict_index_copy_types(tuple, sys_index, 2);
+
+	btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_GE,
+				    BTR_MODIFY_LEAF,
+				    &cursor, 0, __FILE__, __LINE__, &mtr);
+
+	if (cursor.up_match == dtuple_get_n_fields(tuple)) {
+		/* UPDATE SYS_INDEXES SET TYPE=index->type
+		WHERE TABLE_ID=index->table->id AND INDEX_ID=index->id */
+		ulint	len;
+		byte*	field	= rec_get_nth_field_old(
+			btr_cur_get_rec(&cursor),
+			DICT_FLD__SYS_INDEXES__TYPE, &len);
+		if (len != 4) {
+			goto fail;
+		}
+		mlog_write_ulint(field, index->type, MLOG_4BYTES, &mtr);
+		status = "  InnoDB: Flagged corruption of ";
+	} else {
+fail:
+		status = "  InnoDB: Unable to flag corruption of ";
+	}
+
+	mtr_commit(&mtr);
+	mem_heap_free(heap);
+
+	ut_print_timestamp(stderr);
+	fputs(status, stderr);
+	dict_index_name_print(stderr, NULL, index);
+	putc('\n', stderr);
+}
+
+/**********************************************************************//**
+Flags an index corrupted in the data dictionary cache only. This
+is used mostly to mark a corrupted index when index's own dictionary
+is corrupted, and we force to load such index for repair purpose */
+UNIV_INTERN
+void
+dict_set_corrupted_index_cache_only(
+/*================================*/
+	dict_index_t*	index,		/*!< in/out: index */
+	dict_table_t*	table)		/*!< in/out: table */
+{
+	ut_ad(index);
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
+
+	/* Mark the table as corrupted only if the clustered index
+	is corrupted */
+	if (dict_index_is_clust(index)) {
+		dict_table_t*	corrupt_table;
+
+		corrupt_table = table ? table : index->table;
+		ut_ad(!index->table || !table || index->table  == table);
+
+		if (corrupt_table) {
+			corrupt_table->corrupted = TRUE;
+		}
+	}
+
+	index->type |= DICT_CORRUPT;
+}
 #endif /* !UNIV_HOTBACKUP */
 
 /**********************************************************************//**
@@ -4837,7 +5498,7 @@ dict_ind_init(void)
 	dict_table_t*		table;
 
 	/* create dummy table and index for REDUNDANT infimum and supremum */
-	table = dict_mem_table_create("SYS_DUMMY1", DICT_HDR_SPACE, 1, 0);
+	table = dict_mem_table_create("SYS_DUMMY1", DICT_HDR_SPACE, 1, 0, 0);
 	dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR,
 			       DATA_ENGLISH | DATA_NOT_NULL, 8);
 
@@ -4846,9 +5507,11 @@ dict_ind_init(void)
 	dict_index_add_col(dict_ind_redundant, table,
 			   dict_table_get_nth_col(table, 0), 0);
 	dict_ind_redundant->table = table;
+
 	/* create dummy table and index for COMPACT infimum and supremum */
 	table = dict_mem_table_create("SYS_DUMMY2",
-				      DICT_HDR_SPACE, 1, DICT_TF_COMPACT);
+				      DICT_HDR_SPACE, 1,
+				      DICT_TF_COMPACT, 0);
 	dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR,
 			       DATA_ENGLISH | DATA_NOT_NULL, 8);
 	dict_ind_compact = dict_mem_index_create("SYS_DUMMY2", "SYS_DUMMY2",
@@ -4861,6 +5524,7 @@ dict_ind_init(void)
 	dict_ind_redundant->cached = dict_ind_compact->cached = TRUE;
 }
 
+#ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Frees dict_ind_redundant and dict_ind_compact. */
 static
@@ -4881,7 +5545,6 @@ dict_ind_free(void)
 	dict_mem_table_free(table);
 }
 
-#ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Get index by name
 @return	index, NULL if does not exist */
@@ -4894,10 +5557,15 @@ dict_table_get_index_on_name(
 {
 	dict_index_t*	index;
 
+	/* If name is NULL, just return */
+	if (!name) {
+		return NULL;
+	}
+
 	index = dict_table_get_first_index(table);
 
 	while (index != NULL) {
-		if (ut_strcmp(index->name, name) == 0) {
+		if (innobase_strcasecmp(index->name, name) == 0) {
 
 			return(index);
 		}
@@ -4931,7 +5599,7 @@ dict_table_replace_index_in_foreign_list(
 				= dict_foreign_find_equiv_index(foreign);
 
 			/* There must exist an alternative index if
-			check_foreigns (FOREIGN_KEY_CHECKS) is on, 
+			check_foreigns (FOREIGN_KEY_CHECKS) is on,
 			since ha_innobase::prepare_drop_index had done
 			the check before we reach here. */
 
@@ -5018,7 +5686,158 @@ dict_table_check_for_dup_indexes(
 }
 #endif /* UNIV_DEBUG */
 
-/**************************************************************************
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The table must have the same number of columns with the same names and
+types. The order of the columns does not matter.
+The caller must own the dictionary mutex.
+dict_table_schema_check() @{
+@return DB_SUCCESS if the table exists and contains the necessary columns */
+UNIV_INTERN
+enum db_err
+dict_table_schema_check(
+/*====================*/
+	dict_table_schema_t*	req_schema,	/*!< in/out: required table
+						schema */
+	char*			errstr,		/*!< out: human readable error
+						message if != DB_SUCCESS and
+						!= DB_TABLE_NOT_FOUND is
+						returned */
+	size_t			errstr_sz)	/*!< in: errstr size */
+{
+	dict_table_t*	table;
+	ulint		i;
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	table = dict_table_get_low(req_schema->table_name);
+
+	if (table == NULL || table->ibd_file_missing) {
+		/* no such table or missing tablespace */
+
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	if ((ulint) table->n_def - DATA_N_SYS_COLS != req_schema->n_cols) {
+		/* the table has a different number of columns than
+		required */
+
+		ut_snprintf(errstr, errstr_sz,
+			    "%s has %d columns but should have %lu.",
+			    req_schema->table_name,
+			    table->n_def - DATA_N_SYS_COLS,
+			    req_schema->n_cols);
+
+		return(DB_ERROR);
+	}
+
+	/* For each column from req_schema->columns[] search
+	whether it is present in table->cols[].
+	The following algorithm is O(n_cols^2), but is optimized to
+	be O(n_cols) if the columns are in the same order in both arrays. */
+
+	for (i = 0; i < req_schema->n_cols; i++) {
+		ulint	j;
+
+		char	req_type[64];
+		char	actual_type[64];
+
+		/* check if i'th column is the same in both arrays */
+		if (innobase_strcasecmp(req_schema->columns[i].name,
+			       dict_table_get_col_name(table, i)) == 0) {
+
+			/* we found the column in table->cols[] quickly */
+			j = i;
+		} else {
+
+			/* columns in both arrays are not in the same order,
+			do a full scan of the second array */
+			for (j = 0; j < table->n_def; j++) {
+				const char*	name;
+
+				name = dict_table_get_col_name(table, j);
+
+				if (innobase_strcasecmp(name,
+					req_schema->columns[i].name) == 0) {
+
+					/* found the column on j'th
+					position */
+					break;
+				}
+			}
+
+			if (j == table->n_def) {
+
+				ut_snprintf(errstr, errstr_sz,
+					    "required column %s.%s not found.",
+					    req_schema->table_name,
+					    req_schema->columns[i].name);
+
+				return(DB_ERROR);
+			}
+		}
+
+		/* we found a column with the same name on j'th position,
+		compare column types and flags */
+
+		dtype_sql_name(req_schema->columns[i].mtype,
+			       req_schema->columns[i].prtype_mask,
+			       req_schema->columns[i].len,
+			       req_type, sizeof(req_type));
+
+		dtype_sql_name(table->cols[j].mtype,
+			       table->cols[j].prtype,
+			       table->cols[j].len,
+			       actual_type, sizeof(actual_type));
+
+		/* check length for exact match */
+		if (req_schema->columns[i].len != table->cols[j].len) {
+
+			ut_snprintf(errstr, errstr_sz,
+				    "Column %s.%s is %s but should be %s "
+				    "(length mismatch).",
+				    req_schema->table_name,
+				    req_schema->columns[i].name,
+				    actual_type, req_type);
+
+			return(DB_ERROR);
+		}
+
+		/* check mtype for exact match */
+		if (req_schema->columns[i].mtype != table->cols[j].mtype) {
+
+			ut_snprintf(errstr, errstr_sz,
+				    "Column %s.%s is %s but should be %s "
+				    "(type mismatch).",
+				    req_schema->table_name,
+				    req_schema->columns[i].name,
+				    actual_type, req_type);
+
+			return(DB_ERROR);
+		}
+
+		/* check whether required prtype mask is set */
+		if (req_schema->columns[i].prtype_mask != 0
+		    && (table->cols[j].prtype
+			& req_schema->columns[i].prtype_mask)
+		       != req_schema->columns[i].prtype_mask) {
+
+			ut_snprintf(errstr, errstr_sz,
+				    "Column %s.%s is %s but should be %s "
+				    "(flags mismatch).",
+				    req_schema->table_name,
+				    req_schema->columns[i].name,
+				    actual_type, req_type);
+
+			return(DB_ERROR);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+/* @} */
+
+/**********************************************************************//**
 Closes the data dictionary module. */
 UNIV_INTERN
 void
@@ -5032,12 +5851,14 @@ dict_close(void)
 	for (i = 0; i < hash_get_n_cells(dict_sys->table_hash); i++) {
 		dict_table_t*	table;
 
-		table = HASH_GET_FIRST(dict_sys->table_hash, i);
+		table = static_cast<dict_table_t*>(
+			HASH_GET_FIRST(dict_sys->table_hash, i));
 
 		while (table) {
 			dict_table_t*	prev_table = table;
 
-			table = HASH_GET_NEXT(name_hash, prev_table);
+			table = static_cast<dict_table_t*>(
+				HASH_GET_NEXT(name_hash, prev_table));
 #ifdef UNIV_DEBUG
 			ut_a(prev_table->magic_n == DICT_TABLE_MAGIC_N);
 #endif
@@ -5073,186 +5894,90 @@ dict_close(void)
 	}
 }
 
+# ifdef UNIV_DEBUG
 /**********************************************************************//**
-Find a table in dict_sys->table_LRU list with specified space id
-@return table if found, NULL if not */
-static
-dict_table_t*
-dict_find_table_by_space(
-/*=====================*/
-	ulint	space_id)		/*!< in: space ID */
+Validate the dictionary table LRU list.
+@return TRUE if valid  */
+UNIV_INTERN
+ibool
+dict_lru_validate(void)
+/*===================*/
 {
-	dict_table_t*   table;
-	ulint		num_item;
-	ulint		count = 0;
+	dict_table_t*	table;
 
-	ut_ad(space_id > 0);
+	ut_ad(mutex_own(&dict_sys->mutex));
 
-	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
-	num_item =  UT_LIST_GET_LEN(dict_sys->table_LRU);
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-	/* This function intentionally does not acquire mutex as it is used
-	by error handling code in deep call stack as last means to avoid
-	killing the server, so it worth to risk some consequencies for
-	the action. */
-	while (table && count < num_item) {
-		if (table->space == space_id) {
-			return(table);
-		}
-
-		table = UT_LIST_GET_NEXT(table_LRU, table);
-		count++;
+		ut_a(table->can_be_evicted);
 	}
 
-	return(NULL);
-}
-
-/**********************************************************************//**
-Flags a table with specified space_id corrupted in the data dictionary
-cache
-@return TRUE if successful */
-UNIV_INTERN
-ibool
-dict_set_corrupted_by_space(
-/*========================*/
-	ulint	space_id)		/*!< in: space ID */
-{
-	dict_table_t*   table;
-
-	table = dict_find_table_by_space(space_id);
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-	if (!table) {
-		return(FALSE);
+		ut_a(!table->can_be_evicted);
 	}
 
-	/* mark the table->corrupted bit only, since the caller
-	could be too deep in the stack for SYS_INDEXES update */
-	table->corrupted = TRUE;
-
 	return(TRUE);
 }
 
 /**********************************************************************//**
-Flags an index corrupted both in the data dictionary cache
-and in the SYS_INDEXES */
+Check if a table exists in the dict table LRU list.
+@return TRUE if table found in LRU list */
 UNIV_INTERN
-void
-dict_set_corrupted(
-/*===============*/
-	dict_index_t*	index)		/*!< in/out: index */
+ibool
+dict_lru_find_table(
+/*================*/
+	const dict_table_t*	find_table)	/*!< in: table to find */
 {
-	mem_heap_t*	heap;
-	mtr_t		mtr;
-	dict_index_t*	sys_index;
-	dtuple_t*	tuple;
-	dfield_t*	dfield;
-	byte*		buf;
-	const char*	status;
-	btr_cur_t	cursor;
+	dict_table_t*		table;
 
-	ut_ad(index);
+	ut_ad(find_table != NULL);
 	ut_ad(mutex_own(&dict_sys->mutex));
-	ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
-	ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
 
-#ifdef UNIV_SYNC_DEBUG
-        ut_ad(sync_thread_levels_empty_except_dict());
-#endif
-
-	/* Mark the table as corrupted only if the clustered index
-	is corrupted */
-	if (dict_index_is_clust(index)) {
-		index->table->corrupted = TRUE;
-	}
-
-	if (UNIV_UNLIKELY(dict_index_is_corrupted(index))) {
-		/* The index was already flagged corrupted. */
-		ut_ad(index->table->corrupted);
-		return;
-	}
-
-	heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t)
-			       + sizeof(que_fork_t) + sizeof(upd_node_t)
-			       + sizeof(upd_t) + 12));
-	mtr_start(&mtr);
-	index->type |= DICT_CORRUPT;
-
-	sys_index = UT_LIST_GET_FIRST(dict_sys->sys_indexes->indexes);
-
-	/* Find the index row in SYS_INDEXES */
-	tuple = dtuple_create(heap, 2);
-
-	dfield = dtuple_get_nth_field(tuple, 0);
-	buf = mem_heap_alloc(heap, 8);
-	mach_write_to_8(buf, index->table->id);
-	dfield_set_data(dfield, buf, 8);
-
-	dfield = dtuple_get_nth_field(tuple, 1);
-	buf = mem_heap_alloc(heap, 8);
-	mach_write_to_8(buf, index->id);
-	dfield_set_data(dfield, buf, 8);
-
-	dict_index_copy_types(tuple, sys_index, 2);
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-	btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_GE,
-				    BTR_MODIFY_LEAF,
-				    &cursor, 0, __FILE__, __LINE__, &mtr);
+		ut_a(table->can_be_evicted);
 
-	if (cursor.up_match == dtuple_get_n_fields(tuple)) {
-		/* UPDATE SYS_INDEXES SET TYPE=index->type
-		WHERE TABLE_ID=index->table->id AND INDEX_ID=index->id */
-		ulint	len;
-		byte*	field	= rec_get_nth_field_old(
-			btr_cur_get_rec(&cursor),
-			DICT_SYS_INDEXES_TYPE_FIELD, &len);
-		if (len != 4) {
-			goto fail;
+		if (table == find_table) {
+			return(TRUE);
 		}
-		mlog_write_ulint(field, index->type, MLOG_4BYTES, &mtr);
-		status = "  InnoDB: Flagged corruption of ";
-	} else {
-fail:
-		status = "  InnoDB: Unable to flag corruption of ";
 	}
 
-	mtr_commit(&mtr);
-	mem_heap_free(heap);
-
-	ut_print_timestamp(stderr);
-	fputs(status, stderr);
-	dict_index_name_print(stderr, NULL, index);
-	putc('\n', stderr);
+	return(FALSE);
 }
 
 /**********************************************************************//**
-Flags an index corrupted in the data dictionary cache only. This
-is used mostly to mark a corrupted index when index's own dictionary
-is corrupted, and we force to load such index for repair purpose */
-UNIV_INTERN
-void
-dict_set_corrupted_index_cache_only(
-/*================================*/
-	dict_index_t*	index,		/*!< in/out: index */
-	dict_table_t*	table)		/*!< in/out: table */
+Check if a table exists in the dict table non-LRU list.
+@return TRUE if table found in non-LRU list */
+static
+ibool
+dict_non_lru_find_table(
+/*====================*/
+	const dict_table_t*	find_table)	/*!< in: table to find */
 {
-	ut_ad(index);
+	dict_table_t*		table;
+
+	ut_ad(find_table != NULL);
 	ut_ad(mutex_own(&dict_sys->mutex));
-	ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
-	ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
 
-	/* Mark the table as corrupted only if the clustered index
-	is corrupted */
-	if (dict_index_is_clust(index)) {
-		dict_table_t*	corrupt_table;
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
 
-		corrupt_table = table ? table : index->table;
-		ut_ad(!index->table || !table || index->table  == table);
+		ut_a(!table->can_be_evicted);
 
-		if (corrupt_table) {
-			corrupt_table->corrupted = TRUE;
+		if (table == find_table) {
+			return(TRUE);
 		}
 	}
 
-	index->type |= DICT_CORRUPT;
+	return(FALSE);
 }
+# endif /* UNIV_DEBUG */
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/dict/dict0load.c b/storage/innobase/dict/dict0load.cc
index 22f0d4456d5..ff93be3e76a 100644
--- a/storage/innobase/dict/dict0load.c
+++ b/storage/innobase/dict/dict0load.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file dict/dict0load.c
+@file dict/dict0load.cc
 Loads to the memory cache database object definitions
 from dictionary tables
 
@@ -37,11 +37,13 @@ Created 4/24/1996 Heikki Tuuri
 #include "mach0data.h"
 #include "dict0dict.h"
 #include "dict0boot.h"
+#include "dict0stats.h"
 #include "rem0cmp.h"
 #include "srv0start.h"
 #include "srv0srv.h"
+#include "dict0priv.h"
 #include "ha_prototypes.h" /* innobase_casedn_str() */
-
+#include "fts0priv.h"
 
 /** Following are six InnoDB system tables */
 static const char* SYSTEM_TABLE_NAME[] = {
@@ -57,6 +59,7 @@ static const char* SYSTEM_TABLE_NAME[] = {
 metadata even if it is marked as "corrupted". */
 UNIV_INTERN my_bool     srv_load_corrupted = FALSE;
 
+#ifdef UNIV_DEBUG
 /****************************************************************//**
 Compare the name of an index column.
 @return	TRUE if the i'th column of index is 'name'. */
@@ -75,6 +78,7 @@ name_of_col_is(
 
 	return(strcmp(name, dict_table_get_col_name(table, tmp)) == 0);
 }
+#endif /* UNIV_DEBUG */
 
 /********************************************************************//**
 Finds the first table name in the given database.
@@ -105,7 +109,7 @@ dict_get_first_table_name_in_db(
 
 	sys_tables = dict_table_get_low("SYS_TABLES");
 	sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
-	ut_a(!dict_table_is_comp(sys_tables));
+	ut_ad(!dict_table_is_comp(sys_tables));
 
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
@@ -128,7 +132,8 @@ loop:
 		return(NULL);
 	}
 
-	field = rec_get_nth_field_old(rec, 0, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__NAME, &len);
 
 	if (len < strlen(name)
 	    || ut_memcmp(name, field, strlen(name)) != 0) {
@@ -176,9 +181,9 @@ dict_print(void)
 	/* Enlarge the fatal semaphore wait timeout during the InnoDB table
 	monitor printout */
 
-	mutex_enter(&kernel_mutex);
-	srv_fatal_semaphore_wait_threshold += SRV_SEMAPHORE_WAIT_EXTENSION;
-	mutex_exit(&kernel_mutex);
+	os_increment_counter_by_amount(
+		server_mutex,
+		srv_fatal_semaphore_wait_threshold, 7200/*2 hours*/);
 
 	heap = mem_heap_create(1000);
 	mutex_enter(&(dict_sys->mutex));
@@ -189,11 +194,12 @@ dict_print(void)
 	while (rec) {
 		const char* err_msg;
 
-		err_msg = dict_process_sys_tables_rec(
-			heap, rec, &table, DICT_TABLE_LOAD_FROM_CACHE
-			| DICT_TABLE_UPDATE_STATS);
-
-		mtr_commit(&mtr);
+		err_msg = static_cast<const char*>(
+			dict_process_sys_tables_rec_and_mtr_commit(
+			heap, rec, &table,
+			static_cast<dict_table_info_t>(
+				DICT_TABLE_LOAD_FROM_CACHE
+				| DICT_TABLE_UPDATE_STATS), &mtr));
 
 		if (!err_msg) {
 			dict_table_print_low(table);
@@ -213,12 +219,11 @@ dict_print(void)
 	mem_heap_free(heap);
 
 	/* Restore the fatal semaphore wait timeout */
-	mutex_enter(&kernel_mutex);
-	srv_fatal_semaphore_wait_threshold -= SRV_SEMAPHORE_WAIT_EXTENSION;
-	mutex_exit(&kernel_mutex);
+	os_decrement_counter_by_amount(
+		server_mutex,
+		srv_fatal_semaphore_wait_threshold, 7200/*2 hours*/);
 }
 
-
 /********************************************************************//**
 This function gets the next system table record as it scans the table.
 @return	the next record if found, NULL if end of scan */
@@ -252,7 +257,7 @@ dict_getnext_system_low(
 }
 
 /********************************************************************//**
-This function opens a system table, and return the first record.
+This function opens a system table, and returns the first record.
 @return	first record of the system table */
 UNIV_INTERN
 const rec_t*
@@ -292,13 +297,13 @@ dict_getnext_system(
 					to the record */
 	mtr_t*		mtr)		/*!< in: the mini-transaction */
 {
-        const rec_t*	rec;
+	const rec_t*	rec;
 
 	/* Restore the position */
-        btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
 
 	/* Get the next record */
-        rec = dict_getnext_system_low(pcur, mtr);
+	rec = dict_getnext_system_low(pcur, mtr);
 
 	return(rec);
 }
@@ -309,31 +314,40 @@ both monitor table output and information schema innodb_sys_tables output.
 @return error message, or NULL on success */
 UNIV_INTERN
 const char*
-dict_process_sys_tables_rec(
-/*========================*/
+dict_process_sys_tables_rec_and_mtr_commit(
+/*=======================================*/
 	mem_heap_t*	heap,		/*!< in/out: temporary memory heap */
 	const rec_t*	rec,		/*!< in: SYS_TABLES record */
 	dict_table_t**	table,		/*!< out: dict_table_t to fill */
-	dict_table_info_t status)	/*!< in: status bit controls
+	dict_table_info_t status,	/*!< in: status bit controls
 					options such as whether we shall
 					look for dict_table_t from cache
 					first */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction,
+					will be committed */
 {
 	ulint		len;
 	const char*	field;
 	const char*	err_msg = NULL;
 	char*		table_name;
 
-	field = (const char*) rec_get_nth_field_old(rec, 0, &len);
+	field = (const char*) rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__NAME, &len);
 
 	ut_a(!rec_get_deleted_flag(rec, 0));
 
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+
 	/* Get the table name */
 	table_name = mem_heap_strdupl(heap, field, len);
 
 	/* If DICT_TABLE_LOAD_FROM_CACHE is set, first check
-	whether there is cached dict_table_t struct first */
+	whether there is cached dict_table_t struct */
 	if (status & DICT_TABLE_LOAD_FROM_CACHE) {
+
+		/* Commit before load the table again */
+		mtr_commit(mtr);
+
 		*table = dict_table_get_low(table_name);
 
 		if (!(*table)) {
@@ -341,6 +355,7 @@ dict_process_sys_tables_rec(
 		}
 	} else {
 		err_msg = dict_load_table_low(table_name, rec, table);
+		mtr_commit(mtr);
 	}
 
 	if (err_msg) {
@@ -350,10 +365,10 @@ dict_process_sys_tables_rec(
 	if ((status & DICT_TABLE_UPDATE_STATS)
 	    && dict_table_get_first_index(*table)) {
 
-		/* Update statistics if DICT_TABLE_UPDATE_STATS
-		is set */
-		dict_update_statistics(*table, FALSE /* update even if
-				       initialized */);
+		/* Update statistics member fields in *table if
+		DICT_TABLE_UPDATE_STATS is set */
+		ut_ad(mutex_own(&dict_sys->mutex));
+		dict_stats_update(*table, DICT_STATS_FETCH, TRUE);
 	}
 
 	return(NULL);
@@ -376,7 +391,7 @@ dict_process_sys_indexes_rec(
 	const char*	err_msg;
 	byte*		buf;
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 
 	/* Parse the record, and get "dict_index_t" struct filled */
 	err_msg = dict_load_index_low(buf, NULL,
@@ -428,13 +443,13 @@ dict_process_sys_fields_rec(
 	byte*		last_index_id;
 	const char*	err_msg;
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 
-	last_index_id = mem_heap_alloc(heap, 8);
+	last_index_id = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(last_index_id, last_id);
 
 	err_msg = dict_load_field_low(buf, NULL, sys_field,
-				      pos, last_index_id, heap, rec, NULL, 0);
+				      pos, last_index_id, heap, rec);
 
 	*index_id = mach_read_from_8(buf);
 
@@ -442,7 +457,6 @@ dict_process_sys_fields_rec(
 
 }
 
-#ifdef FOREIGN_NOT_USED
 /********************************************************************//**
 This function parses a SYS_FOREIGN record and populate a dict_foreign_t
 structure with the information from the record. For detail information
@@ -465,50 +479,56 @@ dict_process_sys_foreign_rec(
 		return("delete-marked record in SYS_FOREIGN");
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 6)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN) {
 		return("wrong number of columns in SYS_FOREIGN record");
 	}
 
-	field = rec_get_nth_field_old(rec, 0/*ID*/, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__ID, &len);
 	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
 err_len:
 		return("incorrect column length in SYS_FOREIGN");
 	}
-	
+
 	/* This recieves a dict_foreign_t* that points to a stack variable.
 	So mem_heap_free(foreign->heap) is not used as elsewhere.
 	Since the heap used here is freed elsewhere, foreign->heap
 	is not assigned. */
 	foreign->id = mem_heap_strdupl(heap, (const char*) field, len);
 
-	rec_get_nth_field_offs_old(rec, 1/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	rec_get_nth_field_offs_old(rec, 2/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
 	/* The _lookup versions of the referenced and foreign table names
 	 are not assigned since they are not used in this dict_foreign_t */
 
-	field = rec_get_nth_field_old(rec, 3/*FOR_NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len);
+	if (len < 1 || len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 	foreign->foreign_table_name = mem_heap_strdupl(
 		heap, (const char*) field, len);
 
-	field = rec_get_nth_field_old(rec, 4/*REF_NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len);
+	if (len < 1 || len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 	foreign->referenced_table_name = mem_heap_strdupl(
 		heap, (const char*) field, len);
 
-	field = rec_get_nth_field_old(rec, 5/*N_COLS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	n_fields_and_type = mach_read_from_4(field);
@@ -518,9 +538,7 @@ err_len:
 
 	return(NULL);
 }
-#endif  /* FOREIGN_NOT_USED */
 
-#ifdef FOREIGN_NOT_USED
 /********************************************************************//**
 This function parses a SYS_FOREIGN_COLS record and extract necessary
 information from the record and return to caller.
@@ -540,56 +558,60 @@ dict_process_sys_foreign_col_rec(
 	ulint		len;
 	const byte*	field;
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		return("delete-marked record in SYS_FOREIGN_COLS");
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 6)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN_COLS) {
 		return("wrong number of columns in SYS_FOREIGN_COLS record");
 	}
 
-	field = rec_get_nth_field_old(rec, 0/*ID*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len);
+	if (len < 1 || len == UNIV_SQL_NULL) {
 err_len:
 		return("incorrect column length in SYS_FOREIGN_COLS");
 	}
 	*name = mem_heap_strdupl(heap, (char*) field, len);
 
-	field = rec_get_nth_field_old(rec, 1/*POS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	*pos = mach_read_from_4(field);
 
-	rec_get_nth_field_offs_old(rec, 2/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	rec_get_nth_field_offs_old(rec, 3/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	field = rec_get_nth_field_old(rec, 4/*FOR_COL_NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len);
+	if (len < 1 || len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 	*for_col_name = mem_heap_strdupl(heap, (char*) field, len);
 
-	field = rec_get_nth_field_old(rec, 5/*REF_COL_NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len);
+	if (len < 1 || len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 	*ref_col_name = mem_heap_strdupl(heap, (char*) field, len);
 
 	return(NULL);
 }
-#endif  /* FOREIGN_NOT_USED */
 
 /********************************************************************//**
-Determine the flags of a table described in SYS_TABLES.
-@return compressed page size in kilobytes; or 0 if the tablespace is
-uncompressed, ULINT_UNDEFINED on error */
+Determine the flags of a table as stored in SYS_TABLES.TYPE and N_COLS.
+@return  ULINT_UNDEFINED if error, else a valid dict_table_t::flags. */
 static
 ulint
 dict_sys_tables_get_flags(
@@ -598,54 +620,32 @@ dict_sys_tables_get_flags(
 {
 	const byte*	field;
 	ulint		len;
+	ulint		type;
 	ulint		n_cols;
-	ulint		flags;
 
-	field = rec_get_nth_field_old(rec, 5, &len);
+	/* read the 4 byte flags from the TYPE field */
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__TYPE, &len);
 	ut_a(len == 4);
+	type = mach_read_from_4(field);
 
-	flags = mach_read_from_4(field);
-
-	if (UNIV_LIKELY(flags == DICT_TABLE_ORDINARY)) {
-		return(0);
-	}
-
-	field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
+	/* The low order bit of SYS_TABLES.TYPE is always set to 1. If no
+	other bits are used, that is defined as SYS_TABLE_TYPE_ANTELOPE.
+	But in dict_table_t::flags the low order bit is used to determine
+	if the row format is Redundant or Compact when the format is
+	Antelope.
+	Read the 4 byte N_COLS field and look at the high order bit.  It
+	should be set for COMPACT and later.  It should not be set for
+	REDUNDANT. */
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+	ut_a(len == 4);
 	n_cols = mach_read_from_4(field);
 
-	if (UNIV_UNLIKELY(!(n_cols & 0x80000000UL))) {
-		/* New file formats require ROW_FORMAT=COMPACT. */
-		return(ULINT_UNDEFINED);
-	}
-
-	switch (flags & (DICT_TF_FORMAT_MASK | DICT_TF_COMPACT)) {
-	default:
-	case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT:
-	case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT | DICT_TF_COMPACT:
-		/* flags should be DICT_TABLE_ORDINARY,
-		or DICT_TF_FORMAT_MASK should be nonzero. */
-		return(ULINT_UNDEFINED);
-
-	case DICT_TF_FORMAT_ZIP << DICT_TF_FORMAT_SHIFT | DICT_TF_COMPACT:
-#if DICT_TF_FORMAT_MAX > DICT_TF_FORMAT_ZIP
-# error "missing case labels for DICT_TF_FORMAT_ZIP .. DICT_TF_FORMAT_MAX"
-#endif
-		/* We support this format. */
-		break;
-	}
-
-	if (UNIV_UNLIKELY((flags & DICT_TF_ZSSIZE_MASK)
-			  > (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT))) {
-		/* Unsupported compressed page size. */
-		return(ULINT_UNDEFINED);
-	}
-
-	if (UNIV_UNLIKELY(flags & (~0 << DICT_TF_BITS))) {
-		/* Some unused bits are set. */
-		return(ULINT_UNDEFINED);
-	}
-
-	return(flags);
+	/* This validation function also combines the DICT_N_COLS_COMPACT
+	flag in n_cols into the type field to effectively make it a
+	dict_table_t::flags. */
+	return(dict_sys_tables_type_validate(type, n_cols));
 }
 
 /********************************************************************//**
@@ -675,7 +675,7 @@ dict_check_tablespaces_and_store_max_id(
 
 	sys_tables = dict_table_get_low("SYS_TABLES");
 	sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
-	ut_a(!dict_table_is_comp(sys_tables));
+	ut_ad(!dict_table_is_comp(sys_tables));
 
 	max_space_id = mtr_read_ulint(dict_hdr_get(&mtr)
 				      + DICT_HDR_MAX_SPACE_ID,
@@ -716,13 +716,16 @@ loop:
 		ulint		flags;
 		char*		name;
 
-		field = rec_get_nth_field_old(rec, 0, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__NAME, &len);
 		name = mem_strdupl((char*) field, len);
 
 		flags = dict_sys_tables_get_flags(rec);
 		if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
-
-			field = rec_get_nth_field_old(rec, 5, &len);
+			/* Read again the 4 bytes from rec. */
+			field = rec_get_nth_field_old(
+				rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+			ut_ad(len == 4); /* this was checked earlier */
 			flags = mach_read_from_4(field);
 
 			ut_print_timestamp(stderr);
@@ -736,7 +739,8 @@ loop:
 			goto loop;
 		}
 
-		field = rec_get_nth_field_old(rec, 9, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__SPACE, &len);
 		ut_a(len == 4);
 
 		space_id = mach_read_from_4(field);
@@ -753,13 +757,15 @@ loop:
 			Do not print warnings for temporary tables. */
 			ibool	is_temp;
 
-			field = rec_get_nth_field_old(rec, 4, &len);
-			if (0x80000000UL &  mach_read_from_4(field)) {
+			field = rec_get_nth_field_old(
+				rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+			if (mach_read_from_4(field) & DICT_N_COLS_COMPACT) {
 				/* ROW_FORMAT=COMPACT: read the is_temp
 				flag from SYS_TABLES.MIX_LEN. */
-				field = rec_get_nth_field_old(rec, 7, &len);
-				is_temp = mach_read_from_4(field)
-					& DICT_TF2_TEMPORARY;
+				field = rec_get_nth_field_old(
+					rec, 7/*MIX_LEN*/, &len);
+				is_temp = !!(mach_read_from_4(field)
+					     & DICT_TF2_TEMPORARY);
 			} else {
 				/* For tables created with old versions
 				of InnoDB, SYS_TABLES.MIX_LEN may contain
@@ -772,13 +778,14 @@ loop:
 			}
 
 			fil_space_for_table_exists_in_mem(
-				space_id, name, is_temp, TRUE, !is_temp);
+				space_id, name, TRUE, !is_temp);
 		} else {
 			/* It is a normal database startup: create the space
 			object and check that the .ibd file exists. */
 
-			fil_open_single_table_tablespace(FALSE, space_id,
-							 flags, name);
+			fil_open_single_table_tablespace(
+				FALSE, space_id,
+				dict_tf_to_fsp_flags(flags), name);
 		}
 
 		mem_free(name);
@@ -825,49 +832,54 @@ dict_load_column_low(
 
 	ut_ad(table || column);
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		return("delete-marked record in SYS_COLUMNS");
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 9)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_COLUMNS) {
 		return("wrong number of columns in SYS_COLUMNS record");
 	}
 
-	field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len);
+	if (len != 8) {
 err_len:
 		return("incorrect column length in SYS_COLUMNS");
 	}
 
 	if (table_id) {
 		*table_id = mach_read_from_8(field);
-	} else if (UNIV_UNLIKELY(table->id != mach_read_from_8(field))) {
+	} else if (table->id != mach_read_from_8(field)) {
 		return("SYS_COLUMNS.TABLE_ID mismatch");
 	}
 
-	field = rec_get_nth_field_old(rec, 1/*POS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__POS, &len);
+	if (len != 4) {
 
 		goto err_len;
 	}
 
 	pos = mach_read_from_4(field);
 
-	if (UNIV_UNLIKELY(table && table->n_def != pos)) {
+	if (table && table->n_def != pos) {
 		return("SYS_COLUMNS.POS mismatch");
 	}
 
-	rec_get_nth_field_offs_old(rec, 2/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_COLUMNS__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	rec_get_nth_field_offs_old(rec, 3/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	field = rec_get_nth_field_old(rec, 4/*NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__NAME, &len);
+	if (len < 1 || len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
@@ -877,15 +889,17 @@ err_len:
 		*col_name = name;
 	}
 
-	field = rec_get_nth_field_old(rec, 5/*MTYPE*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__MTYPE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
 	mtype = mach_read_from_4(field);
 
-	field = rec_get_nth_field_old(rec, 6/*PRTYPE*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__PRTYPE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	prtype = mach_read_from_4(field);
@@ -911,13 +925,15 @@ err_len:
 		}
 	}
 
-	field = rec_get_nth_field_old(rec, 7/*LEN*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__LEN, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	col_len = mach_read_from_4(field);
-	field = rec_get_nth_field_old(rec, 8/*PREC*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_COLUMNS__PREC, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
@@ -958,15 +974,17 @@ dict_load_columns(
 
 	sys_columns = dict_table_get_low("SYS_COLUMNS");
 	sys_index = UT_LIST_GET_FIRST(sys_columns->indexes);
-	ut_a(!dict_table_is_comp(sys_columns));
+	ut_ad(!dict_table_is_comp(sys_columns));
 
-	ut_a(name_of_col_is(sys_columns, sys_index, 4, "NAME"));
-	ut_a(name_of_col_is(sys_columns, sys_index, 8, "PREC"));
+	ut_ad(name_of_col_is(sys_columns, sys_index,
+			     DICT_FLD__SYS_COLUMNS__NAME, "NAME"));
+	ut_ad(name_of_col_is(sys_columns, sys_index,
+			     DICT_FLD__SYS_COLUMNS__PREC, "PREC"));
 
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(buf, table->id);
 
 	dfield_set_data(dfield, buf, 8);
@@ -975,14 +993,49 @@ dict_load_columns(
 	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
 				  BTR_SEARCH_LEAF, &pcur, &mtr);
 	for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) {
-		const char* err_msg;
+		const char*	err_msg;
+		const char*	name;
 
 		rec = btr_pcur_get_rec(&pcur);
 
 		ut_a(btr_pcur_is_on_user_rec(&pcur));
 
 		err_msg = dict_load_column_low(table, heap, NULL, NULL,
-					       NULL, rec);
+					       &name, rec);
+
+		/* Note: Currently we have one DOC_ID column that is
+		shared by all FTS indexes on a table. */
+		if (innobase_strcasecmp(name,
+					FTS_DOC_ID_COL_NAME) == 0) {
+			dict_col_t*	col;
+			/* As part of normal loading of tables the
+			table->flag is not set for tables with FTS
+			till after the FTS indexes are loaded. So we
+			create the fts_t instance here if there isn't
+			one already created.
+
+			This case does not arise for table create as
+			the flag is set before the table is created. */
+			if (table->fts == NULL) {
+				table->fts = fts_create(table);
+				fts_optimize_add_table(table);
+			}
+
+			ut_a(table->fts->doc_col == ULINT_UNDEFINED);
+
+			col = dict_table_get_nth_col(table, i);
+
+			ut_ad(col->len == sizeof(doc_id_t));
+
+			if (col->prtype & DATA_FTS_DOC_ID) {
+				DICT_TF2_FLAG_SET(
+					table, DICT_TF2_FTS_HAS_DOC_ID);
+				DICT_TF2_FLAG_UNSET(
+					table, DICT_TF2_FTS_ADD_DOC_ID);
+			}
+
+			table->fts->doc_col = i;
+		}
 
 		if (err_msg) {
 			fprintf(stderr, "InnoDB: %s\n", err_msg);
@@ -999,9 +1052,6 @@ dict_load_columns(
 /** Error message for a delete-marked record in dict_load_field_low() */
 static const char* dict_load_field_del = "delete-marked record in SYS_FIELDS";
 
-static const char* dict_load_field_too_big = "column prefix exceeds maximum"
-					     " limit";
-
 /********************************************************************//**
 Loads an index field definition from a SYS_FIELDS record to
 dict_index_t.
@@ -1012,23 +1062,18 @@ dict_load_field_low(
 /*================*/
 	byte*		index_id,	/*!< in/out: index id (8 bytes)
 					an "in" value if index != NULL
-                                        and "out" if index == NULL */
+					and "out" if index == NULL */
 	dict_index_t*	index,		/*!< in/out: index, could be NULL
 					if we just populate a dict_field_t
 					struct with information from
-					a SYS_FIELDSS record */
+					a SYS_FIELDS record */
 	dict_field_t*	sys_field,	/*!< out: dict_field_t to be
 					filled */
 	ulint*		pos,		/*!< out: Field position */
 	byte*		last_index_id,	/*!< in: last index id */
 	mem_heap_t*	heap,		/*!< in/out: memory heap
 					for temporary storage */
-	const rec_t*	rec,		/*!< in: SYS_FIELDS record */
-	char*		addition_err_str,/*!< out: additional error message
-					that requires information to be
-					filled, or NULL */
-	ulint		err_str_len)	/*!< in: length of addition_err_str
-					in bytes */
+	const rec_t*	rec)		/*!< in: SYS_FIELDS record */
 {
 	const byte*	field;
 	ulint		len;
@@ -1040,23 +1085,24 @@ dict_load_field_low(
 	/* Either index or sys_field is supplied, not both */
 	ut_a((!index) || (!sys_field));
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		return(dict_load_field_del);
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 5)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FIELDS) {
 		return("wrong number of columns in SYS_FIELDS record");
 	}
 
-	field = rec_get_nth_field_old(rec, 0/*INDEX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FIELDS__INDEX_ID, &len);
+	if (len != 8) {
 err_len:
 		return("incorrect column length in SYS_FIELDS");
 	}
 
 	if (!index) {
 		ut_a(last_index_id);
-		memcpy(index_id, (const char*)field, 8);
+		memcpy(index_id, (const char*) field, 8);
 		first_field = memcmp(index_id, last_index_id, 8);
 	} else {
 		first_field = (index->n_def == 0);
@@ -1065,20 +1111,6 @@ err_len:
 		}
 	}
 
-	field = rec_get_nth_field_old(rec, 1/*POS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
-		goto err_len;
-	}
-
-	rec_get_nth_field_offs_old(rec, 2/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
-		goto err_len;
-	}
-	rec_get_nth_field_offs_old(rec, 3/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
-		goto err_len;
-	}
-
 	/* The next field stores the field position in the index and a
 	possible column prefix length if the index field does not
 	contain the whole column. The storage format is like this: if
@@ -1087,6 +1119,12 @@ err_len:
 	bytes the prefix length for the field. Otherwise the field
 	number (index->n_def) is contained in the 2 LOW bytes. */
 
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FIELDS__POS, &len);
+	if (len != 4) {
+		goto err_len;
+	}
+
 	pos_and_prefix_len = mach_read_from_4(field);
 
 	if (index && UNIV_UNLIKELY
@@ -1103,22 +1141,21 @@ err_len:
 		position = pos_and_prefix_len & 0xFFFFUL;
 	}
 
-	field = rec_get_nth_field_old(rec, 4, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FIELDS__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+		goto err_len;
+	}
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_FIELDS__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	if (prefix_len > REC_VERSION_56_MAX_INDEX_COL_LEN) {
-		if (addition_err_str) {
-			ut_snprintf(addition_err_str, err_str_len,
-				    "index field '%s' has a prefix length"
-				    " of %lu bytes",
-				    mem_heap_strdupl(
-						heap, (const char*) field, len),
-				    (ulong) prefix_len);
-		}
-
-		return(dict_load_field_too_big);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FIELDS__COL_NAME, &len);
+	if (len < 1 || len == UNIV_SQL_NULL) {
+		goto err_len;
 	}
 
 	if (index) {
@@ -1165,13 +1202,14 @@ dict_load_fields(
 
 	sys_fields = dict_table_get_low("SYS_FIELDS");
 	sys_index = UT_LIST_GET_FIRST(sys_fields->indexes);
-	ut_a(!dict_table_is_comp(sys_fields));
-	ut_a(name_of_col_is(sys_fields, sys_index, 4, "COL_NAME"));
+	ut_ad(!dict_table_is_comp(sys_fields));
+	ut_ad(name_of_col_is(sys_fields, sys_index,
+			     DICT_FLD__SYS_FIELDS__COL_NAME, "COL_NAME"));
 
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(buf, index->id);
 
 	dfield_set_data(dfield, buf, 8);
@@ -1180,16 +1218,14 @@ dict_load_fields(
 	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
 				  BTR_SEARCH_LEAF, &pcur, &mtr);
 	for (i = 0; i < index->n_fields; i++) {
-		const char*	err_msg;
-		char		addition_err_str[1024];
+		const char* err_msg;
 
 		rec = btr_pcur_get_rec(&pcur);
 
 		ut_a(btr_pcur_is_on_user_rec(&pcur));
 
 		err_msg = dict_load_field_low(buf, index, NULL, NULL, NULL,
-					      heap, rec, addition_err_str,
-					      sizeof(addition_err_str));
+					      heap, rec);
 
 		if (err_msg == dict_load_field_del) {
 			/* There could be delete marked records in
@@ -1198,24 +1234,7 @@ dict_load_fields(
 
 			goto next_rec;
 		} else if (err_msg) {
-			if (err_msg == dict_load_field_too_big) {
-				fprintf(stderr, "InnoDB: Error: load index"
-					" '%s' failed.\n"
-					"InnoDB: %s,\n"
-					"InnoDB: which exceeds the"
-					" maximum limit of %lu bytes.\n"
-					"InnoDB: Please use server that"
-					" supports long index prefix\n"
-					"InnoDB: or turn on"
-					" innodb_force_recovery to load"
-					" the table\n",
-					index->name, addition_err_str,
-					(ulong) (REC_VERSION_56_MAX_INDEX_COL_LEN));
-
-			} else {
-				fprintf(stderr, "InnoDB: %s\n", err_msg);
-			}
-
+			fprintf(stderr, "InnoDB: %s\n", err_msg);
 			error = DB_CORRUPTION;
 			goto func_exit;
 		}
@@ -1271,76 +1290,85 @@ dict_load_index_low(
 		*index = NULL;
 	}
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		return(dict_load_index_del);
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 9)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_INDEXES) {
 		return("wrong number of columns in SYS_INDEXES record");
 	}
 
-	field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len);
+	if (len != 8) {
 err_len:
 		return("incorrect column length in SYS_INDEXES");
 	}
 
 	if (!allocate) {
 		/* We are reading a SYS_INDEXES record. Copy the table_id */
-		memcpy(table_id, (const char*)field, 8);
+		memcpy(table_id, (const char*) field, 8);
 	} else if (memcmp(field, table_id, 8)) {
 		/* Caller supplied table_id, verify it is the same
 		id as on the index record */
 		return(dict_load_index_id_err);
 	}
 
-	field = rec_get_nth_field_old(rec, 1/*ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__ID, &len);
+	if (len != 8) {
 		goto err_len;
 	}
 
 	id = mach_read_from_8(field);
 
-	rec_get_nth_field_offs_old(rec, 2/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_INDEXES__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	rec_get_nth_field_offs_old(rec, 3/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_INDEXES__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	field = rec_get_nth_field_old(rec, 4/*NAME*/, &name_len);
-	if (UNIV_UNLIKELY(name_len == UNIV_SQL_NULL)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__NAME, &name_len);
+	if (name_len == UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
 	name_buf = mem_heap_strdupl(heap, (const char*) field,
 				    name_len);
 
-	field = rec_get_nth_field_old(rec, 5/*N_FIELDS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__N_FIELDS, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	n_fields = mach_read_from_4(field);
 
-	field = rec_get_nth_field_old(rec, 6/*TYPE*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__TYPE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	type = mach_read_from_4(field);
-	if (UNIV_UNLIKELY(type & (~0 << DICT_IT_BITS))) {
+	if (type & (~0 << DICT_IT_BITS)) {
 		return("unknown SYS_INDEXES.TYPE bits");
 	}
 
-	field = rec_get_nth_field_old(rec, 7/*SPACE*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 	space = mach_read_from_4(field);
 
-	field = rec_get_nth_field_old(rec, 8/*PAGE_NO*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
@@ -1392,14 +1420,16 @@ dict_load_indexes(
 
 	sys_indexes = dict_table_get_low("SYS_INDEXES");
 	sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes);
-	ut_a(!dict_table_is_comp(sys_indexes));
-	ut_a(name_of_col_is(sys_indexes, sys_index, 4, "NAME"));
-	ut_a(name_of_col_is(sys_indexes, sys_index, 8, "PAGE_NO"));
+	ut_ad(!dict_table_is_comp(sys_indexes));
+	ut_ad(name_of_col_is(sys_indexes, sys_index,
+			     DICT_FLD__SYS_INDEXES__NAME, "NAME"));
+	ut_ad(name_of_col_is(sys_indexes, sys_index,
+			     DICT_FLD__SYS_INDEXES__PAGE_NO, "PAGE_NO"));
 
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(buf, table->id);
 
 	dfield_set_data(dfield, buf, 8);
@@ -1420,7 +1450,8 @@ dict_load_indexes(
 
 		err_msg = dict_load_index_low(buf, table->name, heap, rec,
 					      TRUE, &index);
-		ut_ad((index == NULL) == (err_msg != NULL));
+		ut_ad((index == NULL && err_msg != NULL)
+		      || (index != NULL && err_msg == NULL));
 
 		if (err_msg == dict_load_index_id_err) {
 			/* TABLE_ID mismatch means that we have
@@ -1468,10 +1499,17 @@ dict_load_indexes(
 			}
 		}
 
+		if (index->type & DICT_FTS
+		    && !DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS)) {
+			/* This should have been created by now. */
+			ut_a(table->fts != NULL);
+			DICT_TF2_FLAG_SET(table, DICT_TF2_FTS);
+		}
+
 		/* We check for unsupported types first, so that the
 		subsequent checks are relevant for the supported types. */
 		if (index->type & ~(DICT_CLUSTERED | DICT_UNIQUE
-				    | DICT_CORRUPT)) {
+				    | DICT_CORRUPT | DICT_FTS)) {
 			fprintf(stderr,
 				"InnoDB: Error: unknown type %lu"
 				" of index %s of table %s\n",
@@ -1480,7 +1518,8 @@ dict_load_indexes(
 			error = DB_UNSUPPORTED;
 			dict_mem_index_free(index);
 			goto func_exit;
-		} else if (index->page == FIL_NULL) {
+		} else if (index->page == FIL_NULL
+			   && (!(index->type & DICT_FTS))) {
 
 			fprintf(stderr,
 				"InnoDB: Error: trying to load index %s"
@@ -1530,27 +1569,7 @@ corrupted:
 			of the database server */
 			dict_mem_index_free(index);
 		} else {
-			error = dict_load_fields(index, heap);
-
-			if (error != DB_SUCCESS) {
-
-				fprintf(stderr, "InnoDB: Error: load index '%s'"
-					" for table '%s' failed\n",
-					index->name, table->name);
-
-				/* If the force recovery flag is set, and
-				if the failed index is not the clustered index,
-				we will continue and open other indexes */
-				if ((srv_force_recovery
-				     || srv_load_corrupted)
-				    && !dict_index_is_clust(index)) {
-					error = DB_SUCCESS;
-					goto next_rec;
-				} else {
-					goto func_exit;
-				}
-			}
-
+			dict_load_fields(index, heap);
 			error = dict_index_add_to_cache(table, index,
 							index->page, FALSE);
 			/* The data dictionary tables should never contain
@@ -1563,11 +1582,17 @@ corrupted:
 				goto func_exit;
 			}
 		}
-
 next_rec:
 		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
 	}
 
+	/* If the table contains FTS indexes, populate table->fts->indexes */
+	if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS)) {
+		/* table->fts->indexes should have been created. */
+		ut_a(table->fts->indexes != NULL);
+		dict_table_get_all_fts_indexes(table, table->fts->indexes);
+	}
+
 func_exit:
 	btr_pcur_close(&pcur);
 	mtr_commit(&mtr);
@@ -1591,109 +1616,109 @@ dict_load_table_low(
 	ulint		len;
 	ulint		space;
 	ulint		n_cols;
-	ulint		flags;
+	ulint		flags = 0;
+	ulint		flags2;
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+	if (rec_get_deleted_flag(rec, 0)) {
 		return("delete-marked record in SYS_TABLES");
 	}
 
-	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) != 10)) {
+	if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES) {
 		return("wrong number of columns in SYS_TABLES record");
 	}
 
-	rec_get_nth_field_offs_old(rec, 0/*NAME*/, &len);
-	if (UNIV_UNLIKELY(len < 1 || len == UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLES__NAME, &len);
+	if (len < 1 || len == UNIV_SQL_NULL) {
 err_len:
 		return("incorrect column length in SYS_TABLES");
 	}
-	rec_get_nth_field_offs_old(rec, 1/*DB_TRX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len);
+	if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
-	rec_get_nth_field_offs_old(rec, 2/*DB_ROLL_PTR*/, &len);
-	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLES__DB_ROLL_PTR, &len);
+	if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	rec_get_nth_field_offs_old(rec, 3/*ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__ID, &len);
+	if (len != 8) {
 		goto err_len;
 	}
 
-	field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
 	n_cols = mach_read_from_4(field);
 
-	rec_get_nth_field_offs_old(rec, 5/*TYPE*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
-	rec_get_nth_field_offs_old(rec, 6/*MIX_ID*/, &len);
-	if (UNIV_UNLIKELY(len != 8)) {
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLES__MIX_ID, &len);
+	if (len != 8) {
 		goto err_len;
 	}
 
-	rec_get_nth_field_offs_old(rec, 7/*MIX_LEN*/, &len);
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
-	rec_get_nth_field_offs_old(rec, 8/*CLUSTER_ID*/, &len);
-	if (UNIV_UNLIKELY(len != UNIV_SQL_NULL)) {
+	/* MIX_LEN may hold additional flags in post-antelope file formats. */
+	flags2 = mach_read_from_4(field);
+
+	/* DICT_TF2_FTS will be set when indexes is being loaded */
+	flags2 &= ~DICT_TF2_FTS;
+
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_TABLES__CLUSTER_ID, &len);
+	if (len != UNIV_SQL_NULL) {
 		goto err_len;
 	}
 
-	field = rec_get_nth_field_old(rec, 9/*SPACE*/, &len);
-
-	if (UNIV_UNLIKELY(len != 4)) {
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__SPACE, &len);
+	if (len != 4) {
 		goto err_len;
 	}
 
 	space = mach_read_from_4(field);
 
 	/* Check if the tablespace exists and has the right name */
-	if (space != 0) {
-		flags = dict_sys_tables_get_flags(rec);
+	flags = dict_sys_tables_get_flags(rec);
 
-		if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
-			field = rec_get_nth_field_old(rec, 5/*TYPE*/, &len);
-			ut_ad(len == 4); /* this was checked earlier */
-			flags = mach_read_from_4(field);
+	if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+		ut_ad(len == 4); /* this was checked earlier */
+		flags = mach_read_from_4(field);
 
-			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Error: table ", stderr);
-			ut_print_filename(stderr, name);
-			fprintf(stderr, "\n"
-				"InnoDB: in InnoDB data dictionary"
-				" has unknown type %lx.\n",
-				(ulong) flags);
-			return("incorrect flags in SYS_TABLES");
-		}
-	} else {
-		flags = 0;
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: table ", stderr);
+		ut_print_filename(stderr, name);
+		fprintf(stderr, "\n"
+			"InnoDB: in InnoDB data dictionary"
+			" has unknown type %lx.\n",
+			(ulong) flags);
+		return("incorrect flags in SYS_TABLES");
 	}
 
 	/* The high-order bit of N_COLS is the "compact format" flag.
 	For tables in that format, MIX_LEN may hold additional flags. */
-	if (n_cols & 0x80000000UL) {
-		ulint	flags2;
-
-		flags |= DICT_TF_COMPACT;
-
-		field = rec_get_nth_field_old(rec, 7, &len);
-
-		if (UNIV_UNLIKELY(len != 4)) {
-
-			goto err_len;
-		}
-
-		flags2 = mach_read_from_4(field);
+	if (n_cols & DICT_N_COLS_COMPACT) {
+		ut_ad(flags & DICT_TF_COMPACT);
 
-		if (flags2 & (~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT))) {
+		if (flags2 & ~DICT_TF2_BIT_MASK) {
 			ut_print_timestamp(stderr);
 			fputs("  InnoDB: Warning: table ", stderr);
 			ut_print_filename(stderr, name);
@@ -1702,17 +1727,20 @@ err_len:
 				" has unknown flags %lx.\n",
 				(ulong) flags2);
 
-			flags2 &= ~(~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT));
+			/* Clean it up and keep going */
+			flags2 &= DICT_TF2_BIT_MASK;
 		}
-
-		flags |= flags2 << DICT_TF2_SHIFT;
+	} else {
+		/* Do not trust the MIX_LEN field when the
+		row format is Redundant. */
+		flags2 = 0;
 	}
 
 	/* See if the tablespace is available. */
-	*table = dict_mem_table_create(name, space, n_cols & ~0x80000000UL,
-				       flags);
+	*table = dict_mem_table_create(
+		name, space, n_cols & ~DICT_N_COLS_COMPACT, flags, flags2);
 
-	field = rec_get_nth_field_old(rec, 3/*ID*/, &len);
+	field = rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__ID, &len);
 	ut_ad(len == 8); /* this was checked earlier */
 
 	(*table)->id = mach_read_from_8(field);
@@ -1764,12 +1792,17 @@ dict_load_table(
 
 	sys_tables = dict_table_get_low("SYS_TABLES");
 	sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
-	ut_a(!dict_table_is_comp(sys_tables));
-	ut_a(name_of_col_is(sys_tables, sys_index, 3, "ID"));
-	ut_a(name_of_col_is(sys_tables, sys_index, 4, "N_COLS"));
-	ut_a(name_of_col_is(sys_tables, sys_index, 5, "TYPE"));
-	ut_a(name_of_col_is(sys_tables, sys_index, 7, "MIX_LEN"));
-	ut_a(name_of_col_is(sys_tables, sys_index, 9, "SPACE"));
+	ut_ad(!dict_table_is_comp(sys_tables));
+	ut_ad(name_of_col_is(sys_tables, sys_index,
+			     DICT_FLD__SYS_TABLES__ID, "ID"));
+	ut_ad(name_of_col_is(sys_tables, sys_index,
+			     DICT_FLD__SYS_TABLES__N_COLS, "N_COLS"));
+	ut_ad(name_of_col_is(sys_tables, sys_index,
+			     DICT_FLD__SYS_TABLES__TYPE, "TYPE"));
+	ut_ad(name_of_col_is(sys_tables, sys_index,
+			     DICT_FLD__SYS_TABLES__MIX_LEN, "MIX_LEN"));
+	ut_ad(name_of_col_is(sys_tables, sys_index,
+			     DICT_FLD__SYS_TABLES__SPACE, "SPACE"));
 
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
@@ -1792,7 +1825,8 @@ err_exit:
 		return(NULL);
 	}
 
-	field = rec_get_nth_field_old(rec, 0, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_TABLES__NAME, &len);
 
 	/* Check if the table name in record is the searched one */
 	if (len != ut_strlen(name) || ut_memcmp(name, field, len) != 0) {
@@ -1812,12 +1846,9 @@ err_exit:
 	if (table->space == 0) {
 		/* The system tablespace is always available. */
 	} else if (!fil_space_for_table_exists_in_mem(
-			   table->space, name,
-			   (table->flags >> DICT_TF2_SHIFT)
-			   & DICT_TF2_TEMPORARY,
-			   FALSE, FALSE)) {
+			table->space, name, FALSE, FALSE)) {
 
-		if (table->flags & (DICT_TF2_TEMPORARY << DICT_TF2_SHIFT)) {
+		if (table->flags2 & DICT_TF2_TEMPORARY) {
 			/* Do not bother to retry opening temporary tables. */
 			table->ibd_file_missing = TRUE;
 		} else {
@@ -1832,8 +1863,8 @@ err_exit:
 			/* Try to open the tablespace */
 			if (!fil_open_single_table_tablespace(
 				TRUE, table->space,
-				table->flags == DICT_TF_COMPACT ? 0 :
-				table->flags & ~(~0 << DICT_TF_BITS), name)) {
+				dict_tf_to_fsp_flags(table->flags),
+				name)) {
 				/* We failed to find a sensible
 				tablespace file */
 
@@ -1848,7 +1879,7 @@ err_exit:
 	dict_load_columns(table, heap);
 
 	if (cached) {
-		dict_table_add_to_cache(table, heap);
+		dict_table_add_to_cache(table, TRUE, heap);
 	} else {
 		dict_table_add_system_columns(table, heap);
 	}
@@ -1884,7 +1915,7 @@ err_exit:
 	/* Initialize table foreign_child value. Its value could be
 	changed when dict_load_foreigns() is called below */
 	table->fk_max_recusive_level = 0;
- 
+
 	/* If the force recovery flag is set, we open the table irrespective
 	of the error condition, since the user may want to dump data from the
 	clustered index. However we load the foreign key information only if
@@ -1892,7 +1923,7 @@ err_exit:
 	if (!cached) {
 	} else if (err == DB_SUCCESS) {
 		err = dict_load_foreigns(table->name, TRUE, TRUE);
- 
+
 		if (err != DB_SUCCESS) {
 			dict_table_remove_from_cache(table);
 			table = NULL;
@@ -1900,7 +1931,7 @@ err_exit:
 			table->fk_max_recusive_level = 0;
 		}
 	} else {
-		dict_index_t*	index;
+		dict_index_t*   index;
 
 		/* Make sure that at least the clustered index was loaded.
 		Otherwise refuse to load the table */
@@ -1945,6 +1976,9 @@ err_exit:
 func_exit:
 	mem_heap_free(heap);
 
+	ut_ad(!table || ignore_err != DICT_ERR_IGNORE_NONE
+	      || !table->corrupted);
+
 	return(table);
 }
 
@@ -1984,7 +2018,7 @@ dict_load_table_on_id(
 	sys_tables = dict_sys->sys_tables;
 	sys_table_ids = dict_table_get_next_index(
 		dict_table_get_first_index(sys_tables));
-	ut_a(!dict_table_is_comp(sys_tables));
+	ut_ad(!dict_table_is_comp(sys_tables));
 	heap = mem_heap_create(256);
 
 	tuple  = dtuple_create(heap, 1);
@@ -1998,40 +2032,44 @@ dict_load_table_on_id(
 
 	btr_pcur_open_on_user_rec(sys_table_ids, tuple, PAGE_CUR_GE,
 				  BTR_SEARCH_LEAF, &pcur, &mtr);
-	rec = btr_pcur_get_rec(&pcur);
-
-	if (!btr_pcur_is_on_user_rec(&pcur)) {
-		/* Not found */
-		goto func_exit;
-	}
-
-	/* Find the first record that is not delete marked */
-	while (rec_get_deleted_flag(rec, 0)) {
-		if (!btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
-			goto func_exit;
-		}
-		rec = btr_pcur_get_rec(&pcur);
-	}
-
-	/*---------------------------------------------------*/
-	/* Now we have the record in the secondary index containing the
-	table ID and NAME */
 
+check_rec:
 	rec = btr_pcur_get_rec(&pcur);
-	field = rec_get_nth_field_old(rec, 0, &len);
-	ut_ad(len == 8);
 
-	/* Check if the table id in record is the one searched for */
-	if (table_id != mach_read_from_8(field)) {
-		goto func_exit;
+	if (page_rec_is_user_rec(rec)) {
+		/*---------------------------------------------------*/
+		/* Now we have the record in the secondary index
+		containing the table ID and NAME */
+
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLE_IDS__ID, &len);
+		ut_ad(len == 8);
+
+		/* Check if the table id in record is the one searched for */
+		if (table_id == mach_read_from_8(field)) {
+			if (rec_get_deleted_flag(rec, 0)) {
+				/* Until purge has completed, there
+				may be delete-marked duplicate records
+				for the same SYS_TABLES.ID.
+				Due to Bug #60049, some delete-marked
+				records may survive the purge forever. */
+				if (btr_pcur_move_to_next(&pcur, &mtr)) {
+
+					goto check_rec;
+				}
+			} else {
+				/* Now we get the table name from the record */
+				field = rec_get_nth_field_old(rec,
+					DICT_FLD__SYS_TABLE_IDS__NAME, &len);
+				/* Load the table definition to memory */
+				table = dict_load_table(
+					mem_heap_strdupl(
+						heap, (char*) field, len),
+					TRUE, DICT_ERR_IGNORE_NONE);
+			}
+		}
 	}
 
-	/* Now we get the table name from the record */
-	field = rec_get_nth_field_old(rec, 1, &len);
-	/* Load the table definition to memory */
-	table = dict_load_table(mem_heap_strdupl(heap, (char*) field, len),
-				TRUE, DICT_ERR_IGNORE_NONE);
-func_exit:
 	btr_pcur_close(&pcur);
 	mtr_commit(&mtr);
 	mem_heap_free(heap);
@@ -2084,16 +2122,20 @@ dict_load_foreign_cols(
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
-	foreign->foreign_col_names = mem_heap_alloc(
-		foreign->heap, foreign->n_fields * sizeof(void*));
+	foreign->foreign_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap,
+			       foreign->n_fields * sizeof(void*)));
+
+	foreign->referenced_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap,
+			       foreign->n_fields * sizeof(void*)));
 
-	foreign->referenced_col_names = mem_heap_alloc(
-		foreign->heap, foreign->n_fields * sizeof(void*));
 	mtr_start(&mtr);
 
 	sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS");
+
 	sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes);
-	ut_a(!dict_table_is_comp(sys_foreign_cols));
+	ut_ad(!dict_table_is_comp(sys_foreign_cols));
 
 	tuple = dtuple_create(foreign->heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
@@ -2110,19 +2152,23 @@ dict_load_foreign_cols(
 		ut_a(btr_pcur_is_on_user_rec(&pcur));
 		ut_a(!rec_get_deleted_flag(rec, 0));
 
-		field = rec_get_nth_field_old(rec, 0, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len);
 		ut_a(len == id_len);
 		ut_a(ut_memcmp(id, field, len) == 0);
 
-		field = rec_get_nth_field_old(rec, 1, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len);
 		ut_a(len == 4);
 		ut_a(i == mach_read_from_4(field));
 
-		field = rec_get_nth_field_old(rec, 4, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len);
 		foreign->foreign_col_names[i] = mem_heap_strdupl(
 			foreign->heap, (char*) field, len);
 
-		field = rec_get_nth_field_old(rec, 5, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len);
 		foreign->referenced_col_names[i] = mem_heap_strdupl(
 			foreign->heap, (char*) field, len);
 
@@ -2172,8 +2218,9 @@ dict_load_foreign(
 	mtr_start(&mtr);
 
 	sys_foreign = dict_table_get_low("SYS_FOREIGN");
+
 	sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes);
-	ut_a(!dict_table_is_comp(sys_foreign));
+	ut_ad(!dict_table_is_comp(sys_foreign));
 
 	tuple = dtuple_create(heap2, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
@@ -2190,8 +2237,9 @@ dict_load_foreign(
 		/* Not found */
 
 		fprintf(stderr,
-			"InnoDB: Error A: cannot load foreign constraint "
-			"%.*s\n", (int) id_len, id);
+			"InnoDB: Error: cannot load foreign constraint "
+			"%.*s: could not find the relevant record in "
+			"SYS_FOREIGN\n", (int) id_len, id);
 
 		btr_pcur_close(&pcur);
 		mtr_commit(&mtr);
@@ -2200,14 +2248,15 @@ dict_load_foreign(
 		return(DB_ERROR);
 	}
 
-	field = rec_get_nth_field_old(rec, 0, &len);
+	field = rec_get_nth_field_old(rec, DICT_FLD__SYS_FOREIGN__ID, &len);
 
 	/* Check if the id in record is the searched one */
 	if (len != id_len || ut_memcmp(id, field, len) != 0) {
 
 		fprintf(stderr,
-			"InnoDB: Error B: cannot load foreign constraint "
-			"%.*s\n", (int) id_len, id);
+			"InnoDB: Error: cannot load foreign constraint "
+			"%.*s: found %.*s instead in SYS_FOREIGN\n",
+			(int) id_len, id, (int) len, field);
 
 		btr_pcur_close(&pcur);
 		mtr_commit(&mtr);
@@ -2224,7 +2273,8 @@ dict_load_foreign(
 	foreign = dict_mem_foreign_create();
 
 	n_fields_and_type = mach_read_from_4(
-		rec_get_nth_field_old(rec, 5, &len));
+		rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len));
 
 	ut_a(len == 4);
 
@@ -2235,13 +2285,15 @@ dict_load_foreign(
 
 	foreign->id = mem_heap_strdupl(foreign->heap, id, id_len);
 
-	field = rec_get_nth_field_old(rec, 3, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len);
 
 	foreign->foreign_table_name = mem_heap_strdupl(
 		foreign->heap, (char*) field, len);
 	dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
 
-	field = rec_get_nth_field_old(rec, 4, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len);
 	foreign->referenced_table_name = mem_heap_strdupl(
 		foreign->heap, (char*) field, len);
 	dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
@@ -2354,7 +2406,7 @@ dict_load_foreigns(
 		return(DB_ERROR);
 	}
 
-	ut_a(!dict_table_is_comp(sys_foreign));
+	ut_ad(!dict_table_is_comp(sys_foreign));
 	mtr_start(&mtr);
 
 	/* Get the secondary index based on FOR_NAME from table
@@ -2385,7 +2437,8 @@ loop:
 	name and a foreign constraint ID */
 
 	rec = btr_pcur_get_rec(&pcur);
-	field = rec_get_nth_field_old(rec, 0, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME, &len);
 
 	/* Check if the table name in the record is the one searched for; the
 	following call does the comparison in the latin1_swedish_ci
@@ -2393,7 +2446,9 @@ loop:
 
 	if (0 != cmp_data_data(dfield_get_type(dfield)->mtype,
 			       dfield_get_type(dfield)->prtype,
-			       dfield_get_data(dfield), dfield_get_len(dfield),
+			       static_cast<const byte*>(
+				       dfield_get_data(dfield)),
+			       dfield_get_len(dfield),
 			       field, len)) {
 
 		goto load_next_index;
@@ -2417,7 +2472,8 @@ loop:
 	}
 
 	/* Now we get a foreign key constraint id */
-	field = rec_get_nth_field_old(rec, 1, &len);
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__ID, &len);
 
 	btr_pcur_store_position(&pcur, &mtr);
 
diff --git a/storage/innobase/dict/dict0mem.c b/storage/innobase/dict/dict0mem.cc
index 982cca5a796..28b935d2e58 100644
--- a/storage/innobase/dict/dict0mem.c
+++ b/storage/innobase/dict/dict0mem.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /******************************************************************//**
-@file dict/dict0mem.c
+@file dict/dict0mem.cc
 Data dictionary memory object creation
 
 Created 1/8/1996 Heikki Tuuri
@@ -33,8 +33,10 @@ Created 1/8/1996 Heikki Tuuri
 #include "data0type.h"
 #include "mach0data.h"
 #include "dict0dict.h"
-#include "ha_prototypes.h" /* innobase_casedn_str()*/
+#include "fts0priv.h"
 #ifndef UNIV_HOTBACKUP
+#include "ha_prototypes.h"	/* innobase_casedn_str(),
+				innobase_get_lower_case_table_names */
 # include "lock0lock.h"
 #endif /* !UNIV_HOTBACKUP */
 #ifdef UNIV_BLOB_DEBUG
@@ -62,31 +64,40 @@ dict_mem_table_create(
 				ignored if the table is made a member of
 				a cluster */
 	ulint		n_cols,	/*!< in: number of columns */
-	ulint		flags)	/*!< in: table flags */
+	ulint		flags,	/*!< in: table flags */
+	ulint		flags2)	/*!< in: table flags2 */
 {
 	dict_table_t*	table;
 	mem_heap_t*	heap;
 
 	ut_ad(name);
-	ut_a(!(flags & (~0 << DICT_TF2_BITS)));
+	dict_tf_validate(flags);
+	ut_a(!(flags2 & ~DICT_TF2_BIT_MASK));
 
 	heap = mem_heap_create(DICT_HEAP_SIZE);
 
-	table = mem_heap_zalloc(heap, sizeof(dict_table_t));
+	table = static_cast<dict_table_t*>(
+		mem_heap_zalloc(heap, sizeof(dict_table_t)));
 
 	table->heap = heap;
 
 	table->flags = (unsigned int) flags;
-	table->name = ut_malloc(strlen(name) + 1);
+	table->flags2 = (unsigned int) flags2;
+	table->name = static_cast<char*>(ut_malloc(strlen(name) + 1));
 	memcpy(table->name, name, strlen(name) + 1);
 	table->space = (unsigned int) space;
 	table->n_cols = (unsigned int) (n_cols + DATA_N_SYS_COLS);
 
-	table->cols = mem_heap_alloc(heap, (n_cols + DATA_N_SYS_COLS)
-				     * sizeof(dict_col_t));
+	table->cols = static_cast<dict_col_t*>(
+		mem_heap_alloc(heap,
+			       (n_cols + DATA_N_SYS_COLS)
+			       * sizeof(dict_col_t)));
+
+	ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
 
 #ifndef UNIV_HOTBACKUP
-	table->autoinc_lock = mem_heap_alloc(heap, lock_get_size());
+	table->autoinc_lock = static_cast<ib_lock_t*>(
+		mem_heap_alloc(heap, lock_get_size()));
 
 	mutex_create(autoinc_mutex_key,
 		     &table->autoinc_mutex, SYNC_DICT_AUTOINC_MUTEX);
@@ -96,9 +107,20 @@ dict_mem_table_create(
 	/* The number of transactions that are either waiting on the
 	AUTOINC lock or have been granted the lock. */
 	table->n_waiting_or_granted_auto_inc_locks = 0;
+
+	/* If the table has an FTS index or we are in the process
+	of building one, create the table->fts */
+	if (dict_table_has_fts_index(table)
+	    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+	    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		table->fts = fts_create(table);
+		table->fts->cache = fts_cache_create(table);
+		fts_optimize_add_table(table);
+	} else {
+		table->fts = NULL;
+	}
 #endif /* !UNIV_HOTBACKUP */
 
-	ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
 	return(table);
 }
 
@@ -114,6 +136,15 @@ dict_mem_table_free(
 	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
 	ut_d(table->cached = FALSE);
 
+        if (dict_table_has_fts_index(table)
+            || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+            || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		if (table->fts) {
+			fts_free(table);
+		}
+
+		fts_optimize_remove_table(table);
+	}
 #ifndef UNIV_HOTBACKUP
 	mutex_free(&(table->autoinc_mutex));
 #endif /* UNIV_HOTBACKUP */
@@ -158,7 +189,7 @@ dict_add_col_name(
 	new_len = strlen(name) + 1;
 	total_len = old_len + new_len;
 
-	res = mem_heap_alloc(heap, total_len);
+	res = static_cast<char*>(mem_heap_alloc(heap, total_len));
 
 	if (old_len > 0) {
 		memcpy(res, col_names, old_len);
@@ -197,7 +228,9 @@ dict_mem_table_add_col(
 		}
 		if (UNIV_LIKELY(i) && UNIV_UNLIKELY(!table->col_names)) {
 			/* All preceding column names are empty. */
-			char* s = mem_heap_zalloc(heap, table->n_def);
+			char* s = static_cast<char*>(
+				mem_heap_zalloc(heap, table->n_def));
+
 			table->col_names = s;
 		}
 
@@ -264,7 +297,9 @@ dict_mem_index_create(
 	ut_ad(table_name && index_name);
 
 	heap = mem_heap_create(DICT_HEAP_SIZE);
-	index = mem_heap_zalloc(heap, sizeof(dict_index_t));
+
+	index = static_cast<dict_index_t*>(
+		mem_heap_zalloc(heap, sizeof(*index)));
 
 	dict_mem_fill_index_struct(index, heap, table_name, index_name,
 				   space, type, n_fields);
@@ -272,6 +307,7 @@ dict_mem_index_create(
 	return(index);
 }
 
+#ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Creates and initializes a foreign constraint memory object.
 @return	own: foreign constraint struct */
@@ -285,7 +321,8 @@ dict_mem_foreign_create(void)
 
 	heap = mem_heap_create(100);
 
-	foreign = mem_heap_zalloc(heap, sizeof(dict_foreign_t));
+	foreign = static_cast<dict_foreign_t*>(
+		mem_heap_zalloc(heap, sizeof(dict_foreign_t)));
 
 	foreign->heap = heap;
 
@@ -306,9 +343,13 @@ dict_mem_foreign_table_name_lookup_set(
 {
 	if (innobase_get_lower_case_table_names() == 2) {
 		if (do_alloc) {
-			foreign->foreign_table_name_lookup = mem_heap_alloc(
-				foreign->heap,
-				strlen(foreign->foreign_table_name) + 1);
+			ulint	len;
+
+			len = strlen(foreign->foreign_table_name) + 1;
+
+			foreign->foreign_table_name_lookup =
+				static_cast<char*>(
+					mem_heap_alloc(foreign->heap, len));
 		}
 		strcpy(foreign->foreign_table_name_lookup,
 		       foreign->foreign_table_name);
@@ -333,9 +374,13 @@ dict_mem_referenced_table_name_lookup_set(
 {
 	if (innobase_get_lower_case_table_names() == 2) {
 		if (do_alloc) {
-			foreign->referenced_table_name_lookup = mem_heap_alloc(
-				foreign->heap,
-				strlen(foreign->referenced_table_name) + 1);
+			ulint	len;
+
+			len = strlen(foreign->referenced_table_name) + 1;
+
+			foreign->referenced_table_name_lookup =
+				static_cast<char*>(
+					mem_heap_alloc(foreign->heap, len));
 		}
 		strcpy(foreign->referenced_table_name_lookup,
 		       foreign->referenced_table_name);
@@ -345,6 +390,7 @@ dict_mem_referenced_table_name_lookup_set(
 			= foreign->referenced_table_name;
 	}
 }
+#endif /* !UNIV_HOTBACKUP */
 
 /**********************************************************************//**
 Adds a field definition to an index. NOTE: does not take a copy
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
new file mode 100644
index 00000000000..aa587f54a0d
--- /dev/null
+++ b/storage/innobase/dict/dict0stats.cc
@@ -0,0 +1,3181 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats.cc
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+
+#include "univ.i"
+
+#include "btr0btr.h" /* btr_get_size() */
+#include "btr0cur.h" /* btr_estimate_number_of_different_key_vals() */
+#include "dict0dict.h" /* dict_table_get_first_index() */
+#include "dict0mem.h" /* DICT_TABLE_MAGIC_N */
+#include "dict0stats.h"
+#include "data0type.h" /* dtype_t */
+#include "db0err.h" /* db_err */
+#include "dyn0dyn.h" /* dyn_array* */
+#include "pars0pars.h" /* pars_info_create() */
+#include "pars0types.h" /* pars_info_t */
+#include "que0que.h" /* que_eval_sql() */
+#include "rem0cmp.h" /* REC_MAX_N_FIELDS,cmp_rec_rec_with_match() */
+#include "row0sel.h" /* sel_node_struct */
+#include "row0types.h" /* sel_node_t */
+#include "trx0trx.h" /* trx_create() */
+#include "trx0roll.h" /* trx_rollback_to_savepoint() */
+#include "ut0rnd.h" /* ut_rnd_interval() */
+
+#include "ha_prototypes.h" /* innobase_strcasecmp() */
+
+/* Sampling algorithm description @{
+
+The algorithm is controlled by one number - srv_stats_persistent_sample_pages,
+let it be A, which is the number of leaf pages to analyze for a given index
+for each n-prefix (if the index is on 3 columns, then 3*A leaf pages will be
+analyzed).
+
+Let the total number of leaf pages in the table be T.
+Level 0 - leaf pages, level H - root.
+
+Definition: N-prefix-boring record is a record on a non-leaf page that equals
+the next (to the right, cross page boundaries, skipping the supremum and
+infimum) record on the same level when looking at the fist n-prefix columns.
+The last (user) record on a level is not boring (it does not match the
+non-existent user record to the right). We call the records boring because all
+the records on the page below a boring record are equal to that boring record.
+
+We avoid diving below boring records when searching for a leaf page to
+estimate the number of distinct records because we know that such a leaf
+page will have number of distinct records == 1.
+
+For each n-prefix: start from the root level and full scan subsequent lower
+levels until a level that contains at least A*10 distinct records is found.
+Lets call this level LA.
+As an optimization the search is canceled if it has reached level 1 (never
+descend to the level 0 (leaf)) and also if the next level to be scanned
+would contain more than A pages. The latter is because the user has asked
+to analyze A leaf pages and it does not make sense to scan much more than
+A non-leaf pages with the sole purpose of finding a good sample of A leaf
+pages.
+
+After finding the appropriate level LA with >A*10 distinct records (or less in
+the exceptions described above), divide it into groups of equal records and
+pick A such groups. Then pick the last record from each group. For example,
+let the level be:
+
+index:  0,1,2,3,4,5,6,7,8,9,10
+record: 1,1,1,2,2,7,7,7,7,7,9
+
+There are 4 groups of distinct records and if A=2 random ones are selected,
+e.g. 1,1,1 and 7,7,7,7,7, then records with indexes 2 and 9 will be selected.
+
+After selecting A records as described above, dive below them to find A leaf
+pages and analyze them, finding the total number of distinct records. The
+dive to the leaf level is performed by selecting a non-boring record from
+each page and diving below it.
+
+This way, a total of A leaf pages are analyzed for the given n-prefix.
+
+Let the number of different key values found in each leaf page i be Pi (i=1..A).
+Let N_DIFF_AVG_LEAF be (P1 + P2 + ... + PA) / A.
+Let the number of different key values on level LA be N_DIFF_LA.
+Let the total number of records on level LA be TOTAL_LA.
+Let R be N_DIFF_LA / TOTAL_LA, we assume this ratio is the same on the
+leaf level.
+Let the number of leaf pages be N.
+Then the total number of different key values on the leaf level is:
+N * R * N_DIFF_AVG_LEAF.
+See REF01 for the implementation.
+
+The above describes how to calculate the cardinality of an index.
+This algorithm is executed for each n-prefix of a multi-column index
+where n=1..n_uniq.
+@} */
+
+/* names of the tables from the persistent statistics storage */
+#define TABLE_STATS_NAME	"mysql/innodb_table_stats"
+#define TABLE_STATS_NAME_PRINT	"mysql.innodb_table_stats"
+#define INDEX_STATS_NAME	"mysql/innodb_index_stats"
+#define INDEX_STATS_NAME_PRINT	"mysql.innodb_index_stats"
+
+#ifdef UNIV_STATS_DEBUG
+#define DEBUG_PRINTF(fmt, ...)	printf(fmt, ## __VA_ARGS__)
+#else /* UNIV_STATS_DEBUG */
+#define DEBUG_PRINTF(fmt, ...)	/* noop */
+#endif /* UNIV_STATS_DEBUG */
+
+/* number of distinct records on a given level that are required to stop
+descending to lower levels and fetch
+srv_stats_persistent_sample_pages records from that level */
+#define N_DIFF_REQUIRED	(srv_stats_persistent_sample_pages * 10)
+
+/** Open handles on the stats tables. Currently this is used to increase the
+reference count of the stats tables. */
+typedef struct dict_stats_struct {
+	dict_table_t*	table_stats;	/*!< Handle to open TABLE_STATS_NAME */
+	dict_table_t*	index_stats;	/*!< Handle to open INDEX_STATS_NAME */
+} dict_stats_t;
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively quick and is used to calculate transient statistics that
+are not saved on disk.
+This was the only way to calculate statistics before the
+Persistent Statistics feature was introduced.
+dict_stats_update_transient() @{ */
+static
+void
+dict_stats_update_transient(
+/*========================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	dict_index_t*	index;
+	ulint		sum_of_index_sizes	= 0;
+
+	/* Find out the sizes of the indexes and how many different values
+	for the key they approximately have */
+
+	index = dict_table_get_first_index(table);
+
+	if (index == NULL) {
+		/* Table definition is corrupt */
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: table %s has no indexes. "
+			"Cannot calculate statistics.\n", table->name);
+		return;
+	}
+
+	do {
+
+		if (index->type & DICT_FTS) {
+			index = dict_table_get_next_index(index);
+			continue;
+		}
+
+		if (UNIV_LIKELY
+		    (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE
+		     || (srv_force_recovery < SRV_FORCE_NO_LOG_REDO
+			 && dict_index_is_clust(index)))) {
+			mtr_t	mtr;
+			ulint	size;
+
+			mtr_start(&mtr);
+			mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+			size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
+
+			if (size != ULINT_UNDEFINED) {
+				index->stat_index_size = size;
+
+				size = btr_get_size(
+					index, BTR_N_LEAF_PAGES, &mtr);
+			}
+
+			mtr_commit(&mtr);
+
+			switch (size) {
+			case ULINT_UNDEFINED:
+				goto fake_statistics;
+			case 0:
+				/* The root node of the tree is a leaf */
+				size = 1;
+			}
+
+			sum_of_index_sizes += index->stat_index_size;
+
+			index->stat_n_leaf_pages = size;
+
+			btr_estimate_number_of_different_key_vals(index);
+		} else {
+			/* If we have set a high innodb_force_recovery
+			level, do not calculate statistics, as a badly
+			corrupted index can cause a crash in it.
+			Initialize some bogus index cardinality
+			statistics, so that the data can be queried in
+			various means, also via secondary indexes. */
+			ulint	i;
+
+fake_statistics:
+			sum_of_index_sizes++;
+			index->stat_index_size = index->stat_n_leaf_pages = 1;
+
+			for (i = dict_index_get_n_unique(index); i; ) {
+				index->stat_n_diff_key_vals[i--] = 1;
+			}
+
+			memset(index->stat_n_non_null_key_vals, 0,
+			       (1 + dict_index_get_n_unique(index))
+			       * sizeof(*index->stat_n_non_null_key_vals));
+		}
+
+		index = dict_table_get_next_index(index);
+	} while (index);
+
+	index = dict_table_get_first_index(table);
+
+	table->stat_n_rows = index->stat_n_diff_key_vals[
+		dict_index_get_n_unique(index)];
+
+	table->stat_clustered_index_size = index->stat_index_size;
+
+	table->stat_sum_of_other_index_sizes = sum_of_index_sizes
+		- index->stat_index_size;
+
+	table->stat_modified_counter = 0;
+
+	table->stat_initialized = TRUE;
+}
+/* @} */
+
+/*********************************************************************//**
+Checks whether the persistent statistics storage exists and that all
+tables have the proper structure.
+dict_stats_persistent_storage_check() @{
+@return TRUE if exists and all tables are ok */
+static
+ibool
+dict_stats_persistent_storage_check(
+/*================================*/
+	ibool	caller_has_dict_sys_mutex)	/*!< in: TRUE if the caller
+						owns dict_sys->mutex */
+{
+	/* definition for the table TABLE_STATS_NAME */
+	dict_col_meta_t	table_stats_columns[] = {
+		{"database_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 192 /* NAME_LEN from mysql_com.h */},
+
+		{"table_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 192 /* NAME_LEN from mysql_com.h */},
+
+		{"last_update", DATA_FIXBINARY,
+			DATA_NOT_NULL, 4},
+
+		{"n_rows", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+		{"clustered_index_size", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+		{"sum_of_other_index_sizes", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8}
+	};
+	dict_table_schema_t	table_stats_schema = {
+		TABLE_STATS_NAME,
+		UT_ARR_SIZE(table_stats_columns),
+		table_stats_columns
+	};
+
+	/* definition for the table INDEX_STATS_NAME */
+	dict_col_meta_t	index_stats_columns[] = {
+		{"database_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 192 /* NAME_LEN from mysql_com.h */},
+
+		{"table_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 192 /* NAME_LEN from mysql_com.h */},
+
+		{"index_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 192 /* NAME_LEN from mysql_com.h */},
+
+		{"last_update", DATA_FIXBINARY,
+			DATA_NOT_NULL, 4},
+
+		{"stat_name", DATA_VARMYSQL,
+			DATA_NOT_NULL, 64*3},
+
+		{"stat_value", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+		{"sample_size", DATA_INT,
+			DATA_UNSIGNED, 8},
+
+		{"stat_description", DATA_VARMYSQL,
+			DATA_NOT_NULL, 1024*3}
+	};
+	dict_table_schema_t	index_stats_schema = {
+		INDEX_STATS_NAME,
+		UT_ARR_SIZE(index_stats_columns),
+		index_stats_columns
+	};
+
+	char		errstr[512];
+	enum db_err	ret;
+
+	if (!caller_has_dict_sys_mutex) {
+		mutex_enter(&(dict_sys->mutex));
+	}
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	/* first check table_stats */
+	ret = dict_table_schema_check(&table_stats_schema, errstr,
+				      sizeof(errstr));
+	if (ret == DB_SUCCESS) {
+		/* if it is ok, then check index_stats */
+		ret = dict_table_schema_check(&index_stats_schema, errstr,
+					      sizeof(errstr));
+	}
+
+	if (!caller_has_dict_sys_mutex) {
+		mutex_exit(&(dict_sys->mutex));
+	}
+
+	if (ret != DB_SUCCESS && ret != DB_TABLE_NOT_FOUND) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: %s\n", errstr);
+	}
+	/* We return silently if some of the tables are not present because
+	this code is executed during open table. By design we check if the
+	persistent statistics storage is present and whether there are stats
+	for the table being opened and if so, then we use them, otherwise we
+	silently switch back to using the transient stats. */
+
+	return(ret == DB_SUCCESS);
+}
+/* @} */
+
+/* @{ Pseudo code about the relation between the following functions
+
+let N = srv_stats_persistent_sample_pages
+
+dict_stats_analyze_index()
+  for each n_prefix
+    search for good enough level:
+      dict_stats_analyze_index_level() // only called if level has <= N pages
+        // full scan of the level in one mtr
+        collect statistics about the given level
+      if we are not satisfied with the level, search next lower level
+    we have found a good enough level here
+    dict_stats_analyze_index_for_n_prefix(that level, stats collected above)
+      // full scan of the level in one mtr
+      dive below some records and analyze the leaf page there:
+      dict_stats_analyze_index_below_cur()
+@} */
+
+/*********************************************************************//**
+Find the total number and the number of distinct keys on a given level in
+an index. Each of the 1..n_uniq prefixes are looked up and the results are
+saved in the array n_diff[]. Notice that n_diff[] must be able to store
+n_uniq+1 numbers because the results are saved in
+n_diff[1] .. n_diff[n_uniq]. The total number of records on the level is
+saved in total_recs.
+Also, the index of the last record in each group of equal records is saved
+in n_diff_boundaries[1..n_uniq], records indexing starts from the leftmost
+record on the level and continues cross pages boundaries, counting from 0.
+dict_stats_analyze_index_level() @{ */
+static
+void
+dict_stats_analyze_index_level(
+/*===========================*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		level,		/*!< in: level */
+	ib_uint64_t*	n_diff,		/*!< out: array for number of
+					distinct keys for all prefixes */
+	ib_uint64_t*	total_recs,	/*!< out: total number of records */
+	ib_uint64_t*	total_pages,	/*!< out: total number of pages */
+	dyn_array_t*	n_diff_boundaries)/*!< out: boundaries of the groups
+					of distinct keys */
+{
+	ulint		n_uniq;
+	mem_heap_t*	heap;
+	dtuple_t*	dtuple;
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+	const page_t*	page;
+	const rec_t*	rec;
+	const rec_t*	prev_rec;
+	byte*		prev_rec_buf = NULL;
+	ulint		prev_rec_buf_size = 0;
+	ulint		i;
+
+	DEBUG_PRINTF("    %s(table=%s, index=%s, level=%lu)\n", __func__,
+		     index->table->name, index->name, level);
+
+	n_uniq = dict_index_get_n_unique(index);
+
+	/* elements in the n_diff array are 1..n_uniq (inclusive) */
+	memset(n_diff, 0x0, (n_uniq + 1) * sizeof(*n_diff));
+
+	heap = mem_heap_create(256);
+
+	/* reset the dynamic arrays n_diff_boundaries[1..n_uniq];
+	n_diff_boundaries[0] is ignored to follow the same convention
+	as n_diff[] */
+	if (n_diff_boundaries != NULL) {
+		for (i = 1; i <= n_uniq; i++) {
+			dyn_array_free(&n_diff_boundaries[i]);
+
+			dyn_array_create(&n_diff_boundaries[i]);
+		}
+	}
+
+	/* craft a record that is always smaller than the others,
+	this way we are sure that the cursor pcur will be positioned
+	on the leftmost record on the leftmost page on the desired level */
+	dtuple = dtuple_create(heap, dict_index_get_n_unique(index));
+	dict_table_copy_types(dtuple, index->table);
+	dtuple_set_info_bits(dtuple, REC_INFO_MIN_REC_FLAG);
+
+	mtr_start(&mtr);
+
+	btr_pcur_open_low(index, level, dtuple, PAGE_CUR_LE, BTR_SEARCH_LEAF,
+			  &pcur, __FILE__, __LINE__, &mtr);
+
+	page = btr_pcur_get_page(&pcur);
+
+	/* check that we are indeed on the desired level */
+	ut_a(btr_page_get_level(page, &mtr) == level);
+
+	/* there should not be any pages on the left */
+	ut_a(btr_page_get_prev(page, &mtr) == FIL_NULL);
+
+	/* check whether the first record on the leftmost page is marked
+	as such, if we are on a non-leaf level */
+	ut_a(level == 0
+	     || (REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+		     page_rec_get_next_const(page_get_infimum_rec(page)),
+		     page_is_comp(page))));
+
+	if (btr_pcur_is_before_first_on_page(&pcur)) {
+		btr_pcur_move_to_next_on_page(&pcur);
+	}
+
+	if (btr_pcur_is_after_last_on_page(&pcur)) {
+		btr_pcur_move_to_prev_on_page(&pcur);
+	}
+
+	prev_rec = NULL;
+
+	/* no records by default */
+	*total_recs = 0;
+
+	*total_pages = 0;
+
+	/* iterate over all user records on this level
+	and compare each two adjacent ones, even the last on page
+	X and the fist on page X+1 */
+	for (;
+	     btr_pcur_is_on_user_rec(&pcur);
+	     btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
+
+		ulint	matched_fields = 0;
+		ulint	matched_bytes = 0;
+		ulint	offsets_rec_onstack[REC_OFFS_NORMAL_SIZE];
+		ulint*	offsets_rec;
+
+		rec_offs_init(offsets_rec_onstack);
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		/* increment the pages counter at the end of each page */
+		if (page_rec_is_supremum(page_rec_get_next_const(rec))) {
+
+			(*total_pages)++;
+		}
+
+		/* skip delete-marked records */
+		if (rec_get_deleted_flag(rec, page_is_comp(
+				btr_pcur_get_page(&pcur)))) {
+
+			continue;
+		}
+
+		offsets_rec = rec_get_offsets(rec, index, offsets_rec_onstack,
+					      n_uniq, &heap);
+
+		(*total_recs)++;
+
+		if (prev_rec != NULL) {
+
+			ulint	offsets_prev_rec_onstack[REC_OFFS_NORMAL_SIZE];
+			ulint*	offsets_prev_rec;
+
+			rec_offs_init(offsets_prev_rec_onstack);
+
+			offsets_prev_rec = rec_get_offsets(
+				prev_rec, index, offsets_prev_rec_onstack,
+				n_uniq, &heap);
+
+			cmp_rec_rec_with_match(rec,
+					       prev_rec,
+					       offsets_rec,
+					       offsets_prev_rec,
+					       index,
+					       FALSE,
+					       &matched_fields,
+					       &matched_bytes);
+
+			for (i = matched_fields + 1; i <= n_uniq; i++) {
+
+				if (n_diff_boundaries != NULL) {
+					/* push the index of the previous
+					record, that is - the last one from
+					a group of equal keys */
+
+					void*		p;
+					ib_uint64_t	idx;
+
+					/* the index of the current record
+					is total_recs - 1, the index of the
+					previous record is total_recs - 2;
+					we know that idx is not going to
+					become negative here because if we
+					are in this branch then there is a
+					previous record and thus
+					total_recs >= 2 */
+					idx = *total_recs - 2;
+
+					p = dyn_array_push(
+						&n_diff_boundaries[i],
+						sizeof(ib_uint64_t));
+
+					memcpy(p, &idx, sizeof(ib_uint64_t));
+				}
+
+				/* increment the number of different keys
+				for n_prefix=i */
+				n_diff[i]++;
+			}
+		} else {
+			/* this is the first non-delete marked record */
+			for (i = 1; i <= n_uniq; i++) {
+				n_diff[i] = 1;
+			}
+		}
+
+		if (page_rec_is_supremum(page_rec_get_next_const(rec))) {
+			/* end of a page has been reached */
+
+			/* we need to copy the record instead of assigning
+			like prev_rec = rec; because when we traverse the
+			records on this level at some point we will jump from
+			one page to the next and then rec and prev_rec will
+			be on different pages and
+			btr_pcur_move_to_next_user_rec() will release the
+			latch on the page that prev_rec is on */
+			prev_rec = rec_copy_prefix_to_buf(
+				rec, index, rec_offs_n_fields(offsets_rec),
+				&prev_rec_buf, &prev_rec_buf_size);
+
+		} else {
+			/* still on the same page, the next call to
+			btr_pcur_move_to_next_user_rec() will not jump
+			on the next page, we can simply assign pointers
+			instead of copying the records like above */
+
+			prev_rec = rec;
+		}
+	}
+
+	/* if *total_pages is left untouched then the above loop was not
+	entered at all and there is one page in the whole tree which is
+	empty */
+	if (*total_pages == 0) {
+
+		ut_ad(level == 0);
+		ut_ad(*total_recs == 0);
+
+		*total_pages = 1;
+	}
+
+	/* if there are records on this level and boundaries
+	should be saved */
+	if (*total_recs > 0 && n_diff_boundaries != NULL) {
+
+		/* remember the index of the last record on the level as the
+		last one from the last group of equal keys; this holds for
+		all possible prefixes */
+		for (i = 1; i <= n_uniq; i++) {
+			void*		p;
+			ib_uint64_t	idx;
+
+			idx = *total_recs - 1;
+
+			p = dyn_array_push(&n_diff_boundaries[i],
+					   sizeof(ib_uint64_t));
+
+			memcpy(p, &idx, sizeof(ib_uint64_t));
+		}
+	}
+
+	/* now in n_diff_boundaries[i] there are exactly n_diff[i] integers,
+	for i=1..n_uniq */
+
+#ifdef UNIV_STATS_DEBUG
+	for (i = 1; i <= n_uniq; i++) {
+
+		DEBUG_PRINTF("    %s(): total recs: " UINT64PF
+			     ", total pages: " UINT64PF
+			     ", n_diff[%lu]: " UINT64PF "\n",
+			     __func__, *total_recs,
+			     *total_pages,
+			     i, n_diff[i]);
+
+#if 0
+		if (n_diff_boundaries != NULL) {
+			ib_uint64_t	j;
+
+			DEBUG_PRINTF("    %s(): boundaries[%lu]: ",
+				     __func__, i);
+
+			for (j = 0; j < n_diff[i]; j++) {
+				ib_uint64_t	idx;
+
+				idx = *(ib_uint64_t*) dyn_array_get_element(
+					&n_diff_boundaries[i],
+					j * sizeof(ib_uint64_t));
+
+				DEBUG_PRINTF(UINT64PF "=" UINT64PF ", ",
+					     j, idx);
+			}
+			DEBUG_PRINTF("\n");
+		}
+#endif
+	}
+#endif /* UNIV_STATS_DEBUG */
+
+	btr_pcur_close(&pcur);
+
+	mtr_commit(&mtr);
+
+	if (prev_rec_buf != NULL) {
+
+		mem_free(prev_rec_buf);
+	}
+
+	mem_heap_free(heap);
+}
+/* @} */
+
+/* aux enum for controlling the behavior of dict_stats_scan_page() @{ */
+typedef enum page_scan_method_enum {
+	COUNT_ALL_NON_BORING,	/* scan all records on the given page
+				and count the number of distinct ones */
+	QUIT_ON_FIRST_NON_BORING/* quit when the first record that differs
+				from its right neighbor is found */
+} page_scan_method_t;
+/* @} */
+
+/*********************************************************************//**
+Scan a page, reading records from left to right and counting the number
+of distinct records on that page (looking only at the first n_prefix
+columns). If scan_method is QUIT_ON_FIRST_NON_BORING then the function
+will return as soon as it finds a record that does not match its neighbor
+to the right, which means that in the case of QUIT_ON_FIRST_NON_BORING the
+returned n_diff can either be 0 (empty page), 1 (the whole page has all keys
+equal) or 2 (the function found a non-boring record and returned).
+@return offsets1 or offsets2 (the offsets of *out_rec),
+or NULL if the page is empty and does not contain user records.
+dict_stats_scan_page() @{ */
+UNIV_INLINE __attribute__((nonnull))
+ulint*
+dict_stats_scan_page(
+/*=================*/
+	const rec_t**		out_rec,	/*!< out: record, or NULL */
+	ulint*			offsets1,	/*!< out: rec_get_offsets()
+						working space (must be big
+						enough) */
+	ulint*			offsets2,	/*!< out: rec_get_offsets()
+						working space (must be big
+						enough) */
+	dict_index_t*		index,		/*!< in: index of the page */
+	const page_t*		page,		/*!< in: the page to scan */
+	ulint			n_prefix,	/*!< in: look at the first
+						n_prefix columns */
+	page_scan_method_t	scan_method,	/*!< in: scan to the end of
+						the page or not */
+	ib_uint64_t*		n_diff)		/*!< out: number of distinct
+						records encountered */
+{
+	ulint*		offsets_rec		= offsets1;
+	ulint*		offsets_next_rec	= offsets2;
+	const rec_t*	rec;
+	const rec_t*	next_rec;
+	/* A dummy heap, to be passed to rec_get_offsets().
+	Because offsets1,offsets2 should be big enough,
+	this memory heap should never be used. */
+	mem_heap_t*	heap			= NULL;
+
+	rec = page_rec_get_next_const(page_get_infimum_rec(page));
+
+	if (page_rec_is_supremum(rec)) {
+		/* the page is empty */
+		*n_diff = 0;
+		*out_rec = NULL;
+		return(NULL);
+	}
+
+	offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+				      ULINT_UNDEFINED, &heap);
+
+	next_rec = page_rec_get_next_const(rec);
+
+	*n_diff = 1;
+
+	while (!page_rec_is_supremum(next_rec)) {
+
+		ulint	matched_fields = 0;
+		ulint	matched_bytes = 0;
+
+		offsets_next_rec = rec_get_offsets(next_rec, index,
+						   offsets_next_rec,
+						   ULINT_UNDEFINED,
+						   &heap);
+
+		/* check whether rec != next_rec when looking at
+		the first n_prefix fields */
+		cmp_rec_rec_with_match(rec, next_rec,
+				       offsets_rec, offsets_next_rec,
+				       index, FALSE, &matched_fields,
+				       &matched_bytes);
+
+		if (matched_fields < n_prefix) {
+			/* rec != next_rec, => rec is non-boring */
+
+			(*n_diff)++;
+
+			if (scan_method == QUIT_ON_FIRST_NON_BORING) {
+				goto func_exit;
+			}
+		}
+
+		rec = next_rec;
+		{
+			/* Assign offsets_rec = offsets_next_rec
+			so that offsets_rec matches with rec which
+			was just assigned rec = next_rec above.
+			Also need to point offsets_next_rec to the
+			place where offsets_rec was pointing before
+			because we have just 2 placeholders where
+			data is actually stored:
+			offsets_onstack1 and offsets_onstack2 and we
+			are using them in circular fashion
+			(offsets[_next]_rec are just pointers to
+			those placeholders). */
+			ulint*	offsets_tmp;
+			offsets_tmp = offsets_rec;
+			offsets_rec = offsets_next_rec;
+			offsets_next_rec = offsets_tmp;
+		}
+		next_rec = page_rec_get_next_const(next_rec);
+	}
+
+func_exit:
+	/* offsets1,offsets2 should have been big enough */
+	ut_a(heap == NULL);
+	*out_rec = rec;
+	return(offsets_rec);
+}
+/* @} */
+
+/*********************************************************************//**
+Dive below the current position of a cursor and calculate the number of
+distinct records on the leaf page, when looking at the fist n_prefix
+columns.
+dict_stats_analyze_index_below_cur() @{
+@return number of distinct records on the leaf page */
+static
+ib_uint64_t
+dict_stats_analyze_index_below_cur(
+/*===============================*/
+	const btr_cur_t*cur,		/*!< in: cursor */
+	ulint		n_prefix,	/*!< in: look at the first n_prefix
+					columns when comparing records */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+{
+	dict_index_t*	index;
+	ulint		space;
+	ulint		zip_size;
+	buf_block_t*	block;
+	ulint		page_no;
+	const page_t*	page;
+	mem_heap_t*	heap;
+	const rec_t*	rec;
+	ulint*		offsets1;
+	ulint*		offsets2;
+	ulint*		offsets_rec;
+	ulint		root_height;
+	ib_uint64_t	n_diff; /* the result */
+	ulint		size;
+
+	index = btr_cur_get_index(cur);
+
+	/* Allocate offsets for the record and the node pointer, for
+	node pointer records. In a secondary index, the node pointer
+	record will consist of all index fields followed by a child
+	page number.
+	Allocate space for the offsets header (the allocation size at
+	offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1,
+	so that this will never be less than the size calculated in
+	rec_get_offsets_func(). */
+	size = (1 + REC_OFFS_HEADER_SIZE) + 1 + dict_index_get_n_fields(index);
+
+	heap = mem_heap_create(size * (sizeof *offsets1 + sizeof *offsets2));
+
+	offsets1 = static_cast<ulint*>(mem_heap_alloc(
+			heap, size * sizeof *offsets1));
+
+	offsets2 = static_cast<ulint*>(mem_heap_alloc(
+			heap, size * sizeof *offsets2));
+
+	rec_offs_set_n_alloc(offsets1, size);
+	rec_offs_set_n_alloc(offsets2, size);
+
+	root_height = btr_page_get_level(btr_root_get(index, mtr), mtr);
+
+	space = dict_index_get_space(index);
+	zip_size = dict_table_zip_size(index->table);
+
+	rec = btr_cur_get_rec(cur);
+
+	offsets_rec = rec_get_offsets(rec, index, offsets1,
+				      ULINT_UNDEFINED, &heap);
+
+	page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec);
+
+	/* descend to the leaf level on the B-tree */
+	for (;;) {
+
+		block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH,
+					 NULL /* no guessed block */,
+					 BUF_GET, __FILE__, __LINE__, mtr);
+
+		page = buf_block_get_frame(block);
+
+		if (btr_page_get_level(page, mtr) == 0) {
+			/* leaf level */
+			break;
+		}
+		/* else */
+
+		/* search for the first non-boring record on the page */
+		offsets_rec = dict_stats_scan_page(
+			&rec, offsets1, offsets2, index, page, n_prefix,
+			QUIT_ON_FIRST_NON_BORING, &n_diff);
+
+		/* pages on level > 0 are not allowed to be empty */
+		ut_a(offsets_rec != NULL);
+		/* if page is not empty (offsets_rec != NULL) then n_diff must
+		be > 0, otherwise there is a bug in dict_stats_scan_page() */
+		ut_a(n_diff > 0);
+
+		if (n_diff == 1) {
+			/* page has all keys equal and the end of the page
+			was reached by dict_stats_scan_page(), no need to
+			descend to the leaf level */
+			mem_heap_free(heap);
+			return(1);
+		}
+		/* else */
+
+		/* when we instruct dict_stats_scan_page() to quit on the
+		first non-boring record it finds, then the returned n_diff
+		can either be 0 (empty page), 1 (page has all keys equal) or
+		2 (non-boring record was found) */
+		ut_a(n_diff == 2);
+
+		/* we have a non-boring record in rec, descend below it */
+
+		page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec);
+	}
+
+	/* make sure we got a leaf page as a result from the above loop */
+	ut_ad(btr_page_get_level(page, mtr) == 0);
+
+	/* scan the leaf page and find the number of distinct keys,
+	when looking only at the first n_prefix columns */
+
+	offsets_rec = dict_stats_scan_page(
+		&rec, offsets1, offsets2, index, page, n_prefix,
+		COUNT_ALL_NON_BORING, &n_diff);
+
+	if (root_height > 0) {
+
+		/* empty pages are allowed only if the whole B-tree is empty
+		and contains a single empty page */
+		ut_a(offsets_rec != NULL);
+	}
+
+#if 0
+	DEBUG_PRINTF("      %s(): n_diff below page_no=%lu: " UINT64PF "\n",
+		     __func__, page_no, n_diff);
+#endif
+
+	mem_heap_free(heap);
+
+	return(n_diff);
+}
+/* @} */
+
+/*********************************************************************//**
+For a given level in an index select srv_stats_persistent_sample_pages
+(or less) records from that level and dive below them to the corresponding
+leaf pages, then scan those leaf pages and save the sampling results in
+index->stat_n_diff_key_vals[n_prefix] and the number of pages scanned in
+index->stat_n_sample_sizes[n_prefix].
+dict_stats_analyze_index_for_n_prefix() @{ */
+static
+void
+dict_stats_analyze_index_for_n_prefix(
+/*==================================*/
+	dict_index_t*	index,			/*!< in/out: index */
+	ulint		level,			/*!< in: level,
+						must be >= 1 */
+	ib_uint64_t	total_recs_on_level,	/*!< in: total number of
+						records on the given level */
+	ulint		n_prefix,		/*!< in: look at first
+						n_prefix columns when
+						comparing records */
+	ib_uint64_t	n_diff_for_this_prefix,	/*!< in: number of distinct
+						records on the given level,
+						when looking at the first
+						n_prefix columns */
+	dyn_array_t*	boundaries)		/*!< in: array that contains
+						n_diff_for_this_prefix
+						integers each of which
+						represents the index (on the
+						level, counting from
+						left/smallest to right/biggest
+						from 0) of the last record
+						from each group of distinct
+						keys */
+{
+	mem_heap_t*	heap;
+	dtuple_t*	dtuple;
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+	const page_t*	page;
+	ib_uint64_t	rec_idx;
+	ib_uint64_t	last_idx_on_level;
+	ib_uint64_t	n_recs_to_dive_below;
+	ib_uint64_t	n_diff_sum_of_all_analyzed_pages;
+	ib_uint64_t	i;
+
+#if 0
+	DEBUG_PRINTF("    %s(table=%s, index=%s, level=%lu, n_prefix=%lu, "
+		     "n_diff_for_this_prefix=" UINT64PF ")\n",
+		     __func__, index->table->name, index->name, level,
+		     n_prefix, n_diff_for_this_prefix);
+#endif
+
+	/* if some of those is 0 then this means that there is exactly one
+	page in the B-tree and it is empty and we should have done full scan
+	and should not be here */
+	ut_ad(total_recs_on_level > 0);
+	ut_ad(n_diff_for_this_prefix > 0);
+
+	/* this is configured to be min 1, someone has changed the code */
+	ut_ad(srv_stats_persistent_sample_pages > 0);
+
+	heap = mem_heap_create(256);
+
+	/* craft a record that is always smaller than the others,
+	this way we are sure that the cursor pcur will be positioned
+	on the leftmost record on the leftmost page on the desired level */
+	dtuple = dtuple_create(heap, dict_index_get_n_unique(index));
+	dict_table_copy_types(dtuple, index->table);
+	dtuple_set_info_bits(dtuple, REC_INFO_MIN_REC_FLAG);
+
+	mtr_start(&mtr);
+
+	btr_pcur_open_low(index, level, dtuple, PAGE_CUR_LE, BTR_SEARCH_LEAF,
+			  &pcur, __FILE__, __LINE__, &mtr);
+
+	page = btr_pcur_get_page(&pcur);
+
+	/* check that we are indeed on the desired level */
+	ut_a(btr_page_get_level(page, &mtr) == level);
+
+	/* there should not be any pages on the left */
+	ut_a(btr_page_get_prev(page, &mtr) == FIL_NULL);
+
+	/* check whether the first record on the leftmost page is marked
+	as such, if we are on a non-leaf level */
+	ut_a(level == 0 || REC_INFO_MIN_REC_FLAG
+	     & rec_get_info_bits(page_rec_get_next_const(
+					 page_get_infimum_rec(page)),
+				 page_is_comp(page)));
+
+	if (btr_pcur_is_before_first_on_page(&pcur)) {
+		btr_pcur_move_to_next_on_page(&pcur);
+	}
+
+	if (btr_pcur_is_after_last_on_page(&pcur)) {
+		btr_pcur_move_to_prev_on_page(&pcur);
+	}
+
+	last_idx_on_level = *(ib_uint64_t*) dyn_array_get_element(boundaries,
+		(ulint) ((n_diff_for_this_prefix - 1) * sizeof(ib_uint64_t)));
+
+	rec_idx = 0;
+
+	n_diff_sum_of_all_analyzed_pages = 0;
+
+	n_recs_to_dive_below = ut_min(srv_stats_persistent_sample_pages,
+				      n_diff_for_this_prefix);
+
+	for (i = 0; i < n_recs_to_dive_below; i++) {
+		ib_uint64_t	left;
+		ib_uint64_t	right;
+		ulint		rnd;
+		ib_uint64_t	dive_below_idx;
+
+		/* there are n_diff_for_this_prefix elements
+		in the array boundaries[] and we divide those elements
+		into n_recs_to_dive_below segments, for example:
+
+		let n_diff_for_this_prefix=100, n_recs_to_dive_below=4, then:
+		segment i=0:  [0, 24]
+		segment i=1: [25, 49]
+		segment i=2: [50, 74]
+		segment i=3: [75, 99] or
+
+		let n_diff_for_this_prefix=1, n_recs_to_dive_below=1, then:
+		segment i=0: [0, 0] or
+
+		let n_diff_for_this_prefix=2, n_recs_to_dive_below=2, then:
+		segment i=0: [0, 0]
+		segment i=1: [1, 1] or
+
+		let n_diff_for_this_prefix=13, n_recs_to_dive_below=7, then:
+		segment i=0:  [0,  0]
+		segment i=1:  [1,  2]
+		segment i=2:  [3,  4]
+		segment i=3:  [5,  6]
+		segment i=4:  [7,  8]
+		segment i=5:  [9, 10]
+		segment i=6: [11, 12]
+
+		then we select a random record from each segment and dive
+		below it */
+		left = n_diff_for_this_prefix * i / n_recs_to_dive_below;
+		right = n_diff_for_this_prefix * (i + 1)
+			/ n_recs_to_dive_below - 1;
+
+		ut_a(left <= right);
+		ut_a(right <= last_idx_on_level);
+
+		/* we do not pass (left, right) because we do not want to ask
+		ut_rnd_interval() to work with too big numbers since
+		ib_uint64_t could be bigger than ulint */
+		rnd = ut_rnd_interval(0, (ulint) (right - left));
+
+		dive_below_idx = *(ib_uint64_t*) dyn_array_get_element(
+			boundaries, (ulint) ((left + rnd)
+					     * sizeof(ib_uint64_t)));
+
+#if 0
+		DEBUG_PRINTF("    %s(): dive below record with index="
+			     UINT64PF "\n", __func__, dive_below_idx);
+#endif
+
+		/* seek to the record with index dive_below_idx */
+		while (rec_idx < dive_below_idx
+		       && btr_pcur_is_on_user_rec(&pcur)) {
+
+			btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+			rec_idx++;
+		}
+
+		/* if the level has finished before the record we are
+		searching for, this means that the B-tree has changed in
+		the meantime, quit our sampling and use whatever stats
+		we have collected so far */
+		if (rec_idx < dive_below_idx) {
+
+			ut_ad(!btr_pcur_is_on_user_rec(&pcur));
+			break;
+		}
+
+		ut_a(rec_idx == dive_below_idx);
+
+		ib_uint64_t	n_diff_on_leaf_page;
+
+		n_diff_on_leaf_page = dict_stats_analyze_index_below_cur(
+			btr_pcur_get_btr_cur(&pcur), n_prefix, &mtr);
+
+		/* We adjust n_diff_on_leaf_page here to avoid counting
+		one record twice - once as the last on some page and once
+		as the first on another page. Consider the following example:
+		Leaf level:
+		page: (2,2,2,2,3,3)
+		... many pages like (3,3,3,3,3,3) ...
+		page: (3,3,3,3,5,5)
+		... many pages like (5,5,5,5,5,5) ...
+		page: (5,5,5,5,8,8)
+		page: (8,8,8,8,9,9)
+		our algo would (correctly) get an estimate that there are
+		2 distinct records per page (average). Having 4 pages below
+		non-boring records, it would (wrongly) estimate the number
+		of distinct records to 8. */
+		if (n_diff_on_leaf_page > 0) {
+			n_diff_on_leaf_page--;
+		}
+
+		n_diff_sum_of_all_analyzed_pages += n_diff_on_leaf_page;
+	}
+
+	if (n_diff_sum_of_all_analyzed_pages == 0) {
+		n_diff_sum_of_all_analyzed_pages = 1;
+	}
+
+	/* See REF01 for an explanation of the algorithm */
+	index->stat_n_diff_key_vals[n_prefix]
+		= index->stat_n_leaf_pages
+
+		* n_diff_for_this_prefix
+		/ total_recs_on_level
+
+		* n_diff_sum_of_all_analyzed_pages
+		/ n_recs_to_dive_below;
+
+	index->stat_n_sample_sizes[n_prefix] = n_recs_to_dive_below;
+
+	DEBUG_PRINTF("    %s(): n_diff=" UINT64PF " for n_prefix=%lu "
+		     "(%lu"
+		     " * " UINT64PF " / " UINT64PF
+		     " * " UINT64PF " / " UINT64PF ")\n",
+		     __func__, index->stat_n_diff_key_vals[n_prefix],
+		     n_prefix,
+		     index->stat_n_leaf_pages,
+		     n_diff_for_this_prefix, total_recs_on_level,
+		     n_diff_sum_of_all_analyzed_pages, n_recs_to_dive_below);
+
+	btr_pcur_close(&pcur);
+
+	mtr_commit(&mtr);
+
+	mem_heap_free(heap);
+}
+/* @} */
+
+/*********************************************************************//**
+Calculates new statistics for a given index and saves them to the index
+members stat_n_diff_key_vals[], stat_n_sample_sizes[], stat_index_size and
+stat_n_leaf_pages. This function could be slow.
+dict_stats_analyze_index() @{ */
+static
+void
+dict_stats_analyze_index(
+/*=====================*/
+	dict_index_t*	index)	/*!< in/out: index to analyze */
+{
+	ulint		root_level;
+	ulint		level;
+	ibool		level_is_analyzed;
+	ulint		n_uniq;
+	ulint		n_prefix;
+	ib_uint64_t*	n_diff_on_level;
+	ib_uint64_t	total_recs;
+	ib_uint64_t	total_pages;
+	dyn_array_t*	n_diff_boundaries;
+	mtr_t		mtr;
+	ulint		size;
+	ulint		i;
+
+	DEBUG_PRINTF("  %s(index=%s)\n", __func__, index->name);
+
+	mtr_start(&mtr);
+
+	mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+	size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
+
+	if (size != ULINT_UNDEFINED) {
+		index->stat_index_size = size;
+		size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
+	}
+
+	switch (size) {
+	case ULINT_UNDEFINED:
+		mtr_commit(&mtr);
+		/* Fake some statistics. */
+		index->stat_index_size = index->stat_n_leaf_pages = 1;
+
+		for (i = dict_index_get_n_unique(index); i; ) {
+			index->stat_n_diff_key_vals[i--] = 1;
+		}
+
+		memset(index->stat_n_non_null_key_vals, 0,
+		       (1 + dict_index_get_n_unique(index))
+		       * sizeof(*index->stat_n_non_null_key_vals));
+		return;
+	case 0:
+		/* The root node of the tree is a leaf */
+		size = 1;
+	}
+
+	index->stat_n_leaf_pages = size;
+
+	root_level = btr_page_get_level(btr_root_get(index, &mtr), &mtr);
+
+	mtr_commit(&mtr);
+
+	n_uniq = dict_index_get_n_unique(index);
+
+	/* if the tree has just one level (and one page) or if the user
+	has requested to sample too many pages then do full scan */
+	if (root_level == 0
+	    /* for each n-column prefix (for n=1..n_uniq)
+	    srv_stats_persistent_sample_pages will be sampled, so in total
+	    srv_stats_persistent_sample_pages * n_uniq leaf pages will be
+	    sampled. If that number is bigger than the total number of leaf
+	    pages then do full scan of the leaf level instead since it will
+	    be faster and will give better results. */
+	    || srv_stats_persistent_sample_pages * n_uniq
+	       > index->stat_n_leaf_pages) {
+
+		if (root_level == 0) {
+			DEBUG_PRINTF("  %s(): just one page, "
+				     "doing full scan\n", __func__);
+		} else {
+			DEBUG_PRINTF("  %s(): too many pages requested for "
+				     "sampling, doing full scan\n", __func__);
+		}
+
+		/* do full scan of level 0; save results directly
+		into the index */
+
+		dict_stats_analyze_index_level(index,
+					       0 /* leaf level */,
+					       index->stat_n_diff_key_vals,
+					       &total_recs,
+					       &total_pages,
+					       NULL /*boundaries not needed*/);
+
+		for (i = 1; i <= n_uniq; i++) {
+			index->stat_n_sample_sizes[i] = total_pages;
+		}
+
+		return;
+	}
+	/* else */
+
+	/* set to zero */
+	n_diff_on_level = (ib_uint64_t*) mem_zalloc((n_uniq + 1)
+						    * sizeof(ib_uint64_t));
+
+	n_diff_boundaries = (dyn_array_t*) mem_alloc((n_uniq + 1)
+						     * sizeof(dyn_array_t));
+
+	for (i = 1; i <= n_uniq; i++) {
+		/* initialize the dynamic arrays, the first one
+		(index=0) is ignored to follow the same indexing
+		scheme as n_diff_on_level[] */
+		dyn_array_create(&n_diff_boundaries[i]);
+	}
+
+	/* total_recs is also used to estimate the number of pages on one
+	level below, so at the start we have 1 page (the root) */
+	total_recs = 1;
+
+	/* Here we use the following optimization:
+	If we find that level L is the first one (searching from the
+	root) that contains at least D distinct keys when looking at
+	the first n_prefix columns, then:
+	if we look at the first n_prefix-1 columns then the first
+	level that contains D distinct keys will be either L or a
+	lower one.
+	So if we find that the first level containing D distinct
+	keys (on n_prefix columns) is L, we continue from L when
+	searching for D distinct keys on n_prefix-1 columns. */
+	level = (long) root_level;
+	level_is_analyzed = FALSE;
+	for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) {
+
+		DEBUG_PRINTF("  %s(): searching level with >=%llu "
+			     "distinct records, n_prefix=%lu\n",
+			     __func__, N_DIFF_REQUIRED, n_prefix);
+
+		/* check whether we should pick the current level;
+		we pick level 1 even if it does not have enough
+		distinct records because we do not want to scan the
+		leaf level because it may contain too many records */
+		if (level_is_analyzed
+		    && (n_diff_on_level[n_prefix] >= N_DIFF_REQUIRED
+			|| level == 1)) {
+
+			goto found_level;
+		}
+		/* else */
+
+		/* search for a level that contains enough distinct records */
+
+		if (level_is_analyzed && level > 1) {
+
+			/* if this does not hold we should be on
+			"found_level" instead of here */
+			ut_ad(n_diff_on_level[n_prefix] < N_DIFF_REQUIRED);
+
+			level--;
+			level_is_analyzed = FALSE;
+		}
+
+		for (;;) {
+
+			/* make sure we do not scan the leaf level
+			accidentally, it may contain too many pages */
+			ut_ad(level > 0);
+
+			/* scanning the same level twice is an optimization
+			bug */
+			ut_ad(!level_is_analyzed);
+
+			/* Do not scan if this would read too many pages.
+			Here we use the following fact:
+			the number of pages on level L equals the number
+			of records on level L+1, thus we deduce that the
+			following call would scan total_recs pages, because
+			total_recs is left from the previous iteration when
+			we scanned one level upper or we have not scanned any
+			levels yet in which case total_recs is 1. */
+			if (total_recs > srv_stats_persistent_sample_pages) {
+
+				/* if the above cond is true then we are not
+				at the root level since on the root level
+				total_recs == 1 and cannot
+				be > srv_stats_persistent_sample_pages */
+				ut_a(level != root_level);
+
+				/* step one level back and be satisfied with
+				whatever it contains */
+				level++;
+				level_is_analyzed = TRUE;
+
+				break;
+			}
+
+			dict_stats_analyze_index_level(index,
+						       level,
+						       n_diff_on_level,
+						       &total_recs,
+						       &total_pages,
+						       n_diff_boundaries);
+
+			level_is_analyzed = TRUE;
+
+			if (n_diff_on_level[n_prefix] >= N_DIFF_REQUIRED
+			    || level == 1) {
+				/* we found a good level with many distinct
+				records or we have reached the last level we
+				could scan */
+				break;
+			}
+			/* else */
+
+			level--;
+			level_is_analyzed = FALSE;
+		}
+found_level:
+
+		DEBUG_PRINTF("  %s(): found level %lu that has " UINT64PF
+			     " distinct records for n_prefix=%lu\n",
+			     __func__, level, n_diff_on_level[n_prefix],
+			     n_prefix);
+
+		/* here we are either on level 1 or the level that we are on
+		contains >= N_DIFF_REQUIRED distinct keys or we did not scan
+		deeper levels because they would contain too many pages */
+
+		ut_ad(level > 0);
+
+		ut_ad(level_is_analyzed);
+
+		/* pick some records from this level and dive below them for
+		the given n_prefix */
+
+		dict_stats_analyze_index_for_n_prefix(
+			index, level, total_recs, n_prefix,
+			n_diff_on_level[n_prefix],
+			&n_diff_boundaries[n_prefix]);
+	}
+
+	for (i = 1; i <= n_uniq; i++) {
+		dyn_array_free(&n_diff_boundaries[i]);
+	}
+
+	mem_free(n_diff_boundaries);
+
+	mem_free(n_diff_on_level);
+}
+/* @} */
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively slow and is used to calculate persistent statistics that
+will be saved on disk.
+dict_stats_update_persistent() @{
+@return DB_SUCCESS or error code */
+static
+enum db_err
+dict_stats_update_persistent(
+/*=========================*/
+	dict_table_t*	table)		/*!< in/out: table */
+{
+	dict_index_t*	index;
+
+	DEBUG_PRINTF("%s(table=%s)\n", __func__, table->name);
+
+	/* XXX quit if interrupted, e.g. SIGTERM */
+
+	/* analyze the clustered index first */
+
+	index = dict_table_get_first_index(table);
+
+	if (index == NULL) {
+		/* Table definition is corrupt */
+		return(DB_CORRUPTION);
+	}
+
+	dict_stats_analyze_index(index);
+
+	table->stat_n_rows
+		= index->stat_n_diff_key_vals[dict_index_get_n_unique(index)];
+
+	table->stat_clustered_index_size = index->stat_index_size;
+
+	/* analyze other indexes from the table, if any */
+
+	table->stat_sum_of_other_index_sizes = 0;
+
+	for (index = dict_table_get_next_index(index);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (index->type & DICT_FTS) {
+			continue;
+		}
+
+		dict_stats_analyze_index(index);
+
+		table->stat_sum_of_other_index_sizes
+			+= index->stat_index_size;
+	}
+
+	table->stat_modified_counter = 0;
+
+	table->stat_initialized = TRUE;
+
+	return(DB_SUCCESS);
+}
+/* @} */
+
+/*********************************************************************//**
+Save an individual index's statistic into the persistent statistics
+storage.
+dict_stats_save_index_stat() @{
+@return DB_SUCCESS or error code */
+static
+enum db_err
+dict_stats_save_index_stat(
+/*=======================*/
+	dict_index_t*	index,		/*!< in: index */
+	lint		last_update,	/*!< in: timestamp of the stat */
+	const char*	stat_name,	/*!< in: name of the stat */
+	ib_uint64_t	stat_value,	/*!< in: value of the stat */
+	ib_uint64_t*	sample_size,	/*!< in: n pages sampled or NULL */
+	const char*	stat_description,/*!< in: description of the stat */
+	trx_t*		trx,		/*!< in/out: transaction to use */
+	ibool		caller_has_dict_sys_mutex)/*!< in: TRUE if the caller
+					owns dict_sys->mutex */
+{
+	pars_info_t*	pinfo;
+	enum db_err	ret;
+
+	pinfo = pars_info_create();
+
+	pars_info_add_literal(pinfo, "database_name", index->table->name,
+			      dict_get_db_name_len(index->table->name),
+			      DATA_VARCHAR, 0);
+
+	pars_info_add_str_literal(pinfo, "table_name",
+				  dict_remove_db_name(index->table->name));
+
+	pars_info_add_str_literal(pinfo, "index_name", index->name);
+
+	pars_info_add_int4_literal(pinfo, "last_update", last_update);
+
+	pars_info_add_str_literal(pinfo, "stat_name", stat_name);
+
+	pars_info_add_ull_literal(pinfo, "stat_value", stat_value);
+
+	if (sample_size != NULL) {
+		pars_info_add_ull_literal(pinfo, "sample_size", *sample_size);
+	} else {
+		pars_info_add_literal(pinfo, "sample_size", NULL,
+				      UNIV_SQL_NULL, DATA_FIXBINARY, 0);
+	}
+
+	pars_info_add_str_literal(pinfo, "stat_description",
+				  stat_description);
+
+	ret = que_eval_sql(pinfo,
+			   "PROCEDURE INDEX_STATS_SAVE () IS\n"
+			   "dummy CHAR;\n"
+			   "BEGIN\n"
+
+			   "SELECT database_name INTO dummy\n"
+			   "FROM \"" INDEX_STATS_NAME "\"\n"
+			   "WHERE\n"
+			   "database_name = :database_name AND\n"
+			   "table_name = :table_name AND\n"
+			   "index_name = :index_name AND\n"
+			   "stat_name = :stat_name\n"
+			   "FOR UPDATE;\n"
+
+			   "IF (SQL % NOTFOUND) THEN\n"
+			   "  INSERT INTO \"" INDEX_STATS_NAME "\"\n"
+			   "  VALUES\n"
+			   "  (\n"
+			   "  :database_name,\n"
+			   "  :table_name,\n"
+			   "  :index_name,\n"
+			   "  :last_update,\n"
+			   "  :stat_name,\n"
+			   "  :stat_value,\n"
+			   "  :sample_size,\n"
+			   "  :stat_description\n"
+			   "  );\n"
+			   "ELSE\n"
+			   "  UPDATE \"" INDEX_STATS_NAME "\" SET\n"
+			   "  last_update = :last_update,\n"
+			   "  stat_value = :stat_value,\n"
+			   "  sample_size = :sample_size,\n"
+			   "  stat_description = :stat_description\n"
+			   "  WHERE\n"
+			   "  database_name = :database_name AND\n"
+			   "  table_name = :table_name AND\n"
+			   "  index_name = :index_name AND\n"
+			   "  stat_name = :stat_name;\n"
+			   "END IF;\n"
+			   "END;",
+		!caller_has_dict_sys_mutex, trx);
+
+	/* pinfo is freed by que_eval_sql() */
+
+	if (ret != DB_SUCCESS) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error while trying to save index "
+			"statistics for table %s, index %s, "
+			"stat name %s: %s\n",
+			index->table->name, index->name,
+			stat_name, ut_strerr(ret));
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	return(ret);
+}
+/* @} */
+
+/*********************************************************************//**
+Save the table's statistics into the persistent statistics storage.
+dict_stats_save() @{
+@return DB_SUCCESS or error code */
+static
+enum db_err
+dict_stats_save(
+/*============*/
+	dict_table_t*	table,		/*!< in: table */
+	ibool		caller_has_dict_sys_mutex)/*!< in: TRUE if the caller
+					owns dict_sys->mutex */
+{
+	trx_t*		trx;
+	pars_info_t*	pinfo;
+	dict_index_t*	index;
+	lint		now;
+	enum db_err	ret;
+
+	/* MySQL's timestamp is 4 byte, so we use
+	pars_info_add_int4_literal() which takes a lint arg, so "now" is
+	lint */
+	now = (lint) ut_time();
+
+	trx = trx_allocate_for_background();
+
+	/* Use 'read-uncommitted' so that the SELECTs we execute
+	do not get blocked in case some user has locked the rows we
+	are SELECTing */
+
+	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+
+	trx_start_if_not_started(trx);
+
+	pinfo = pars_info_create();
+
+	pars_info_add_literal(pinfo, "database_name", table->name,
+			      dict_get_db_name_len(table->name),
+			      DATA_VARCHAR, 0);
+
+	pars_info_add_str_literal(pinfo, "table_name",
+				  dict_remove_db_name(table->name));
+
+	pars_info_add_int4_literal(pinfo, "last_update", now);
+
+	pars_info_add_ull_literal(pinfo, "n_rows", table->stat_n_rows);
+
+	pars_info_add_ull_literal(pinfo, "clustered_index_size",
+				     table->stat_clustered_index_size);
+
+	pars_info_add_ull_literal(pinfo, "sum_of_other_index_sizes",
+				     table->stat_sum_of_other_index_sizes);
+
+	ret = que_eval_sql(pinfo,
+			   "PROCEDURE TABLE_STATS_SAVE () IS\n"
+			   "dummy CHAR;\n"
+			   "BEGIN\n"
+
+			   "SELECT database_name INTO dummy\n"
+			   "FROM \"" TABLE_STATS_NAME "\"\n"
+			   "WHERE\n"
+			   "database_name = :database_name AND\n"
+			   "table_name = :table_name\n"
+			   "FOR UPDATE;\n"
+
+			   "IF (SQL % NOTFOUND) THEN\n"
+			   "  INSERT INTO \"" TABLE_STATS_NAME "\"\n"
+			   "  VALUES\n"
+			   "  (\n"
+			   "  :database_name,\n"
+			   "  :table_name,\n"
+			   "  :last_update,\n"
+			   "  :n_rows,\n"
+			   "  :clustered_index_size,\n"
+			   "  :sum_of_other_index_sizes\n"
+			   "  );\n"
+			   "ELSE\n"
+			   "  UPDATE \"" TABLE_STATS_NAME "\" SET\n"
+			   "  last_update = :last_update,\n"
+			   "  n_rows = :n_rows,\n"
+			   "  clustered_index_size = :clustered_index_size,\n"
+			   "  sum_of_other_index_sizes = "
+			   "    :sum_of_other_index_sizes\n"
+			   "  WHERE\n"
+			   "  database_name = :database_name AND\n"
+			   "  table_name = :table_name;\n"
+			   "END IF;\n"
+			   "END;",
+			   !caller_has_dict_sys_mutex, trx);
+
+	/* pinfo is freed by que_eval_sql() */
+
+	if (ret != DB_SUCCESS) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error while trying to save table "
+			"statistics for table %s: %s\n",
+			table->name, ut_strerr(ret));
+
+		goto end_rollback;
+	}
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		ib_uint64_t	stat_n_diff_key_vals[REC_MAX_N_FIELDS];
+		ib_uint64_t	stat_n_sample_sizes[REC_MAX_N_FIELDS];
+		ulint		n_uniq;
+		ulint		i;
+
+		ret = dict_stats_save_index_stat(index, now, "size",
+						 index->stat_index_size,
+						 NULL,
+						 "Number of pages "
+						 "in the index",
+						 trx,
+						 caller_has_dict_sys_mutex);
+		if (ret != DB_SUCCESS) {
+			goto end_rollback;
+		}
+
+		ret = dict_stats_save_index_stat(index, now, "n_leaf_pages",
+						 index->stat_n_leaf_pages,
+						 NULL,
+						 "Number of leaf pages "
+						 "in the index",
+						 trx,
+						 caller_has_dict_sys_mutex);
+		if (ret != DB_SUCCESS) {
+			goto end_rollback;
+		}
+
+		n_uniq = dict_index_get_n_unique(index);
+
+		ut_ad(n_uniq + 1 <= UT_ARR_SIZE(stat_n_diff_key_vals));
+
+		memcpy(stat_n_diff_key_vals, index->stat_n_diff_key_vals,
+		       (n_uniq + 1) * sizeof(index->stat_n_diff_key_vals[0]));
+
+		ut_ad(n_uniq + 1 <= UT_ARR_SIZE(stat_n_sample_sizes));
+
+		memcpy(stat_n_sample_sizes, index->stat_n_sample_sizes,
+		       (n_uniq + 1) * sizeof(index->stat_n_sample_sizes[0]));
+
+		for (i = 1; i <= n_uniq; i++) {
+
+			char	stat_name[16];
+			char	stat_description[1024];
+			ulint	j;
+
+			ut_snprintf(stat_name, sizeof(stat_name),
+				    "n_diff_pfx%02lu", i);
+
+			/* craft a string that contains the columns names */
+			ut_snprintf(stat_description,
+				    sizeof(stat_description),
+				    "%s", index->fields[0].name);
+			for (j = 2; j <= i; j++) {
+				size_t	len;
+
+				len = strlen(stat_description);
+
+				ut_snprintf(stat_description + len,
+					    sizeof(stat_description) - len,
+					    ",%s", index->fields[j - 1].name);
+			}
+
+			ret = dict_stats_save_index_stat(
+				index, now, stat_name,
+				stat_n_diff_key_vals[i],
+				&stat_n_sample_sizes[i],
+				stat_description, trx,
+				caller_has_dict_sys_mutex);
+
+			if (ret != DB_SUCCESS) {
+				goto end_rollback;
+			}
+		}
+	}
+
+	trx_commit_for_mysql(trx);
+	ret = DB_SUCCESS;
+	goto end_free;
+
+end_rollback:
+
+	trx->op_info = "rollback of internal transaction on stats tables";
+	trx_rollback_to_savepoint(trx, NULL);
+	trx->op_info = "";
+	ut_a(trx->error_state == DB_SUCCESS);
+
+end_free:
+
+	trx_free_for_background(trx);
+
+	return(ret);
+}
+/* @} */
+
+/*********************************************************************//**
+Called for the row that is selected by
+SELECT ... FROM mysql.innodb_table_stats WHERE table='...'
+The second argument is a pointer to the table and the fetched stats are
+written to it.
+dict_stats_fetch_table_stats_step() @{
+@return non-NULL dummy */
+static
+ibool
+dict_stats_fetch_table_stats_step(
+/*==============================*/
+	void*	node_void,	/*!< in: select node */
+	void*	table_void)	/*!< out: table */
+{
+	sel_node_t*	node = (sel_node_t*) node_void;
+	dict_table_t*	table = (dict_table_t*) table_void;
+	que_common_t*	cnode;
+	int		i;
+
+	/* this should loop exactly 3 times - for
+	n_rows,clustered_index_size,sum_of_other_index_sizes */
+	for (cnode = static_cast<que_common_t*>(node->select_list), i = 0;
+	     cnode != NULL;
+	     cnode = static_cast<que_common_t*>(que_node_get_next(cnode)),
+	     i++) {
+
+		const byte*	data;
+		dfield_t*	dfield = que_node_get_val(cnode);
+		dtype_t*	type = dfield_get_type(dfield);
+		ulint		len = dfield_get_len(dfield);
+
+		data = static_cast<const byte*>(dfield_get_data(dfield));
+
+		switch (i) {
+		case 0: /* mysql.innodb_table_stats.n_rows */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8);
+
+			table->stat_n_rows = mach_read_from_8(data);
+
+			break;
+
+		case 1: /* mysql.innodb_table_stats.clustered_index_size */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8);
+
+			table->stat_clustered_index_size
+				= (ulint) mach_read_from_8(data);
+
+			break;
+
+		case 2: /* mysql.innodb_table_stats.sum_of_other_index_sizes */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8);
+
+			table->stat_sum_of_other_index_sizes
+				= (ulint) mach_read_from_8(data);
+
+			break;
+
+		default:
+
+			/* someone changed SELECT
+			n_rows,clustered_index_size,sum_of_other_index_sizes
+			to select more columns from innodb_table_stats without
+			adjusting here */
+			ut_error;
+		}
+	}
+
+	/* if i < 3 this means someone changed the
+	SELECT n_rows,clustered_index_size,sum_of_other_index_sizes
+	to select less columns from innodb_table_stats without adjusting here;
+	if i > 3 we would have ut_error'ed earlier */
+	ut_a(i == 3 /*n_rows,clustered_index_size,sum_of_other_index_sizes*/);
+
+	/* XXX this is not used but returning non-NULL is necessary */
+	return(TRUE);
+}
+/* @} */
+
+/** Aux struct used to pass a table and a boolean to
+dict_stats_fetch_index_stats_step(). */
+typedef struct index_fetch_struct {
+	dict_table_t*	table;	/*!< table whose indexes are to be modified */
+	ibool		stats_were_modified; /*!< will be set to TRUE if at
+				least one index stats were modified */
+} index_fetch_t;
+
+/*********************************************************************//**
+Called for the rows that are selected by
+SELECT ... FROM mysql.innodb_index_stats WHERE table='...'
+The second argument is a pointer to the table and the fetched stats are
+written to its indexes.
+Let a table has N indexes and each index has Ui unique columns for i=1..N,
+then mysql.innodb_index_stats will have SUM(Ui) i=1..N rows for that table.
+So this function will be called SUM(Ui) times where SUM(Ui) is of magnitude
+N*AVG(Ui). In each call it searches for the currently fetched index into
+table->indexes linearly, assuming this list is not sorted. Thus, overall,
+fetching all indexes' stats from mysql.innodb_index_stats is O(N^2) where N
+is the number of indexes.
+This can be improved if we sort table->indexes in a temporary area just once
+and then search in that sorted list. Then the complexity will be O(N*log(N)).
+We assume a table will not have more than 100 indexes, so we go with the
+simpler N^2 algorithm.
+dict_stats_fetch_index_stats_step() @{
+@return non-NULL dummy */
+static
+ibool
+dict_stats_fetch_index_stats_step(
+/*==============================*/
+	void*	node_void,	/*!< in: select node */
+	void*	arg_void)	/*!< out: table + a flag that tells if we
+				modified anything */
+{
+	sel_node_t*	node = (sel_node_t*) node_void;
+	index_fetch_t*	arg = (index_fetch_t*) arg_void;
+	dict_table_t*	table = arg->table;
+	dict_index_t*	index = NULL;
+	que_common_t*	cnode;
+	const char*	stat_name = NULL;
+	ulint		stat_name_len = ULINT_UNDEFINED;
+	ib_uint64_t	stat_value = UINT64_UNDEFINED;
+	ib_uint64_t	sample_size = UINT64_UNDEFINED;
+	int		i;
+
+	/* this should loop exactly 4 times - for the columns that
+	were selected: index_name,stat_name,stat_value,sample_size */
+	for (cnode = static_cast<que_common_t*>(node->select_list), i = 0;
+	     cnode != NULL;
+	     cnode = static_cast<que_common_t*>(que_node_get_next(cnode)),
+	     i++) {
+
+		const byte*	data;
+		dfield_t*	dfield = que_node_get_val(cnode);
+		dtype_t*	type = dfield_get_type(dfield);
+		ulint		len = dfield_get_len(dfield);
+
+		data = static_cast<const byte*>(dfield_get_data(dfield));
+
+		switch (i) {
+		case 0: /* mysql.innodb_index_stats.index_name */
+
+			ut_a(dtype_get_mtype(type) == DATA_VARMYSQL);
+
+			/* search for index in table's indexes whose name
+			matches data; the fetched index name is in data,
+			has no terminating '\0' and has length len */
+			for (index = dict_table_get_first_index(table);
+			     index != NULL;
+			     index = dict_table_get_next_index(index)) {
+
+				if (strlen(index->name) == len
+				    && memcmp(index->name, data, len) == 0) {
+					/* the corresponding index was found */
+					break;
+				}
+			}
+
+			/* if index is NULL here this means that
+			mysql.innodb_index_stats contains more rows than the
+			number of indexes in the table; this is ok, we just
+			return ignoring those extra rows; in other words
+			dict_stats_fetch_index_stats_step() has been called
+			for a row from index_stats with unknown index_name
+			column */
+			if (index == NULL) {
+
+				return(TRUE);
+			}
+
+			break;
+
+		case 1: /* mysql.innodb_index_stats.stat_name */
+
+			ut_a(dtype_get_mtype(type) == DATA_VARMYSQL);
+
+			ut_a(index != NULL);
+
+			stat_name = (const char*) data;
+			stat_name_len = len;
+
+			break;
+
+		case 2: /* mysql.innodb_index_stats.stat_value */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8);
+
+			ut_a(index != NULL);
+			ut_a(stat_name != NULL);
+			ut_a(stat_name_len != ULINT_UNDEFINED);
+
+			stat_value = mach_read_from_8(data);
+
+			break;
+
+		case 3: /* mysql.innodb_index_stats.sample_size */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8 || len == UNIV_SQL_NULL);
+
+			ut_a(index != NULL);
+			ut_a(stat_name != NULL);
+			ut_a(stat_name_len != ULINT_UNDEFINED);
+			ut_a(stat_value != UINT64_UNDEFINED);
+
+			if (len == UNIV_SQL_NULL) {
+				break;
+			}
+			/* else */
+
+			sample_size = mach_read_from_8(data);
+
+			break;
+
+		default:
+
+			/* someone changed
+			SELECT index_name,stat_name,stat_value,sample_size
+			to select more columns from innodb_index_stats without
+			adjusting here */
+			ut_error;
+		}
+	}
+
+	/* if i < 4 this means someone changed the
+	SELECT index_name,stat_name,stat_value,sample_size
+	to select less columns from innodb_index_stats without adjusting here;
+	if i > 4 we would have ut_error'ed earlier */
+	ut_a(i == 4 /* index_name,stat_name,stat_value,sample_size */);
+
+	ut_a(index != NULL);
+	ut_a(stat_name != NULL);
+	ut_a(stat_name_len != ULINT_UNDEFINED);
+	ut_a(stat_value != UINT64_UNDEFINED);
+	/* sample_size could be UINT64_UNDEFINED here, if it is NULL */
+
+#define PFX	"n_diff_pfx"
+#define PFX_LEN	10
+
+	if (stat_name_len == 4 /* strlen("size") */
+	    && strncasecmp("size", stat_name, stat_name_len) == 0) {
+		index->stat_index_size = (ulint) stat_value;
+		arg->stats_were_modified = TRUE;
+	} else if (stat_name_len == 12 /* strlen("n_leaf_pages") */
+		   && strncasecmp("n_leaf_pages", stat_name, stat_name_len)
+		   == 0) {
+		index->stat_n_leaf_pages = (ulint) stat_value;
+		arg->stats_were_modified = TRUE;
+	} else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */
+		   && strncasecmp(PFX, stat_name, PFX_LEN) == 0) {
+
+		const char*	num_ptr;
+		unsigned long	n_pfx;
+
+		/* point num_ptr into "1" from "n_diff_pfx12..." */
+		num_ptr = stat_name + PFX_LEN;
+
+		/* stat_name should have exactly 2 chars appended to PFX
+		and they should be digits */
+		if (stat_name_len != PFX_LEN + 2
+		    || num_ptr[0] < '0' || num_ptr[0] > '9'
+		    || num_ptr[1] < '0' || num_ptr[1] > '9') {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Ignoring strange row from "
+				"%s WHERE "
+				"database_name = '%.*s' AND "
+				"table_name = '%s' AND "
+				"index_name = '%s' AND "
+				"stat_name = '%.*s'; because stat_name "
+				"is malformed\n",
+				INDEX_STATS_NAME_PRINT,
+				(int) dict_get_db_name_len(table->name),
+				table->name,
+				dict_remove_db_name(table->name),
+				index->name,
+				(int) stat_name_len,
+				stat_name);
+			return(TRUE);
+		}
+		/* else */
+
+		/* extract 12 from "n_diff_pfx12..." into n_pfx
+		note that stat_name does not have a terminating '\0' */
+		n_pfx = (num_ptr[0] - '0') * 10 + (num_ptr[1] - '0');
+
+		if (n_pfx == 0 || n_pfx > dict_index_get_n_unique(index)) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Ignoring strange row from "
+				"%s WHERE "
+				"database_name = '%.*s' AND "
+				"table_name = '%s' AND "
+				"index_name = '%s' AND "
+				"stat_name = '%.*s'; because stat_name is "
+				"out of range, the index has %lu unique "
+				"columns\n",
+				INDEX_STATS_NAME_PRINT,
+				(int) dict_get_db_name_len(table->name),
+				table->name,
+				dict_remove_db_name(table->name),
+				index->name,
+				(int) stat_name_len,
+				stat_name,
+				dict_index_get_n_unique(index));
+			return(TRUE);
+		}
+		/* else */
+
+		index->stat_n_diff_key_vals[n_pfx] = stat_value;
+
+		if (sample_size != UINT64_UNDEFINED) {
+			index->stat_n_sample_sizes[n_pfx] = sample_size;
+		} else {
+			/* hmm, strange... the user must have UPDATEd the
+			table manually and SET sample_size = NULL */
+			index->stat_n_sample_sizes[n_pfx] = 0;
+		}
+
+		arg->stats_were_modified = TRUE;
+	} else {
+		/* silently ignore rows with unknown stat_name, the
+		user may have developed her own stats */
+	}
+
+	/* XXX this is not used but returning non-NULL is necessary */
+	return(TRUE);
+}
+/* @} */
+
+/*********************************************************************//**
+Read table's statistics from the persistent statistics storage.
+dict_stats_fetch_from_ps() @{
+@return DB_SUCCESS or error code */
+static
+enum db_err
+dict_stats_fetch_from_ps(
+/*=====================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	ibool		caller_has_dict_sys_mutex)/*!< in: TRUE if the caller
+					owns dict_sys->mutex */
+{
+	index_fetch_t	index_fetch_arg;
+	trx_t*		trx;
+	pars_info_t*	pinfo;
+	enum db_err	ret;
+
+	ut_ad(mutex_own(&dict_sys->mutex) == caller_has_dict_sys_mutex);
+
+	trx = trx_allocate_for_background();
+
+	/* Use 'read-uncommitted' so that the SELECTs we execute
+	do not get blocked in case some user has locked the rows we
+	are SELECTing */
+
+	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+
+	trx_start_if_not_started(trx);
+
+	pinfo = pars_info_create();
+
+	pars_info_add_literal(pinfo, "database_name", table->name,
+			      dict_get_db_name_len(table->name),
+			      DATA_VARCHAR, 0);
+
+	pars_info_add_str_literal(pinfo, "table_name",
+				  dict_remove_db_name(table->name));
+
+	pars_info_bind_function(pinfo,
+			       "fetch_table_stats_step",
+			       dict_stats_fetch_table_stats_step,
+			       table);
+
+	index_fetch_arg.table = table;
+	index_fetch_arg.stats_were_modified = FALSE;
+	pars_info_bind_function(pinfo,
+			        "fetch_index_stats_step",
+			        dict_stats_fetch_index_stats_step,
+			        &index_fetch_arg);
+
+	ret = que_eval_sql(pinfo,
+			   "PROCEDURE FETCH_STATS () IS\n"
+			   "found INT;\n"
+			   "DECLARE FUNCTION fetch_table_stats_step;\n"
+			   "DECLARE FUNCTION fetch_index_stats_step;\n"
+			   "DECLARE CURSOR table_stats_cur IS\n"
+			   "  SELECT\n"
+			   /* if you change the selected fields, be
+			   sure to adjust
+			   dict_stats_fetch_table_stats_step() */
+			   "  n_rows,\n"
+			   "  clustered_index_size,\n"
+			   "  sum_of_other_index_sizes\n"
+			   "  FROM \"" TABLE_STATS_NAME "\"\n"
+			   "  WHERE\n"
+			   "  database_name = :database_name AND\n"
+			   "  table_name = :table_name;\n"
+			   "DECLARE CURSOR index_stats_cur IS\n"
+			   "  SELECT\n"
+			   /* if you change the selected fields, be
+			   sure to adjust
+			   dict_stats_fetch_index_stats_step() */
+			   "  index_name,\n"
+			   "  stat_name,\n"
+			   "  stat_value,\n"
+			   "  sample_size\n"
+			   "  FROM \"" INDEX_STATS_NAME "\"\n"
+			   "  WHERE\n"
+			   "  database_name = :database_name AND\n"
+			   "  table_name = :table_name;\n"
+
+			   "BEGIN\n"
+
+			   "OPEN table_stats_cur;\n"
+			   "FETCH table_stats_cur INTO\n"
+			   "  fetch_table_stats_step();\n"
+			   "IF (SQL % NOTFOUND) THEN\n"
+			   "  CLOSE table_stats_cur;\n"
+			   "  RETURN;\n"
+			   "END IF;\n"
+			   "CLOSE table_stats_cur;\n"
+
+			   "OPEN index_stats_cur;\n"
+			   "found := 1;\n"
+			   "WHILE found = 1 LOOP\n"
+			   "  FETCH index_stats_cur INTO\n"
+			   "    fetch_index_stats_step();\n"
+			   "  IF (SQL % NOTFOUND) THEN\n"
+			   "    found := 0;\n"
+			   "  END IF;\n"
+			   "END LOOP;\n"
+			   "CLOSE index_stats_cur;\n"
+
+			   "END;",
+			   !caller_has_dict_sys_mutex, trx);
+
+	/* pinfo is freed by que_eval_sql() */
+
+	/* XXX If mysql.innodb_index_stats contained less rows than the number
+	of indexes in the table, then some of the indexes of the table
+	were left uninitialized. Currently this is ignored and those
+	indexes are left with uninitialized stats until ANALYZE TABLE is
+	run. This condition happens when the user creates a new index
+	on a table. We could return DB_STATS_DO_NOT_EXIST from here,
+	forcing the usage of transient stats until mysql.innodb_index_stats
+	is complete. */
+
+	trx_commit_for_mysql(trx);
+
+	trx_free_for_background(trx);
+
+	if (!index_fetch_arg.stats_were_modified) {
+		return(DB_STATS_DO_NOT_EXIST);
+	}
+
+	return(ret);
+}
+/* @} */
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+dict_stats_update() @{
+@return DB_* error code or DB_SUCCESS */
+UNIV_INTERN
+enum db_err
+dict_stats_update(
+/*==============*/
+	dict_table_t*		table,	/*!< in/out: table */
+	dict_stats_upd_option_t	stats_upd_option,
+					/*!< in: whether to (re) calc
+					the stats or to fetch them from
+					the persistent statistics
+					storage */
+	ibool			caller_has_dict_sys_mutex)
+					/*!< in: TRUE if the caller
+					owns dict_sys->mutex */
+{
+	enum db_err	ret = DB_ERROR;
+
+	/* check whether caller_has_dict_sys_mutex is set correctly;
+	note that mutex_own() is not implemented in non-debug code so
+	we cannot avoid having this extra param to the current function */
+	ut_ad(caller_has_dict_sys_mutex
+	      ? mutex_own(&dict_sys->mutex)
+	      : !mutex_own(&dict_sys->mutex));
+
+	if (table->ibd_file_missing) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: cannot calculate statistics for table %s "
+			"because the .ibd file is missing. For help, please "
+			"refer to " REFMAN "innodb-troubleshooting.html\n",
+			table->name);
+
+		return(DB_TABLESPACE_DELETED);
+	}
+
+	/* If we have set a high innodb_force_recovery level, do not calculate
+	statistics, as a badly corrupted index can cause a crash in it. */
+
+	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+
+		return(DB_SUCCESS);
+	}
+
+	switch (stats_upd_option) {
+	case DICT_STATS_RECALC_PERSISTENT:
+	case DICT_STATS_RECALC_PERSISTENT_SILENT:
+		/* Persistent recalculation requested, called from
+		ANALYZE TABLE or from TRUNCATE TABLE */
+
+		/* FTS auxiliary tables do not need persistent stats */
+		if ((ut_strcount(table->name, "FTS") > 0
+		     && (ut_strcount(table->name, "CONFIG") > 0
+			 || ut_strcount(table->name, "INDEX") > 0
+			 || ut_strcount(table->name, "DELETED") > 0
+			 || ut_strcount(table->name, "DOC_ID") > 0
+			 || ut_strcount(table->name, "ADDED") > 0))) {
+			goto transient;
+		}
+
+		/* check if the persistent statistics storage exists
+		before calling the potentially slow function
+		dict_stats_update_persistent(); that is a
+		prerequisite for dict_stats_save() succeeding */
+		if (dict_stats_persistent_storage_check(
+				caller_has_dict_sys_mutex)) {
+
+			dict_table_stats_lock(table, RW_X_LATCH);
+
+			ret = dict_stats_update_persistent(table);
+
+			/* XXX Currently dict_stats_save() would read the
+			stats from the table without dict_table_stats_lock()
+			which means it could save inconsistent data on the
+			disk. This is because we must call
+			dict_table_stats_lock() after locking dict_sys->mutex.
+			A solution is to copy here the stats to a temporary
+			buffer while holding the _stats_lock(), release it,
+			and pass that buffer to dict_stats_save(). */
+
+			dict_table_stats_unlock(table, RW_X_LATCH);
+
+			if (ret == DB_SUCCESS) {
+				ret = dict_stats_save(
+					table,
+					caller_has_dict_sys_mutex);
+			}
+
+			return(ret);
+		}
+		/* else */
+
+		/* Fall back to transient stats since the persistent
+		storage is not present or is corrupted */
+
+		if (stats_upd_option == DICT_STATS_RECALC_PERSISTENT) {
+
+			ut_print_timestamp(stderr);
+			/* XXX add link to the doc about storage
+			creation */
+			fprintf(stderr,
+				" InnoDB: Recalculation of persistent "
+				"statistics requested but the required "
+				"persistent statistics storage is not "
+				"present or is corrupted. "
+				"Using quick transient stats "
+				"instead.\n");
+		}
+
+		goto transient;
+
+	case DICT_STATS_RECALC_TRANSIENT:
+
+		goto transient;
+
+	case DICT_STATS_FETCH:
+	case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY:
+		/* fetch requested, either fetch from persistent statistics
+		storage or use the old method */
+
+		dict_table_stats_lock(table, RW_X_LATCH);
+
+		if (stats_upd_option == DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY
+		    && table->stat_initialized) {
+
+			dict_table_stats_unlock(table, RW_X_LATCH);
+			return(DB_SUCCESS);
+		}
+		/* else */
+
+		/* Must unlock because otherwise there is a lock order
+		violation with dict_sys->mutex below. Declare stats to be
+		initialized before unlocking. */
+		table->stat_initialized = TRUE;
+		dict_table_stats_unlock(table, RW_X_LATCH);
+
+		if (strchr(table->name, '/') == NULL
+		    || strcmp(table->name, INDEX_STATS_NAME) == 0
+		    || strcmp(table->name, TABLE_STATS_NAME) == 0
+		    || (ut_strcount(table->name, "FTS") > 0
+		        && (ut_strcount(table->name, "CONFIG") > 0
+			    || ut_strcount(table->name, "INDEX") > 0
+			    || ut_strcount(table->name, "DELETED") > 0
+			    || ut_strcount(table->name, "DOC_ID") > 0
+			    || ut_strcount(table->name, "ADDED") > 0))) {
+			/* Use the quick transient stats method for
+			InnoDB internal tables, because we know the
+			persistent stats storage does not contain data
+			for them */
+
+			goto transient;
+		}
+		/* else */
+
+		if (dict_stats_persistent_storage_check(
+			caller_has_dict_sys_mutex)) {
+
+			ret = dict_stats_fetch_from_ps(table,
+				caller_has_dict_sys_mutex);
+
+			if (ret == DB_STATS_DO_NOT_EXIST
+			    || (ret != DB_SUCCESS && stats_upd_option
+				== DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY)) {
+				/* Stats for this particular table do not
+				exist or we have been called from open table
+				which needs to initialize the stats,
+				calculate the quick transient statistics */
+				goto transient;
+			}
+			/* else */
+
+			return(ret);
+		} else {
+			/* persistent statistics storage does not exist,
+			calculate the transient stats */
+			goto transient;
+		}
+
+		break;
+
+	/* no "default:" in order to produce a compilation warning
+	about unhandled enumeration value */
+	}
+
+transient:
+
+	dict_table_stats_lock(table, RW_X_LATCH);
+
+	dict_stats_update_transient(table);
+
+	dict_table_stats_unlock(table, RW_X_LATCH);
+
+	return(DB_SUCCESS);
+}
+/* @} */
+
+/*********************************************************************//**
+Close the stats tables. Should always be called after successful
+dict_stats_open(). It will free the dict_stats handle.
+dict_stats_close() @{ */
+UNIV_INLINE
+void
+dict_stats_close(
+/*=============*/
+	dict_stats_t*	dict_stats)	/*!< in/own: Handle to open
+					statistics tables */
+{
+	if (dict_stats->table_stats != NULL) {
+		dict_table_close(dict_stats->table_stats, FALSE);
+		dict_stats->table_stats = NULL;
+	}
+
+	if (dict_stats->index_stats != NULL) {
+		dict_table_close(dict_stats->index_stats, FALSE);
+		dict_stats->index_stats = NULL;
+	}
+
+	mem_free(dict_stats);
+}
+/* @} */
+
+/*********************************************************************//**
+Open stats tables to prevent these tables from being DROPped.
+Also check whether they have the correct structure. The caller
+must call dict_stats_close() when he has finished DMLing the tables.
+dict_stats_open() @{
+@return pointer to open tables or NULL on failure */
+UNIV_INLINE
+dict_stats_t*
+dict_stats_open(void)
+/*=================*/
+{
+	dict_stats_t*	dict_stats;
+
+	dict_stats = static_cast<dict_stats_t*>(
+		mem_zalloc(sizeof(*dict_stats)));
+
+	dict_stats->table_stats = dict_table_open_on_name_no_stats(
+		TABLE_STATS_NAME, FALSE, DICT_ERR_IGNORE_NONE);
+
+	dict_stats->index_stats = dict_table_open_on_name_no_stats(
+		INDEX_STATS_NAME, FALSE, DICT_ERR_IGNORE_NONE);
+
+	/* Check if the tables have the correct structure, if yes then
+	after this function we can safely DELETE from them without worrying
+	that they may get DROPped or DDLed because the open will have
+	increased the reference count. */
+
+	if (dict_stats->table_stats == NULL
+	    || dict_stats->index_stats == NULL
+	    || !dict_stats_persistent_storage_check(FALSE)) {
+
+		/* There was an error, close the tables and free the handle. */
+		dict_stats_close(dict_stats);
+		dict_stats = NULL;
+	}
+
+	return(dict_stats);
+}
+/* @} */
+
+/*********************************************************************//**
+Removes the information for a particular index's stats from the persistent
+storage if it exists and if there is data stored for this index.
+The transaction is not committed, it must not be committed in this
+function because this is the user trx that is running DROP INDEX.
+The transaction will be committed at the very end when dropping an
+index.
+A note from Marko why we cannot edit user and sys_* tables in one trx:
+marko: The problem is that ibuf merges should be disabled while we are
+rolling back dict transactions.
+marko: If ibuf merges are not disabled, we need to scan the *.ibd files.
+But we shouldn't open *.ibd files before we have rolled back dict
+transactions and opened the SYS_* records for the *.ibd files.
+dict_stats_delete_index_stats() @{
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+enum db_err
+dict_stats_delete_index_stats(
+/*==========================*/
+	dict_index_t*	index,	/*!< in: index */
+	trx_t*		trx,	/*!< in: transaction to use */
+	char*		errstr, /*!< out: error message if != DB_SUCCESS
+				is returned */
+	ulint		errstr_sz)/*!< in: size of the errstr buffer */
+{
+	char		database_name[MAX_DATABASE_NAME_LEN + 1];
+	const char*	table_name;
+	pars_info_t*	pinfo;
+	enum db_err	ret;
+	dict_stats_t*	dict_stats;
+	void*		mysql_thd = trx->mysql_thd;
+
+	/* skip indexes whose table names do not contain a database name
+	e.g. if we are dropping an index from SYS_TABLES */
+	if (strchr(index->table_name, '/') == NULL) {
+
+		return(DB_SUCCESS);
+	}
+
+	/* Increment table reference count to prevent the tables from
+	being DROPped just before que_eval_sql(). */
+	dict_stats = dict_stats_open();
+
+	if (dict_stats == NULL) {
+		/* stats tables do not exist or have unexpected structure */
+		return(DB_SUCCESS);
+	}
+
+	/* the stats tables cannot be DROPped now */
+
+	ut_snprintf(database_name, sizeof(database_name), "%.*s",
+		    (int) dict_get_db_name_len(index->table_name),
+		    index->table_name);
+
+	table_name = dict_remove_db_name(index->table_name);
+
+	pinfo = pars_info_create();
+
+	pars_info_add_str_literal(pinfo, "database_name", database_name);
+
+	pars_info_add_str_literal(pinfo, "table_name", table_name);
+
+	pars_info_add_str_literal(pinfo, "index_name", index->name);
+
+	/* Force lock wait timeout to be instantaneous because the incoming
+	transaction was created via MySQL. */
+
+	mysql_thd = trx->mysql_thd;
+	trx->mysql_thd = NULL;
+
+	ret = que_eval_sql(pinfo,
+			   "PROCEDURE DROP_INDEX_STATS () IS\n"
+			   "BEGIN\n"
+			   "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n"
+			   "database_name = :database_name AND\n"
+			   "table_name = :table_name AND\n"
+			   "index_name = :index_name;\n"
+			   "END;\n",
+			   TRUE,
+			   trx);
+
+	trx->mysql_thd = mysql_thd;
+
+	/* pinfo is freed by que_eval_sql() */
+
+	/* do not to commit here, see the function's comment */
+
+	if (ret != DB_SUCCESS) {
+
+		ut_snprintf(errstr, errstr_sz,
+			    "Unable to delete statistics for index %s "
+			    "from %s%s. They can be deleted later using "
+			    "DELETE FROM %s WHERE "
+			    "database_name = '%s' AND "
+			    "table_name = '%s' AND "
+			    "index_name = '%s';",
+			    index->name,
+			    INDEX_STATS_NAME_PRINT,
+			    (ret == DB_LOCK_WAIT_TIMEOUT
+			     ? " because the rows are locked"
+			     : ""),
+			    INDEX_STATS_NAME_PRINT,
+			    database_name,
+			    table_name,
+			    index->name);
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: %s\n", errstr);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	dict_stats_close(dict_stats);
+
+	return(ret);
+}
+/* @} */
+
+/*********************************************************************//**
+Removes the statistics for a table and all of its indexes from the
+persistent statistics storage if it exists and if there is data stored for
+the table.  This function creates its own transaction and commits it.
+dict_stats_delete_table_stats() @{
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+enum db_err
+dict_stats_delete_table_stats(
+/*==========================*/
+	const char*	table_name,	/*!< in: table name */
+	char*		errstr,		/*!< out: error message
+					if != DB_SUCCESS is returned */
+	ulint		errstr_sz)	/*!< in: size of errstr buffer */
+{
+	char		database_name[MAX_DATABASE_NAME_LEN + 1];
+	const char*	table_name_strip; /* without leading db name */
+	trx_t*		trx;
+	pars_info_t*	pinfo;
+	enum db_err	ret = DB_ERROR;
+	dict_stats_t*	dict_stats;
+
+	/* skip tables that do not contain a database name
+	e.g. if we are dropping SYS_TABLES */
+	if (strchr(table_name, '/') == NULL) {
+
+		return(DB_SUCCESS);
+	}
+
+	/* skip innodb_table_stats and innodb_index_stats themselves */
+	if (strcmp(table_name, TABLE_STATS_NAME) == 0
+	    || strcmp(table_name, INDEX_STATS_NAME) == 0) {
+
+		return(DB_SUCCESS);
+	}
+
+	/* Create a new private trx */
+
+	trx = trx_allocate_for_background();
+
+	/* Use 'read-uncommitted' so that the SELECTs we execute
+	do not get blocked in case some user has locked the rows we
+	are SELECTing */
+
+	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+
+	trx_start_if_not_started(trx);
+
+	/* Increment table reference count to prevent the tables from
+	being DROPped just before que_eval_sql(). */
+	dict_stats = dict_stats_open();
+
+	if (dict_stats == NULL) {
+		/* stats tables do not exist or have unexpected structure */
+		ret = DB_SUCCESS;
+		goto commit_and_return;
+	}
+
+	ut_snprintf(database_name, sizeof(database_name), "%.*s",
+		    (int) dict_get_db_name_len(table_name),
+		    table_name);
+
+	table_name_strip = dict_remove_db_name(table_name);
+
+	pinfo = pars_info_create();
+
+	pars_info_add_str_literal(pinfo, "database_name", database_name);
+
+	pars_info_add_str_literal(pinfo, "table_name", table_name_strip);
+
+	ret = que_eval_sql(pinfo,
+			   "PROCEDURE DROP_TABLE_STATS () IS\n"
+			   "BEGIN\n"
+
+			   "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n"
+			   "database_name = :database_name AND\n"
+			   "table_name = :table_name;\n"
+
+			   "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE\n"
+			   "database_name = :database_name AND\n"
+			   "table_name = :table_name;\n"
+
+			   "END;\n",
+			   TRUE,
+			   trx);
+
+	/* pinfo is freed by que_eval_sql() */
+
+	if (ret != DB_SUCCESS) {
+
+		ut_snprintf(errstr, errstr_sz,
+			    "Unable to delete statistics for table %s.%s "
+			    "from %s or %s%s. "
+			    "They can be deleted later using "
+
+			    "DELETE FROM %s WHERE "
+			    "database_name = '%s' AND "
+			    "table_name = '%s'; "
+
+			    "DELETE FROM %s WHERE "
+			    "database_name = '%s' AND "
+			    "table_name = '%s';",
+
+			    database_name, table_name_strip,
+			    TABLE_STATS_NAME_PRINT, INDEX_STATS_NAME_PRINT,
+
+			    (ret == DB_LOCK_WAIT_TIMEOUT
+			     ? " because the rows are locked"
+			     : ""),
+
+			    INDEX_STATS_NAME_PRINT,
+			    database_name, table_name_strip,
+
+			    TABLE_STATS_NAME_PRINT,
+			    database_name, table_name_strip);
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: %s\n", errstr);
+	}
+
+	dict_stats_close(dict_stats);
+
+commit_and_return:
+
+	trx_commit_for_mysql(trx);
+
+	trx_free_for_background(trx);
+
+	return(ret);
+}
+/* @} */
+
+/* tests @{ */
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/* The following unit tests test some of the functions in this file
+individually, such testing cannot be performed by the mysql-test framework
+via SQL. */
+
+/* test_dict_table_schema_check() @{ */
+void
+test_dict_table_schema_check()
+{
+	/*
+	CREATE TABLE tcheck (
+		c01 VARCHAR(123),
+		c02 INT,
+		c03 INT NOT NULL,
+		c04 INT UNSIGNED,
+		c05 BIGINT,
+		c06 BIGINT UNSIGNED NOT NULL,
+		c07 TIMESTAMP
+	) ENGINE=INNODB;
+	*/
+	/* definition for the table 'test/tcheck' */
+	dict_col_meta_t	columns[] = {
+		{"c01", DATA_VARCHAR, 0, 123},
+		{"c02", DATA_INT, 0, 4},
+		{"c03", DATA_INT, DATA_NOT_NULL, 4},
+		{"c04", DATA_INT, DATA_UNSIGNED, 4},
+		{"c05", DATA_INT, 0, 8},
+		{"c06", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+		{"c07", DATA_INT, 0, 4},
+		{"c_extra", DATA_INT, 0, 4}
+	};
+	dict_table_schema_t	schema = {
+		"test/tcheck",
+		0 /* will be set individually for each test below */,
+		columns
+	};
+	char	errstr[512];
+
+	ut_snprintf(errstr, sizeof(errstr), "Table not found");
+
+	/* prevent any data dictionary modifications while we are checking
+	the tables' structure */
+
+	mutex_enter(&(dict_sys->mutex));
+
+	/* check that a valid table is reported as valid */
+	schema.n_cols = 7;
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    == DB_SUCCESS) {
+		printf("OK: test.tcheck ok\n");
+	} else {
+		printf("ERROR: %s\n", errstr);
+		printf("ERROR: test.tcheck not present or corrupted\n");
+		goto test_dict_table_schema_check_end;
+	}
+
+	/* check columns with wrong length */
+	schema.columns[1].len = 8;
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    != DB_SUCCESS) {
+		printf("OK: test.tcheck.c02 has different length and is "
+		       "reported as corrupted\n");
+	} else {
+		printf("OK: test.tcheck.c02 has different length but is "
+		       "reported as ok\n");
+		goto test_dict_table_schema_check_end;
+	}
+	schema.columns[1].len = 4;
+
+	/* request that c02 is NOT NULL while actually it does not have
+	this flag set */
+	schema.columns[1].prtype_mask |= DATA_NOT_NULL;
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    != DB_SUCCESS) {
+		printf("OK: test.tcheck.c02 does not have NOT NULL while "
+		       "it should and is reported as corrupted\n");
+	} else {
+		printf("ERROR: test.tcheck.c02 does not have NOT NULL while "
+		       "it should and is not reported as corrupted\n");
+		goto test_dict_table_schema_check_end;
+	}
+	schema.columns[1].prtype_mask &= ~DATA_NOT_NULL;
+
+	/* check a table that contains some extra columns */
+	schema.n_cols = 6;
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    == DB_SUCCESS) {
+		printf("ERROR: test.tcheck has more columns but is not "
+		       "reported as corrupted\n");
+		goto test_dict_table_schema_check_end;
+	} else {
+		printf("OK: test.tcheck has more columns and is "
+		       "reported as corrupted\n");
+	}
+
+	/* check a table that has some columns missing */
+	schema.n_cols = 8;
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    != DB_SUCCESS) {
+		printf("OK: test.tcheck has missing columns and is "
+		       "reported as corrupted\n");
+	} else {
+		printf("ERROR: test.tcheck has missing columns but is "
+		       "reported as ok\n");
+		goto test_dict_table_schema_check_end;
+	}
+
+	/* check non-existent table */
+	schema.table_name = "test/tcheck_nonexistent";
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    != DB_SUCCESS) {
+		printf("OK: test.tcheck_nonexistent is not present\n");
+	} else {
+		printf("ERROR: test.tcheck_nonexistent is present!?\n");
+		goto test_dict_table_schema_check_end;
+	}
+
+test_dict_table_schema_check_end:
+
+	mutex_exit(&(dict_sys->mutex));
+}
+/* @} */
+
+/* save/fetch aux macros @{ */
+#define TEST_DATABASE_NAME		"foobardb"
+#define TEST_TABLE_NAME			"test_dict_stats"
+
+#define TEST_N_ROWS			111
+#define TEST_CLUSTERED_INDEX_SIZE	222
+#define TEST_SUM_OF_OTHER_INDEX_SIZES	333
+
+#define TEST_IDX1_NAME			"tidx1"
+#define TEST_IDX1_COL1_NAME		"tidx1_col1"
+#define TEST_IDX1_INDEX_SIZE		123
+#define TEST_IDX1_N_LEAF_PAGES		234
+#define TEST_IDX1_N_DIFF1		50
+#define TEST_IDX1_N_DIFF1_SAMPLE_SIZE	500
+
+#define TEST_IDX2_NAME			"tidx2"
+#define TEST_IDX2_COL1_NAME		"tidx2_col1"
+#define TEST_IDX2_COL2_NAME		"tidx2_col2"
+#define TEST_IDX2_COL3_NAME		"tidx2_col3"
+#define TEST_IDX2_COL4_NAME		"tidx2_col4"
+#define TEST_IDX2_INDEX_SIZE		321
+#define TEST_IDX2_N_LEAF_PAGES		432
+#define TEST_IDX2_N_DIFF1		60
+#define TEST_IDX2_N_DIFF1_SAMPLE_SIZE	600
+#define TEST_IDX2_N_DIFF2		61
+#define TEST_IDX2_N_DIFF2_SAMPLE_SIZE	610
+#define TEST_IDX2_N_DIFF3		62
+#define TEST_IDX2_N_DIFF3_SAMPLE_SIZE	620
+#define TEST_IDX2_N_DIFF4		63
+#define TEST_IDX2_N_DIFF4_SAMPLE_SIZE	630
+/* @} */
+
+/* test_dict_stats_save() @{ */
+void
+test_dict_stats_save()
+{
+	dict_table_t	table;
+	dict_index_t	index1;
+	dict_field_t	index1_fields[1];
+	ib_uint64_t	index1_stat_n_diff_key_vals[2];
+	ib_uint64_t	index1_stat_n_sample_sizes[2];
+	dict_index_t	index2;
+	dict_field_t	index2_fields[4];
+	ib_uint64_t	index2_stat_n_diff_key_vals[5];
+	ib_uint64_t	index2_stat_n_sample_sizes[5];
+	enum db_err	ret;
+
+	/* craft a dummy dict_table_t */
+	table.name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME);
+	table.stat_n_rows = TEST_N_ROWS;
+	table.stat_clustered_index_size = TEST_CLUSTERED_INDEX_SIZE;
+	table.stat_sum_of_other_index_sizes = TEST_SUM_OF_OTHER_INDEX_SIZES;
+	UT_LIST_INIT(table.indexes);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index1);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index2);
+#ifdef UNIV_DEBUG
+	table.magic_n = DICT_TABLE_MAGIC_N;
+#endif /* UNIV_DEBUG */
+
+	index1.name = TEST_IDX1_NAME;
+	index1.table = &table;
+#ifdef UNIV_DEBUG
+	index1.magic_n = DICT_INDEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+	index1.to_be_dropped = 0;
+	index1.cached = 1;
+	index1.n_uniq = 1;
+	index1.fields = index1_fields;
+	index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals;
+	index1.stat_n_sample_sizes = index1_stat_n_sample_sizes;
+	index1.stat_index_size = TEST_IDX1_INDEX_SIZE;
+	index1.stat_n_leaf_pages = TEST_IDX1_N_LEAF_PAGES;
+	index1_fields[0].name = TEST_IDX1_COL1_NAME;
+	index1_stat_n_diff_key_vals[0] = 1; /* dummy */
+	index1_stat_n_diff_key_vals[1] = TEST_IDX1_N_DIFF1;
+	index1_stat_n_sample_sizes[0] = 0; /* dummy */
+	index1_stat_n_sample_sizes[1] = TEST_IDX1_N_DIFF1_SAMPLE_SIZE;
+
+	index2.name = TEST_IDX2_NAME;
+	index2.table = &table;
+#ifdef UNIV_DEBUG
+	index2.magic_n = DICT_INDEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+	index2.to_be_dropped = 0;
+	index2.cached = 1;
+	index2.n_uniq = 4;
+	index2.fields = index2_fields;
+	index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals;
+	index2.stat_n_sample_sizes = index2_stat_n_sample_sizes;
+	index2.stat_index_size = TEST_IDX2_INDEX_SIZE;
+	index2.stat_n_leaf_pages = TEST_IDX2_N_LEAF_PAGES;
+	index2_fields[0].name = TEST_IDX2_COL1_NAME;
+	index2_fields[1].name = TEST_IDX2_COL2_NAME;
+	index2_fields[2].name = TEST_IDX2_COL3_NAME;
+	index2_fields[3].name = TEST_IDX2_COL4_NAME;
+	index2_stat_n_diff_key_vals[0] = 1; /* dummy */
+	index2_stat_n_diff_key_vals[1] = TEST_IDX2_N_DIFF1;
+	index2_stat_n_diff_key_vals[2] = TEST_IDX2_N_DIFF2;
+	index2_stat_n_diff_key_vals[3] = TEST_IDX2_N_DIFF3;
+	index2_stat_n_diff_key_vals[4] = TEST_IDX2_N_DIFF4;
+	index2_stat_n_sample_sizes[0] = 0; /* dummy */
+	index2_stat_n_sample_sizes[1] = TEST_IDX2_N_DIFF1_SAMPLE_SIZE;
+	index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF2_SAMPLE_SIZE;
+	index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE;
+	index2_stat_n_sample_sizes[4] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE;
+
+	ret = dict_stats_save(&table, FALSE);
+
+	ut_a(ret == DB_SUCCESS);
+
+	printf("\nOK: stats saved successfully, now go ahead and read "
+	       "what's inside %s and %s:\n\n",
+	       TABLE_STATS_NAME_PRINT,
+	       INDEX_STATS_NAME_PRINT);
+
+	printf("SELECT COUNT(*) = 1 AS table_stats_saved_successfully\n"
+	       "FROM %s\n"
+	       "WHERE\n"
+	       "database_name = '%s' AND\n"
+	       "table_name = '%s' AND\n"
+	       "n_rows = %d AND\n"
+	       "clustered_index_size = %d AND\n"
+	       "sum_of_other_index_sizes = %d;\n"
+	       "\n",
+	       TABLE_STATS_NAME_PRINT,
+	       TEST_DATABASE_NAME,
+	       TEST_TABLE_NAME,
+	       TEST_N_ROWS,
+	       TEST_CLUSTERED_INDEX_SIZE,
+	       TEST_SUM_OF_OTHER_INDEX_SIZES);
+
+	printf("SELECT COUNT(*) = 3 AS tidx1_stats_saved_successfully\n"
+	       "FROM %s\n"
+	       "WHERE\n"
+	       "database_name = '%s' AND\n"
+	       "table_name = '%s' AND\n"
+	       "index_name = '%s' AND\n"
+	       "(\n"
+	       " (stat_name = 'size' AND stat_value = %d AND"
+	       "  sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_leaf_pages' AND stat_value = %d AND"
+	       "  sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND"
+	       "  sample_size = '%d' AND stat_description = '%s')\n"
+	       ");\n"
+	       "\n",
+	       INDEX_STATS_NAME_PRINT,
+	       TEST_DATABASE_NAME,
+	       TEST_TABLE_NAME,
+	       TEST_IDX1_NAME,
+	       TEST_IDX1_INDEX_SIZE,
+	       TEST_IDX1_N_LEAF_PAGES,
+	       TEST_IDX1_N_DIFF1,
+	       TEST_IDX1_N_DIFF1_SAMPLE_SIZE,
+	       TEST_IDX1_COL1_NAME);
+
+	printf("SELECT COUNT(*) = 6 AS tidx2_stats_saved_successfully\n"
+	       "FROM %s\n"
+	       "WHERE\n"
+	       "database_name = '%s' AND\n"
+	       "table_name = '%s' AND\n"
+	       "index_name = '%s' AND\n"
+	       "(\n"
+	       " (stat_name = 'size' AND stat_value = %d AND"
+	       "  sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_leaf_pages' AND stat_value = %d AND"
+	       "  sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND"
+	       "  sample_size = '%d' AND stat_description = '%s') OR\n"
+	       " (stat_name = 'n_diff_pfx02' AND stat_value = %d AND"
+	       "  sample_size = '%d' AND stat_description = '%s,%s') OR\n"
+	       " (stat_name = 'n_diff_pfx03' AND stat_value = %d AND"
+	       "  sample_size = '%d' AND stat_description = '%s,%s,%s') OR\n"
+	       " (stat_name = 'n_diff_pfx04' AND stat_value = %d AND"
+	       "  sample_size = '%d' AND stat_description = '%s,%s,%s,%s')\n"
+	       ");\n"
+	       "\n",
+	       INDEX_STATS_NAME_PRINT,
+	       TEST_DATABASE_NAME,
+	       TEST_TABLE_NAME,
+	       TEST_IDX2_NAME,
+	       TEST_IDX2_INDEX_SIZE,
+	       TEST_IDX2_N_LEAF_PAGES,
+	       TEST_IDX2_N_DIFF1,
+	       TEST_IDX2_N_DIFF1_SAMPLE_SIZE, TEST_IDX2_COL1_NAME,
+	       TEST_IDX2_N_DIFF2,
+	       TEST_IDX2_N_DIFF2_SAMPLE_SIZE,
+	       TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME,
+	       TEST_IDX2_N_DIFF3,
+	       TEST_IDX2_N_DIFF3_SAMPLE_SIZE,
+	       TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME,
+	       TEST_IDX2_N_DIFF4,
+	       TEST_IDX2_N_DIFF4_SAMPLE_SIZE,
+	       TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME,
+	       TEST_IDX2_COL4_NAME);
+}
+/* @} */
+
+/* test_dict_stats_fetch_from_ps() @{ */
+void
+test_dict_stats_fetch_from_ps()
+{
+	dict_table_t	table;
+	dict_index_t	index1;
+	ib_uint64_t	index1_stat_n_diff_key_vals[2];
+	ib_uint64_t	index1_stat_n_sample_sizes[2];
+	dict_index_t	index2;
+	ib_uint64_t	index2_stat_n_diff_key_vals[5];
+	ib_uint64_t	index2_stat_n_sample_sizes[5];
+	enum db_err	ret;
+
+	/* craft a dummy dict_table_t */
+	table.name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME);
+	UT_LIST_INIT(table.indexes);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index1);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index2);
+#ifdef UNIV_DEBUG
+	table.magic_n = DICT_TABLE_MAGIC_N;
+#endif /* UNIV_DEBUG */
+
+	index1.name = TEST_IDX1_NAME;
+#ifdef UNIV_DEBUG
+	index1.magic_n = DICT_INDEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+	index1.cached = 1;
+	index1.n_uniq = 1;
+	index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals;
+	index1.stat_n_sample_sizes = index1_stat_n_sample_sizes;
+
+	index2.name = TEST_IDX2_NAME;
+#ifdef UNIV_DEBUG
+	index2.magic_n = DICT_INDEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+	index2.cached = 1;
+	index2.n_uniq = 4;
+	index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals;
+	index2.stat_n_sample_sizes = index2_stat_n_sample_sizes;
+
+	ret = dict_stats_fetch_from_ps(&table, FALSE);
+
+	ut_a(ret == DB_SUCCESS);
+
+	ut_a(table.stat_n_rows == TEST_N_ROWS);
+	ut_a(table.stat_clustered_index_size == TEST_CLUSTERED_INDEX_SIZE);
+	ut_a(table.stat_sum_of_other_index_sizes
+	     == TEST_SUM_OF_OTHER_INDEX_SIZES);
+
+	ut_a(index1.stat_index_size == TEST_IDX1_INDEX_SIZE);
+	ut_a(index1.stat_n_leaf_pages == TEST_IDX1_N_LEAF_PAGES);
+	ut_a(index1_stat_n_diff_key_vals[1] == TEST_IDX1_N_DIFF1);
+	ut_a(index1_stat_n_sample_sizes[1] == TEST_IDX1_N_DIFF1_SAMPLE_SIZE);
+
+	ut_a(index2.stat_index_size == TEST_IDX2_INDEX_SIZE);
+	ut_a(index2.stat_n_leaf_pages == TEST_IDX2_N_LEAF_PAGES);
+	ut_a(index2_stat_n_diff_key_vals[1] == TEST_IDX2_N_DIFF1);
+	ut_a(index2_stat_n_sample_sizes[1] == TEST_IDX2_N_DIFF1_SAMPLE_SIZE);
+	ut_a(index2_stat_n_diff_key_vals[2] == TEST_IDX2_N_DIFF2);
+	ut_a(index2_stat_n_sample_sizes[2] == TEST_IDX2_N_DIFF2_SAMPLE_SIZE);
+	ut_a(index2_stat_n_diff_key_vals[3] == TEST_IDX2_N_DIFF3);
+	ut_a(index2_stat_n_sample_sizes[3] == TEST_IDX2_N_DIFF3_SAMPLE_SIZE);
+	ut_a(index2_stat_n_diff_key_vals[4] == TEST_IDX2_N_DIFF4);
+	ut_a(index2_stat_n_sample_sizes[4] == TEST_IDX2_N_DIFF4_SAMPLE_SIZE);
+
+	printf("OK: fetch successful\n");
+}
+/* @} */
+
+/* test_dict_stats_all() @{ */
+void
+test_dict_stats_all()
+{
+	test_dict_table_schema_check();
+
+	test_dict_stats_save();
+
+	test_dict_stats_fetch_from_ps();
+}
+/* @} */
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+/* @} */
+
+#endif /* UNIV_HOTBACKUP */
+
+/* vim: set foldmethod=marker foldmarker=@{,@}: */
diff --git a/storage/innobase/dyn/dyn0dyn.c b/storage/innobase/dyn/dyn0dyn.cc
index e1275f040f3..b157c7707f4 100644
--- a/storage/innobase/dyn/dyn0dyn.c
+++ b/storage/innobase/dyn/dyn0dyn.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file dyn/dyn0dyn.c
+@file dyn/dyn0dyn.cc
 The dynamically allocated array
 
 Created 2/5/1996 Heikki Tuuri
@@ -55,7 +55,8 @@ dyn_array_add_block(
 
 	heap = arr->heap;
 
-	block = mem_heap_alloc(heap, sizeof(dyn_block_t));
+	block = static_cast<dyn_block_t*>(
+		mem_heap_alloc(heap, sizeof(dyn_block_t)));
 
 	block->used = 0;
 
diff --git a/storage/innobase/eval/eval0eval.c b/storage/innobase/eval/eval0eval.cc
index dcd416adeee..ccc54781102 100644
--- a/storage/innobase/eval/eval0eval.c
+++ b/storage/innobase/eval/eval0eval.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file eval/eval0eval.c
+@file eval/eval0eval.cc
 SQL evaluator: evaluates simple data structures, like expressions, in
 a query graph
 
@@ -32,6 +32,7 @@ Created 12/29/1997 Heikki Tuuri
 
 #include "data0data.h"
 #include "row0sel.h"
+#include "rem0cmp.h"
 
 /** The RND function seed */
 static ulint	eval_rnd	= 128367121;
@@ -41,6 +42,18 @@ eval_node_alloc_val_buf */
 
 static byte	eval_dummy;
 
+/*************************************************************************
+Gets the like node from the node */
+UNIV_INLINE
+que_node_t*
+que_node_get_like_node(
+/*===================*/
+				/* out: next node in a list of nodes */
+	que_node_t*     node)   /* in: node in a list */
+{
+	return(((sym_node_t*) node)->like_node);
+}
+
 /*****************************************************************//**
 Allocate a buffer from global dynamic memory for a value of a que_node.
 NOTE that this memory must be explicitly freed when the query graph is
@@ -65,7 +78,7 @@ eval_node_alloc_val_buf(
 
 	dfield = que_node_get_val(node);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (data && data != &eval_dummy) {
 		mem_free(data);
@@ -74,7 +87,7 @@ eval_node_alloc_val_buf(
 	if (size == 0) {
 		data = &eval_dummy;
 	} else {
-		data = mem_alloc(size);
+		data = static_cast<byte*>(mem_alloc(size));
 	}
 
 	que_node_set_val_buf_size(node, size);
@@ -102,7 +115,7 @@ eval_node_free_val_buf(
 
 	dfield = que_node_get_val(node);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (que_node_get_val_buf_size(node) > 0) {
 		ut_a(data);
@@ -111,10 +124,80 @@ eval_node_free_val_buf(
 	}
 }
 
-/*****************************************************************//**
+/*********************************************************************
+Evaluates a LIKE comparison node.
+@return the result of the comparison */
+UNIV_INLINE
+ibool
+eval_cmp_like(
+/*==========*/
+	que_node_t*	arg1,		/* !< in: left operand */
+	que_node_t*	arg2)		/* !< in: right operand */
+{
+	ib_like_t	op;
+	int		res;
+	que_node_t*	arg3;
+	que_node_t*	arg4;
+	dfield_t*	dfield;
+	dtype_t*	dtype;
+	ibool		val = TRUE;
+
+	arg3 = que_node_get_like_node(arg2);
+
+	/* Get the comparison type operator */
+	ut_a(arg3);
+
+	dfield = que_node_get_val(arg3);
+	dtype = dfield_get_type(dfield);
+
+	ut_a(dtype_get_mtype(dtype) == DATA_INT);
+	op = static_cast<ib_like_t>(mach_read_from_4(static_cast<const unsigned char*>(dfield_get_data(dfield))));
+
+	switch (op) {
+	case	IB_LIKE_PREFIX:
+
+		arg4 = que_node_get_next(arg3);
+		res = cmp_dfield_dfield_like_prefix(
+			que_node_get_val(arg1),
+			que_node_get_val(arg4));
+		break;
+
+	case	IB_LIKE_SUFFIX:
+
+		arg4 = que_node_get_next(arg3);
+		res = cmp_dfield_dfield_like_suffix(
+			que_node_get_val(arg1),
+			que_node_get_val(arg4));
+		break;
+
+	case	IB_LIKE_SUBSTR:
+
+		arg4 = que_node_get_next(arg3);
+		res = cmp_dfield_dfield_like_substr(
+			que_node_get_val(arg1),
+			que_node_get_val(arg4));
+		break;
+
+	case	IB_LIKE_EXACT:
+		res = cmp_dfield_dfield(
+			que_node_get_val(arg1),
+			que_node_get_val(arg2));
+		break;
+
+	default:
+		ut_error;
+	}
+
+	if (res != 0) {
+		val = FALSE;
+	}
+
+	return(val);
+}
+
+/*********************************************************************
 Evaluates a comparison node.
-@return	the result of the comparison */
-UNIV_INTERN
+@return the result of the comparison */
 ibool
 eval_cmp(
 /*=====*/
@@ -123,45 +206,52 @@ eval_cmp(
 	que_node_t*	arg1;
 	que_node_t*	arg2;
 	int		res;
-	ibool		val;
 	int		func;
+	ibool		val = TRUE;
 
 	ut_ad(que_node_get_type(cmp_node) == QUE_NODE_FUNC);
 
 	arg1 = cmp_node->args;
 	arg2 = que_node_get_next(arg1);
 
-	res = cmp_dfield_dfield(que_node_get_val(arg1),
-				que_node_get_val(arg2));
-	val = TRUE;
-
 	func = cmp_node->func;
 
-	if (func == '=') {
-		if (res != 0) {
-			val = FALSE;
-		}
-	} else if (func == '<') {
-		if (res != -1) {
-			val = FALSE;
-		}
-	} else if (func == PARS_LE_TOKEN) {
-		if (res == 1) {
-			val = FALSE;
-		}
-	} else if (func == PARS_NE_TOKEN) {
-		if (res == 0) {
-			val = FALSE;
-		}
-	} else if (func == PARS_GE_TOKEN) {
-		if (res == -1) {
-			val = FALSE;
-		}
+	if (func == PARS_LIKE_TOKEN_EXACT
+	    || func == PARS_LIKE_TOKEN_PREFIX
+	    || func == PARS_LIKE_TOKEN_SUFFIX
+	    || func == PARS_LIKE_TOKEN_SUBSTR) {
+
+		val = eval_cmp_like(arg1, arg2);
 	} else {
-		ut_ad(func == '>');
+		res = cmp_dfield_dfield(
+			que_node_get_val(arg1), que_node_get_val(arg2));
 
-		if (res != 1) {
-			val = FALSE;
+		if (func == '=') {
+			if (res != 0) {
+				val = FALSE;
+			}
+		} else if (func == '<') {
+			if (res != -1) {
+				val = FALSE;
+			}
+		} else if (func == PARS_LE_TOKEN) {
+			if (res == 1) {
+				val = FALSE;
+			}
+		} else if (func == PARS_NE_TOKEN) {
+			if (res == 0) {
+				val = FALSE;
+			}
+		} else if (func == PARS_GE_TOKEN) {
+			if (res == -1) {
+				val = FALSE;
+			}
+		} else {
+			ut_ad(func == '>');
+
+			if (res != 1) {
+				val = FALSE;
+			}
 		}
 	}
 
@@ -344,8 +434,8 @@ eval_predefined_2(
 
 	} else if (func == PARS_RND_TOKEN) {
 
-		len1 = (ulint)eval_node_get_int_val(arg1);
-		len2 = (ulint)eval_node_get_int_val(arg2);
+		len1 = (ulint) eval_node_get_int_val(arg1);
+		len2 = (ulint) eval_node_get_int_val(arg2);
 
 		ut_ad(len2 >= len1);
 
@@ -362,7 +452,7 @@ eval_predefined_2(
 
 	} else if (func == PARS_RND_STR_TOKEN) {
 
-		len1 = (ulint)eval_node_get_int_val(arg1);
+		len1 = (ulint) eval_node_get_int_val(arg1);
 
 		data = eval_node_ensure_val_buf(func_node, len1);
 
@@ -390,7 +480,7 @@ eval_notfound(
 
 	ut_ad(func_node->func == PARS_NOTFOUND_TOKEN);
 
-	cursor = func_node->args;
+	cursor = static_cast<sym_node_t*>(func_node->args);
 
 	ut_ad(que_node_get_type(cursor) == QUE_NODE_SYMBOL);
 
@@ -436,10 +526,10 @@ eval_substr(
 
 	arg3 = que_node_get_next(arg2);
 
-	str1 = dfield_get_data(que_node_get_val(arg1));
+	str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
 
-	len1 = (ulint)eval_node_get_int_val(arg2);
-	len2 = (ulint)eval_node_get_int_val(arg3);
+	len1 = (ulint) eval_node_get_int_val(arg2);
+	len2 = (ulint) eval_node_get_int_val(arg3);
 
 	dfield = que_node_get_val(func_node);
 
@@ -471,11 +561,11 @@ eval_replstr(
 	arg3 = que_node_get_next(arg2);
 	arg4 = que_node_get_next(arg3);
 
-	str1 = dfield_get_data(que_node_get_val(arg1));
-	str2 = dfield_get_data(que_node_get_val(arg2));
+	str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+	str2 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg2)));
 
-	len1 = (ulint)eval_node_get_int_val(arg3);
-	len2 = (ulint)eval_node_get_int_val(arg4);
+	len1 = (ulint) eval_node_get_int_val(arg3);
+	len2 = (ulint) eval_node_get_int_val(arg4);
 
 	if ((dfield_get_len(que_node_get_val(arg1)) < len1 + len2)
 	    || (dfield_get_len(que_node_get_val(arg2)) < len2)) {
@@ -513,8 +603,8 @@ eval_instr(
 	dfield1 = que_node_get_val(arg1);
 	dfield2 = que_node_get_val(arg2);
 
-	str1 = dfield_get_data(dfield1);
-	str2 = dfield_get_data(dfield2);
+	str1 = static_cast<byte*>(dfield_get_data(dfield1));
+	str2 = static_cast<byte*>(dfield_get_data(dfield2));
 
 	len1 = dfield_get_len(dfield1);
 	len2 = dfield_get_len(dfield2);
@@ -577,7 +667,7 @@ eval_binary_to_number(
 
 	dfield = que_node_get_val(arg1);
 
-	str1 = dfield_get_data(dfield);
+	str1 = static_cast<byte*>(dfield_get_data(dfield));
 	len1 = dfield_get_len(dfield);
 
 	if (len1 > 4) {
@@ -588,7 +678,7 @@ eval_binary_to_number(
 		str2 = str1;
 	} else {
 		int_val = 0;
-		str2 = (byte*)&int_val;
+		str2 = (byte*) &int_val;
 
 		ut_memcpy(str2 + (4 - len1), str1, len1);
 	}
@@ -659,7 +749,7 @@ eval_to_binary(
 
 	arg1 = func_node->args;
 
-	str1 = dfield_get_data(que_node_get_val(arg1));
+	str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
 
 	if (dtype_get_mtype(que_node_get_data_type(arg1)) != DATA_INT) {
 
@@ -674,7 +764,7 @@ eval_to_binary(
 
 	arg2 = que_node_get_next(arg1);
 
-	len1 = (ulint)eval_node_get_int_val(arg2);
+	len1 = (ulint) eval_node_get_int_val(arg2);
 
 	if (len1 > 4) {
 
@@ -705,7 +795,7 @@ eval_predefined(
 
 	if (func == PARS_LENGTH_TOKEN) {
 
-		int_val = (lint)dfield_get_len(que_node_get_val(arg1));
+		int_val = (lint) dfield_get_len(que_node_get_val(arg1));
 
 	} else if (func == PARS_TO_CHAR_TOKEN) {
 
@@ -768,7 +858,7 @@ eval_predefined(
 			       dfield_get_data(que_node_get_val(arg1)));
 
 	} else if (func == PARS_SYSDATE_TOKEN) {
-		int_val = (lint)ut_time();
+		int_val = (lint) ut_time();
 	} else {
 		eval_predefined_2(func_node);
 
@@ -787,12 +877,12 @@ eval_func(
 	func_node_t*	func_node)	/*!< in: function node */
 {
 	que_node_t*	arg;
-	ulint		class;
+	ulint		fclass;
 	ulint		func;
 
 	ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC);
 
-	class = func_node->class;
+	fclass = func_node->fclass;
 	func = func_node->func;
 
 	arg = func_node->args;
@@ -805,7 +895,7 @@ eval_func(
 		values, except for eval_cmp and notfound */
 
 		if (dfield_is_null(que_node_get_val(arg))
-		    && (class != PARS_FUNC_CMP)
+		    && (fclass != PARS_FUNC_CMP)
 		    && (func != PARS_NOTFOUND_TOKEN)
 		    && (func != PARS_PRINTF_TOKEN)) {
 			ut_error;
@@ -814,34 +904,47 @@ eval_func(
 		arg = que_node_get_next(arg);
 	}
 
-	if (class == PARS_FUNC_CMP) {
+	switch (fclass) {
+	case PARS_FUNC_CMP:
 		eval_cmp(func_node);
-	} else if (class == PARS_FUNC_ARITH) {
+		return;
+	case PARS_FUNC_ARITH:
 		eval_arith(func_node);
-	} else if (class == PARS_FUNC_AGGREGATE) {
+		return;
+	case PARS_FUNC_AGGREGATE:
 		eval_aggregate(func_node);
-	} else if (class == PARS_FUNC_PREDEFINED) {
-
-		if (func == PARS_NOTFOUND_TOKEN) {
+		return;
+	case PARS_FUNC_PREDEFINED:
+		switch (func) {
+		case PARS_NOTFOUND_TOKEN:
 			eval_notfound(func_node);
-		} else if (func == PARS_SUBSTR_TOKEN) {
+			return;
+		case PARS_SUBSTR_TOKEN:
 			eval_substr(func_node);
-		} else if (func == PARS_REPLSTR_TOKEN) {
+			return;
+		case PARS_REPLSTR_TOKEN:
 			eval_replstr(func_node);
-		} else if (func == PARS_INSTR_TOKEN) {
+			return;
+		case PARS_INSTR_TOKEN:
 			eval_instr(func_node);
-		} else if (func == PARS_BINARY_TO_NUMBER_TOKEN) {
+			return;
+		case PARS_BINARY_TO_NUMBER_TOKEN:
 			eval_binary_to_number(func_node);
-		} else if (func == PARS_CONCAT_TOKEN) {
+			return;
+		case PARS_CONCAT_TOKEN:
 			eval_concat(func_node);
-		} else if (func == PARS_TO_BINARY_TOKEN) {
+			return;
+		case PARS_TO_BINARY_TOKEN:
 			eval_to_binary(func_node);
-		} else {
+			return;
+		default:
 			eval_predefined(func_node);
+			return;
 		}
-	} else {
-		ut_ad(class == PARS_FUNC_LOGICAL);
-
+	case PARS_FUNC_LOGICAL:
 		eval_logical(func_node);
+		return;
 	}
+
+	ut_error;
 }
diff --git a/storage/innobase/eval/eval0proc.c b/storage/innobase/eval/eval0proc.cc
index 3a4218d92bf..e6f3a32cd48 100644
--- a/storage/innobase/eval/eval0proc.c
+++ b/storage/innobase/eval/eval0proc.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1998, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file eval/eval0proc.c
+@file eval/eval0proc.cc
 Executes SQL stored procedures and their control structures
 
 Created 1/20/1998 Heikki Tuuri
@@ -43,7 +43,7 @@ if_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<if_node_t*>(thr->run_node);
 	ut_ad(que_node_get_type(node) == QUE_NODE_IF);
 
 	if (thr->prev_node == que_node_get_parent(node)) {
@@ -80,7 +80,8 @@ if_step(
 					break;
 				}
 
-				elsif_node = que_node_get_next(elsif_node);
+				elsif_node = static_cast<elsif_node_t*>(
+					que_node_get_next(elsif_node));
 
 				if (elsif_node == NULL) {
 					thr->run_node = NULL;
@@ -118,7 +119,7 @@ while_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<while_node_t*>(thr->run_node);
 	ut_ad(que_node_get_type(node) == QUE_NODE_WHILE);
 
 	ut_ad((thr->prev_node == que_node_get_parent(node))
@@ -154,7 +155,7 @@ assign_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<assign_node_t*>(thr->run_node);
 	ut_ad(que_node_get_type(node) == QUE_NODE_ASSIGNMENT);
 
 	/* Evaluate the value to assign */
@@ -183,7 +184,7 @@ for_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<for_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_FOR);
 
@@ -244,7 +245,7 @@ exit_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<exit_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_EXIT);
 
@@ -276,7 +277,7 @@ return_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<return_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_RETURN);
 
diff --git a/storage/innobase/fil/fil0fil.c b/storage/innobase/fil/fil0fil.cc
index 0a467d40345..4c6ed9807f6 100644
--- a/storage/innobase/fil/fil0fil.c
+++ b/storage/innobase/fil/fil0fil.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file fil/fil0fil.c
+@file fil/fil0fil.cc
 The tablespace memory cache
 
 Created 10/25/1995 Heikki Tuuri
@@ -40,12 +40,15 @@ Created 10/25/1995 Heikki Tuuri
 #include "dict0dict.h"
 #include "page0page.h"
 #include "page0zip.h"
+#include "trx0sys.h"
+#include "buf0rea.h"
 #ifndef UNIV_HOTBACKUP
 # include "buf0lru.h"
 # include "ibuf0ibuf.h"
 # include "sync0sync.h"
 # include "os0sync.h"
 #else /* !UNIV_HOTBACKUP */
+# include "srv0srv.h"
 static ulint srv_data_read, srv_data_written;
 #endif /* !UNIV_HOTBACKUP */
 
@@ -118,6 +121,9 @@ UNIV_INTERN ulint	fil_n_pending_log_flushes		= 0;
 /** Number of pending tablespace flushes */
 UNIV_INTERN ulint	fil_n_pending_tablespace_flushes	= 0;
 
+/** Number of files currently open */
+UNIV_INTERN ulint	fil_n_file_opened			= 0;
+
 /** The null file address */
 UNIV_INTERN fil_addr_t	fil_addr_null = {FIL_NULL, 0};
 
@@ -151,6 +157,9 @@ struct fil_node_struct {
 				/*!< count of pending flushes on this file;
 				closing of the file is not allowed if
 				this is > 0 */
+	ibool		being_extended;
+				/*!< TRUE if the node is currently
+				being extended. */
 	ib_int64_t	modification_counter;/*!< when we write to the file we
 				increment this by one */
 	ib_int64_t	flush_counter;/*!< up to what
@@ -185,7 +194,7 @@ struct fil_space_struct {
 				.ibd file of tablespace and want to
 				stop temporarily posting of new i/o
 				requests on the file */
-	ibool		stop_new_ops;
+	ibool		stop_ibuf_merges;
 				/*!< we set this TRUE when we start
 				deleting a single-table tablespace */
 	ibool		is_being_deleted;
@@ -203,20 +212,20 @@ struct fil_space_struct {
 				tablespace whose size we do not know yet;
 				last incomplete megabytes in data files may be
 				ignored if space == 0 */
-	ulint		flags;	/*!< compressed page size and file format, or 0 */
+	ulint		flags;	/*!< tablespace flags; see
+				fsp_flags_validate(), fsp_flags_get_zip_size() */
 	ulint		n_reserved_extents;
 				/*!< number of reserved free extents for
 				ongoing operations like B-tree page split */
 	ulint		n_pending_flushes; /*!< this is positive when flushing
 				the tablespace to disk; dropping of the
 				tablespace is forbidden if this is positive */
-	ulint		n_pending_ops;/*!< this is positive when we
-				have pending operations against this
-				tablespace. The pending operations can
-				be ibuf merges or lock validation code
-				trying to read a block.
-				Dropping of the tablespace is forbidden
-				if this is positive */
+	ulint		n_pending_ibuf_merges;/*!< this is positive
+				when merging insert buffer entries to
+				a page so that we may need to access
+				the ibuf bitmap page in the
+				tablespade: dropping of the tablespace
+				is forbidden if this is positive */
 	hash_node_t	hash;	/*!< hash chain node */
 	hash_node_t	name_hash;/*!< hash chain the name_hash table */
 #ifndef UNIV_HOTBACKUP
@@ -300,6 +309,9 @@ struct fil_system_struct {
 initialized. */
 static fil_system_t*	fil_system	= NULL;
 
+/** Determine if (i) is a user tablespace id or not. */
+# define fil_is_user_tablespace_id(i) ((i) > srv_undo_tablespaces)
+
 #ifdef UNIV_DEBUG
 /** Try fil_validate() every this many times */
 # define FIL_VALIDATE_SKIP	17
@@ -330,6 +342,19 @@ fil_validate_skip(void)
 #endif /* UNIV_DEBUG */
 
 /********************************************************************//**
+Determines if a file node belongs to the least-recently-used list.
+@return TRUE if the file belongs to fil_system->LRU mutex. */
+UNIV_INLINE
+ibool
+fil_space_belongs_in_lru(
+/*=====================*/
+	const fil_space_t*	space)	/*!< in: file space */
+{
+	return(space->purpose == FIL_TABLESPACE
+	       && fil_is_user_tablespace_id(space->id));
+}
+
+/********************************************************************//**
 NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
 
 Prepares a file node for i/o. Opens the file if it is closed. Updates the
@@ -610,21 +635,15 @@ fil_node_create(
 
 	mutex_enter(&fil_system->mutex);
 
-	node = mem_alloc(sizeof(fil_node_t));
+	node = static_cast<fil_node_t*>(mem_zalloc(sizeof(fil_node_t)));
 
 	node->name = mem_strdup(name);
-	node->open = FALSE;
 
 	ut_a(!is_raw || srv_start_raw_disk_in_use);
 
 	node->is_raw_disk = is_raw;
 	node->size = size;
 	node->magic_n = FIL_NODE_MAGIC_N;
-	node->n_pending = 0;
-	node->n_pending_flushes = 0;
-
-	node->modification_counter = 0;
-	node->flush_counter = 0;
 
 	space = fil_space_get_by_id(id);
 
@@ -659,7 +678,7 @@ fil_node_create(
 }
 
 /********************************************************************//**
-Opens a the file of a node of a tablespace. The caller must own the fil_system
+Opens a file of a node of a tablespace. The caller must own the fil_system
 mutex. */
 static
 void
@@ -669,15 +688,14 @@ fil_node_open_file(
 	fil_system_t*	system,	/*!< in: tablespace memory cache */
 	fil_space_t*	space)	/*!< in: space */
 {
-	ib_int64_t	size_bytes;
-	ulint		size_low;
-	ulint		size_high;
+	os_offset_t	size_bytes;
 	ibool		ret;
 	ibool		success;
 	byte*		buf2;
 	byte*		page;
 	ulint		space_id;
 	ulint		flags;
+	ulint		page_size;
 
 	ut_ad(mutex_own(&(system->mutex)));
 	ut_a(node->n_pending == 0);
@@ -709,10 +727,8 @@ fil_node_open_file(
 			ut_a(0);
 		}
 
-		os_file_get_size(node->handle, &size_low, &size_high);
-
-		size_bytes = (((ib_int64_t)size_high) << 32)
-			+ (ib_int64_t)size_low;
+		size_bytes = os_file_get_size(node->handle);
+		ut_a(size_bytes != (os_offset_t) -1);
 #ifdef UNIV_HOTBACKUP
 		if (space->id == 0) {
 			node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
@@ -721,17 +737,16 @@ fil_node_open_file(
 		}
 #endif /* UNIV_HOTBACKUP */
 		ut_a(space->purpose != FIL_LOG);
-		ut_a(space->id != 0);
+		ut_a(fil_is_user_tablespace_id(space->id));
 
 		if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
 			fprintf(stderr,
 				"InnoDB: Error: the size of single-table"
 				" tablespace file %s\n"
-				"InnoDB: is only %lu %lu,"
+				"InnoDB: is only "UINT64PF","
 				" should be at least %lu!\n",
 				node->name,
-				(ulong) size_high,
-				(ulong) size_low,
+				size_bytes,
 				(ulong) (FIL_IBD_FILE_INITIAL_SIZE
 					 * UNIV_PAGE_SIZE));
 
@@ -740,15 +755,15 @@ fil_node_open_file(
 
 		/* Read the first page of the tablespace */
 
-		buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+		buf2 = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
 		/* Align the memory for file i/o if we might have O_DIRECT
 		set */
-		page = ut_align(buf2, UNIV_PAGE_SIZE);
+		page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 
-		success = os_file_read(node->handle, page, 0, 0,
-				       UNIV_PAGE_SIZE);
+		success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE);
 		space_id = fsp_header_get_space_id(page);
 		flags = fsp_header_get_flags(page);
+		page_size = fsp_flags_get_page_size(flags);
 
 		ut_free(buf2);
 
@@ -776,6 +791,19 @@ fil_node_open_file(
 			ut_error;
 		}
 
+		if (UNIV_UNLIKELY(fsp_flags_get_page_size(space->flags)
+				  != page_size)) {
+			fprintf(stderr,
+				"InnoDB: Error: tablespace file %s"
+				" has page size %lx\n"
+				"InnoDB: but the data dictionary"
+				" expects page size %lx!\n",
+				node->name, flags,
+				fsp_flags_get_page_size(space->flags));
+
+			ut_error;
+		}
+
 		if (UNIV_UNLIKELY(space->flags != flags)) {
 			fprintf(stderr,
 				"InnoDB: Error: table flags are %lx"
@@ -791,12 +819,12 @@ fil_node_open_file(
 			size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
 		}
 
-		if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+		if (!fsp_flags_is_compressed(flags)) {
 			node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
 		} else {
 			node->size = (ulint)
 				(size_bytes
-				 / dict_table_flags_to_zip_size(flags));
+				 / fsp_flags_get_zip_size(flags));
 		}
 
 #ifdef UNIV_HOTBACKUP
@@ -834,8 +862,10 @@ add_size:
 	node->open = TRUE;
 
 	system->n_open++;
+	fil_n_file_opened++;
+
+	if (fil_space_belongs_in_lru(space)) {
 
-	if (space->purpose == FIL_TABLESPACE && space->id != 0) {
 		/* Put the node to the LRU list */
 		UT_LIST_ADD_FIRST(LRU, system->LRU, node);
 	}
@@ -857,8 +887,11 @@ fil_node_close_file(
 	ut_a(node->open);
 	ut_a(node->n_pending == 0);
 	ut_a(node->n_pending_flushes == 0);
+	ut_a(!node->being_extended);
+#ifndef UNIV_HOTBACKUP
 	ut_a(node->modification_counter == node->flush_counter
 	     || srv_fast_shutdown == 2);
+#endif /* !UNIV_HOTBACKUP */
 
 	ret = os_file_close(node->handle);
 	ut_a(ret);
@@ -868,8 +901,10 @@ fil_node_close_file(
 	node->open = FALSE;
 	ut_a(system->n_open > 0);
 	system->n_open--;
+	fil_n_file_opened--;
+
+	if (fil_space_belongs_in_lru(node->space)) {
 
-	if (node->space->purpose == FIL_TABLESPACE && node->space->id != 0) {
 		ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
 
 		/* The node is in the LRU list, remove it */
@@ -896,32 +931,37 @@ fil_try_to_close_file_in_LRU(
 
 	ut_ad(mutex_own(&fil_system->mutex));
 
-	node = UT_LIST_GET_LAST(fil_system->LRU);
-
 	if (print_info) {
 		fprintf(stderr,
 			"InnoDB: fil_sys open file LRU len %lu\n",
 			(ulong) UT_LIST_GET_LEN(fil_system->LRU));
 	}
 
-	while (node != NULL) {
+	for (node = UT_LIST_GET_LAST(fil_system->LRU);
+	     node != NULL;
+	     node = UT_LIST_GET_PREV(LRU, node)) {
+
 		if (node->modification_counter == node->flush_counter
-		    && node->n_pending_flushes == 0) {
+		    && node->n_pending_flushes == 0
+		    && !node->being_extended) {
 
 			fil_node_close_file(node, fil_system);
 
 			return(TRUE);
 		}
 
-		if (print_info && node->n_pending_flushes > 0) {
+		if (!print_info) {
+			continue;
+		}
+
+		if (node->n_pending_flushes > 0) {
 			fputs("InnoDB: cannot close file ", stderr);
 			ut_print_filename(stderr, node->name);
 			fprintf(stderr, ", because n_pending_flushes %lu\n",
 				(ulong) node->n_pending_flushes);
 		}
 
-		if (print_info
-		    && node->modification_counter != node->flush_counter) {
+		if (node->modification_counter != node->flush_counter) {
 			fputs("InnoDB: cannot close file ", stderr);
 			ut_print_filename(stderr, node->name);
 			fprintf(stderr,
@@ -930,7 +970,11 @@ fil_try_to_close_file_in_LRU(
 				(long) node->flush_counter);
 		}
 
-		node = UT_LIST_GET_PREV(LRU, node);
+		if (node->being_extended) {
+			fputs("InnoDB: cannot close file ", stderr);
+			ut_print_filename(stderr, node->name);
+			fprintf(stderr, ", because it is being extended\n");
+		}
 	}
 
 	return(FALSE);
@@ -1088,6 +1132,7 @@ fil_node_free(
 	ut_ad(mutex_own(&(system->mutex)));
 	ut_a(node->magic_n == FIL_NODE_MAGIC_N);
 	ut_a(node->n_pending == 0);
+	ut_a(!node->being_extended);
 
 	if (node->open) {
 		/* We fool the assertion in fil_node_close_file() to think
@@ -1153,8 +1198,8 @@ fil_space_truncate_start(
 #endif /* UNIV_LOG_ARCHIVE */
 
 /*******************************************************************//**
-Creates a space memory object and puts it to the tablespace memory cache. If
-there is an error, prints an error message to the .err log.
+Creates a space memory object and puts it to the 'fil system' hash table.
+If there is an error, prints an error message to the .err log.
 @return	TRUE if success */
 UNIV_INTERN
 ibool
@@ -1162,20 +1207,12 @@ fil_space_create(
 /*=============*/
 	const char*	name,	/*!< in: space name */
 	ulint		id,	/*!< in: space id */
-	ulint		flags,	/*!< in: compressed page size
-				and file format, or 0 */
+	ulint		flags,	/*!< in: tablespace flags */
 	ulint		purpose)/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
 {
 	fil_space_t*	space;
 
-	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
-	ROW_FORMAT=COMPACT
-	((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
-	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
-	format, the tablespace flags should equal
-	(table->flags & ~(~0 << DICT_TF_BITS)). */
-	ut_a(flags != DICT_TF_COMPACT);
-	ut_a(!(flags & (~0UL << DICT_TF_BITS)));
+	fsp_flags_validate(flags);
 
 try_again:
 	/*printf(
@@ -1257,7 +1294,7 @@ try_again:
 		return(FALSE);
 	}
 
-	space = mem_alloc(sizeof(fil_space_t));
+	space = static_cast<fil_space_t*>(mem_zalloc(sizeof(*space)));
 
 	space->name = mem_strdup(name);
 	space->id = id;
@@ -1282,19 +1319,9 @@ try_again:
 		fil_system->max_assigned_id = id;
 	}
 
-	space->stop_ios = FALSE;
-	space->stop_new_ops = FALSE;
-	space->is_being_deleted = FALSE;
 	space->purpose = purpose;
-	space->size = 0;
 	space->flags = flags;
 
-	space->n_reserved_extents = 0;
-
-	space->n_pending_flushes = 0;
-	space->n_pending_ops = 0;
-
-	UT_LIST_INIT(space->chain);
 	space->magic_n = FIL_SPACE_MAGIC_N;
 
 	rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP);
@@ -1387,7 +1414,7 @@ fil_space_free(
 					in X mode */
 {
 	fil_space_t*	space;
-	fil_space_t*	namespace;
+	fil_space_t*	fnamespace;
 	fil_node_t*	fil_node;
 
 	ut_ad(mutex_own(&fil_system->mutex));
@@ -1406,9 +1433,9 @@ fil_space_free(
 
 	HASH_DELETE(fil_space_t, hash, fil_system->spaces, id, space);
 
-	namespace = fil_space_get_by_name(space->name);
-	ut_a(namespace);
-	ut_a(space == namespace);
+	fnamespace = fil_space_get_by_name(space->name);
+	ut_a(fnamespace);
+	ut_a(space == fnamespace);
 
 	HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
 		    ut_fold_string(space->name), space);
@@ -1563,7 +1590,7 @@ fil_space_get_zip_size(
 
 	if (flags && flags != ULINT_UNDEFINED) {
 
-		return(dict_table_flags_to_zip_size(flags));
+		return(fsp_flags_get_zip_size(flags));
 	}
 
 	return(flags);
@@ -1602,7 +1629,8 @@ fil_init(
 	ut_a(hash_size > 0);
 	ut_a(max_n_open > 0);
 
-	fil_system = mem_zalloc(sizeof(fil_system_t));
+	fil_system = static_cast<fil_system_t*>(
+		mem_zalloc(sizeof(fil_system_t)));
 
 	mutex_create(fil_system_mutex_key,
 		     &fil_system->mutex, SYNC_ANY_LATCH);
@@ -1627,47 +1655,51 @@ fil_open_log_and_system_tablespace_files(void)
 /*==========================================*/
 {
 	fil_space_t*	space;
-	fil_node_t*	node;
 
 	mutex_enter(&fil_system->mutex);
 
-	space = UT_LIST_GET_FIRST(fil_system->space_list);
+	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+	     space != NULL;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
 
-	while (space != NULL) {
-		if (space->purpose != FIL_TABLESPACE || space->id == 0) {
-			node = UT_LIST_GET_FIRST(space->chain);
+		fil_node_t*	node;
 
-			while (node != NULL) {
-				if (!node->open) {
-					fil_node_open_file(node, fil_system,
-							   space);
-				}
-				if (fil_system->max_n_open
-				    < 10 + fil_system->n_open) {
-					fprintf(stderr,
-						"InnoDB: Warning: you must"
-						" raise the value of"
-						" innodb_open_files in\n"
-						"InnoDB: my.cnf! Remember that"
-						" InnoDB keeps all log files"
-						" and all system\n"
-						"InnoDB: tablespace files open"
-						" for the whole time mysqld is"
-						" running, and\n"
-						"InnoDB: needs to open also"
-						" some .ibd files if the"
-						" file-per-table storage\n"
-						"InnoDB: model is used."
-						" Current open files %lu,"
-						" max allowed"
-						" open files %lu.\n",
-						(ulong) fil_system->n_open,
-						(ulong) fil_system->max_n_open);
-				}
-				node = UT_LIST_GET_NEXT(chain, node);
+		if (fil_space_belongs_in_lru(space)) {
+
+			continue;
+		}
+
+		for (node = UT_LIST_GET_FIRST(space->chain);
+		     node != NULL;
+		     node = UT_LIST_GET_NEXT(chain, node)) {
+
+			if (!node->open) {
+				fil_node_open_file(node, fil_system, space);
+			}
+
+			if (fil_system->max_n_open < 10 + fil_system->n_open) {
+
+				fprintf(stderr,
+					"InnoDB: Warning: you must"
+					" raise the value of"
+					" innodb_open_files in\n"
+					"InnoDB: my.cnf! Remember that"
+					" InnoDB keeps all log files"
+					" and all system\n"
+					"InnoDB: tablespace files open"
+					" for the whole time mysqld is"
+					" running, and\n"
+					"InnoDB: needs to open also"
+					" some .ibd files if the"
+					" file-per-table storage\n"
+					"InnoDB: model is used."
+					" Current open files %lu,"
+					" max allowed"
+					" open files %lu.\n",
+					(ulong) fil_system->n_open,
+					(ulong) fil_system->max_n_open);
 			}
 		}
-		space = UT_LIST_GET_NEXT(space_list, space);
 	}
 
 	mutex_exit(&fil_system->mutex);
@@ -1742,23 +1774,24 @@ static
 ulint
 fil_write_lsn_and_arch_no_to_file(
 /*==============================*/
-	ulint		sum_of_sizes,	/*!< in: combined size of previous files
-					in space, in database pages */
-	ib_uint64_t	lsn,		/*!< in: lsn to write */
-	ulint		arch_log_no __attribute__((unused)))
-					/*!< in: archived log number to write */
+	ulint	space,		/*!< in: space to write to */
+	ulint	sum_of_sizes,	/*!< in: combined size of previous files
+				in space, in database pages */
+	lsn_t	lsn,		/*!< in: lsn to write */
+	ulint	arch_log_no __attribute__((unused)))
+				/*!< in: archived log number to write */
 {
 	byte*	buf1;
 	byte*	buf;
 
-	buf1 = mem_alloc(2 * UNIV_PAGE_SIZE);
-	buf = ut_align(buf1, UNIV_PAGE_SIZE);
+	buf1 = static_cast<byte*>(mem_alloc(2 * UNIV_PAGE_SIZE));
+	buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE));
 
-	fil_read(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
+	fil_read(TRUE, space, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
 
 	mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
 
-	fil_write(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
+	fil_write(TRUE, space, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
 
 	mem_free(buf1);
 
@@ -1773,36 +1806,40 @@ UNIV_INTERN
 ulint
 fil_write_flushed_lsn_to_data_files(
 /*================================*/
-	ib_uint64_t	lsn,		/*!< in: lsn to write */
-	ulint		arch_log_no)	/*!< in: latest archived log
-					file number */
+	lsn_t	lsn,		/*!< in: lsn to write */
+	ulint	arch_log_no)	/*!< in: latest archived log file number */
 {
 	fil_space_t*	space;
 	fil_node_t*	node;
-	ulint		sum_of_sizes;
 	ulint		err;
 
 	mutex_enter(&fil_system->mutex);
 
-	space = UT_LIST_GET_FIRST(fil_system->space_list);
+	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+	     space != NULL;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
 
-	while (space) {
 		/* We only write the lsn to all existing data files which have
 		been open during the lifetime of the mysqld process; they are
 		represented by the space objects in the tablespace memory
-		cache. Note that all data files in the system tablespace 0 are
-		always open. */
+		cache. Note that all data files in the system tablespace 0
+		and the UNDO log tablespaces (if separate) are always open. */
 
 		if (space->purpose == FIL_TABLESPACE
-		    && space->id == 0) {
-			sum_of_sizes = 0;
+		    && !fil_is_user_tablespace_id(space->id)) {
+
+			ulint	sum_of_sizes = 0;
+
+			for (node = UT_LIST_GET_FIRST(space->chain);
+			     node != NULL;
+			     node = UT_LIST_GET_NEXT(chain, node)) {
 
-			node = UT_LIST_GET_FIRST(space->chain);
-			while (node) {
 				mutex_exit(&fil_system->mutex);
 
 				err = fil_write_lsn_and_arch_no_to_file(
-					sum_of_sizes, lsn, arch_log_no);
+					space->id, sum_of_sizes, lsn,
+					arch_log_no);
+
 				if (err != DB_SUCCESS) {
 
 					return(err);
@@ -1811,10 +1848,8 @@ fil_write_flushed_lsn_to_data_files(
 				mutex_enter(&fil_system->mutex);
 
 				sum_of_sizes += node->size;
-				node = UT_LIST_GET_NEXT(chain, node);
 			}
 		}
-		space = UT_LIST_GET_NEXT(space_list, space);
 	}
 
 	mutex_exit(&fil_system->mutex);
@@ -1840,25 +1875,26 @@ fil_read_first_page(
 	ulint*		max_arch_log_no,	/*!< out: max of archived
 						log numbers in data files */
 #endif /* UNIV_LOG_ARCHIVE */
-	ib_uint64_t*	min_flushed_lsn,	/*!< out: min of flushed
+	lsn_t*		min_flushed_lsn,	/*!< out: min of flushed
 						lsn values in data files */
-	ib_uint64_t*	max_flushed_lsn)	/*!< out: max of flushed
+	lsn_t*		max_flushed_lsn)	/*!< out: max of flushed
 						lsn values in data files */
 {
-	byte*		buf;
-	page_t*		page;
-	ib_uint64_t	flushed_lsn;
+	byte*	buf;
+	byte*	page;
+	lsn_t	flushed_lsn;
+
+	buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
 
-	buf = ut_malloc(2 * UNIV_PAGE_SIZE);
 	/* Align the memory for a possible read from a raw device */
-	page = ut_align(buf, UNIV_PAGE_SIZE);
 
-	os_file_read(data_file, page, 0, 0, UNIV_PAGE_SIZE);
+	page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
 
-	*flags = mach_read_from_4(page +
-		FSP_HEADER_OFFSET + FSP_SPACE_FLAGS);
+	os_file_read(data_file, page, 0, UNIV_PAGE_SIZE);
 
-	flushed_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN);
+	*flags = fsp_header_get_flags(page);
+
+	flushed_lsn = mach_read_from_8(page+ FIL_PAGE_FILE_FLUSH_LSN);
 
 	ut_free(buf);
 
@@ -1892,12 +1928,13 @@ fil_read_first_page(
 
 #ifndef UNIV_HOTBACKUP
 /*******************************************************************//**
-Increments the count of pending operation, if space is not being deleted.
-@return	TRUE if being deleted, and operation should be skipped */
+Increments the count of pending insert buffer page merges, if space is not
+being deleted.
+@return	TRUE if being deleted, and ibuf merges should be skipped */
 UNIV_INTERN
 ibool
-fil_inc_pending_ops(
-/*================*/
+fil_inc_pending_ibuf_merges(
+/*========================*/
 	ulint	id)	/*!< in: space id */
 {
 	fil_space_t*	space;
@@ -1908,18 +1945,18 @@ fil_inc_pending_ops(
 
 	if (space == NULL) {
 		fprintf(stderr,
-			"InnoDB: Error: trying to do an operation on a"
+			"InnoDB: Error: trying to do ibuf merge to a"
 			" dropped tablespace %lu\n",
 			(ulong) id);
 	}
 
-	if (space == NULL || space->stop_new_ops) {
+	if (space == NULL || space->stop_ibuf_merges) {
 		mutex_exit(&fil_system->mutex);
 
 		return(TRUE);
 	}
 
-	space->n_pending_ops++;
+	space->n_pending_ibuf_merges++;
 
 	mutex_exit(&fil_system->mutex);
 
@@ -1927,11 +1964,11 @@ fil_inc_pending_ops(
 }
 
 /*******************************************************************//**
-Decrements the count of pending operations. */
+Decrements the count of pending insert buffer page merges. */
 UNIV_INTERN
 void
-fil_decr_pending_ops(
-/*=================*/
+fil_decr_pending_ibuf_merges(
+/*=========================*/
 	ulint	id)	/*!< in: space id */
 {
 	fil_space_t*	space;
@@ -1942,13 +1979,13 @@ fil_decr_pending_ops(
 
 	if (space == NULL) {
 		fprintf(stderr,
-			"InnoDB: Error: decrementing pending operation"
-			" of a dropped tablespace %lu\n",
+			"InnoDB: Error: decrementing ibuf merge of a"
+			" dropped tablespace %lu\n",
 			(ulong) id);
 	}
 
 	if (space != NULL) {
-		space->n_pending_ops--;
+		space->n_pending_ibuf_merges--;
 	}
 
 	mutex_exit(&fil_system->mutex);
@@ -1971,7 +2008,7 @@ fil_create_directory_for_tablename(
 	len = strlen(fil_path_to_mysql_datadir);
 	namend = strchr(name, '/');
 	ut_a(namend);
-	path = mem_alloc(len + (namend - name) + 2);
+	path = static_cast<char*>(mem_alloc(len + (namend - name) + 2));
 
 	memcpy(path, fil_path_to_mysql_datadir, len);
 	path[len] = '/';
@@ -2150,7 +2187,7 @@ fil_op_log_parse_or_replay(
 
 	/* Let us try to perform the file operation, if sensible. Note that
 	ibbackup has at this stage already read in all space id info to the
-	fil0fil.c data structures.
+	fil0fil.cc data structures.
 
 	NOTE that our algorithm is not guaranteed to work correctly if there
 	were renames of tables during the backup. See ibbackup code for more
@@ -2159,7 +2196,7 @@ fil_op_log_parse_or_replay(
 	switch (type) {
 	case MLOG_FILE_DELETE:
 		if (fil_tablespace_exists_in_mem(space_id)) {
-			ut_a(fil_delete_tablespace(space_id, TRUE));
+			ut_a(fil_delete_tablespace(space_id));
 		}
 
 		break;
@@ -2207,6 +2244,7 @@ fil_op_log_parse_or_replay(
 
 			if (fil_create_new_single_table_tablespace(
 				    space_id, name, FALSE, flags,
+				    DICT_TF2_USE_TABLESPACE,
 				    FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
 				ut_error;
 			}
@@ -2229,9 +2267,7 @@ UNIV_INTERN
 ibool
 fil_delete_tablespace(
 /*==================*/
-	ulint	id,		/*!< in: space id */
-	ibool	evict_all)	/*!< in: TRUE if we want all pages
-				evicted from LRU. */
+	ulint	id)	/*!< in: space id */
 {
 	ibool		success;
 	fil_space_t*	space;
@@ -2240,15 +2276,15 @@ fil_delete_tablespace(
 	char*		path;
 
 	ut_a(id != 0);
-stop_new_ops:
+stop_ibuf_merges:
 	mutex_enter(&fil_system->mutex);
 
 	space = fil_space_get_by_id(id);
 
 	if (space != NULL) {
-		space->stop_new_ops = TRUE;
+		space->stop_ibuf_merges = TRUE;
 
-		if (space->n_pending_ops == 0) {
+		if (space->n_pending_ibuf_merges == 0) {
 			mutex_exit(&fil_system->mutex);
 
 			count = 0;
@@ -2262,10 +2298,9 @@ stop_new_ops:
 				ut_print_filename(stderr, space->name);
 				fprintf(stderr, ",\n"
 					"InnoDB: but there are %lu pending"
-					" operations (most likely ibuf merges)"
-					" on it.\n"
+					" ibuf merges on it.\n"
 					"InnoDB: Loop %lu.\n",
-					(ulong) space->n_pending_ops,
+					(ulong) space->n_pending_ibuf_merges,
 					(ulong) count);
 			}
 
@@ -2274,7 +2309,7 @@ stop_new_ops:
 			os_thread_sleep(20000);
 			count++;
 
-			goto stop_new_ops;
+			goto stop_ibuf_merges;
 		}
 	}
 
@@ -2300,14 +2335,18 @@ try_again:
 	}
 
 	ut_a(space);
-	ut_a(space->n_pending_ops == 0);
+	ut_a(space->n_pending_ibuf_merges == 0);
 
 	space->is_being_deleted = TRUE;
 
+	/* TODO: The following code must change when InnoDB supports
+	multiple datafiles per tablespace. */
 	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+
 	node = UT_LIST_GET_FIRST(space->chain);
 
-	if (space->n_pending_flushes > 0 || node->n_pending > 0) {
+	if (space->n_pending_flushes > 0 || node->n_pending > 0
+	    || node->being_extended) {
 		if (count > 1000) {
 			ut_print_timestamp(stderr);
 			fputs("  InnoDB: Warning: trying to"
@@ -2316,6 +2355,7 @@ try_again:
 			fprintf(stderr, ",\n"
 				"InnoDB: but there are %lu flushes"
 				" and %lu pending i/o's on it\n"
+				"InnoDB: Or it is being extended\n"
 				"InnoDB: Loop %lu.\n",
 				(ulong) space->n_pending_flushes,
 				(ulong) node->n_pending,
@@ -2329,7 +2369,7 @@ try_again:
 		goto try_again;
 	}
 
-	path = mem_strdup(space->name);
+	path = mem_strdup(node->name);
 
 	mutex_exit(&fil_system->mutex);
 
@@ -2353,10 +2393,7 @@ try_again:
 	completely and permanently. The flag is_being_deleted also prevents
 	fil_flush() from being applied to this tablespace. */
 
-	buf_LRU_flush_or_remove_pages(
-		id, evict_all
-		? BUF_REMOVE_ALL_NO_WRITE
-		: BUF_REMOVE_FLUSH_NO_WRITE);
+	buf_LRU_invalidate_tablespace(id);
 #endif
 	/* printf("Deleting tablespace %s id %lu\n", space->name, id); */
 
@@ -2444,7 +2481,7 @@ fil_discard_tablespace(
 {
 	ibool	success;
 
-	success = fil_delete_tablespace(id, TRUE);
+	success = fil_delete_tablespace(id);
 
 	if (!success) {
 		fprintf(stderr,
@@ -2472,7 +2509,8 @@ fil_rename_tablespace_in_mem(
 /*=========================*/
 	fil_space_t*	space,	/*!< in: tablespace memory object */
 	fil_node_t*	node,	/*!< in: file node of that tablespace */
-	const char*	path)	/*!< in: new name */
+	const char*	new_name,	/*!< in: new name */
+	const char*	new_path)	/*!< in: new file path */
 {
 	fil_space_t*	space2;
 	const char*	old_name	= space->name;
@@ -2488,10 +2526,10 @@ fil_rename_tablespace_in_mem(
 		return(FALSE);
 	}
 
-	space2 = fil_space_get_by_name(path);
+	space2 = fil_space_get_by_name(new_name);
 	if (space2 != NULL) {
 		fputs("InnoDB: Error: ", stderr);
-		ut_print_filename(stderr, path);
+		ut_print_filename(stderr, new_name);
 		fputs(" is already in tablespace memory cache\n", stderr);
 
 		return(FALSE);
@@ -2502,11 +2540,11 @@ fil_rename_tablespace_in_mem(
 	mem_free(space->name);
 	mem_free(node->name);
 
-	space->name = mem_strdup(path);
-	node->name = mem_strdup(path);
+	space->name = mem_strdup(new_name);
+	node->name = mem_strdup(new_path);
 
 	HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
-		    ut_fold_string(path), space);
+		    ut_fold_string(new_name), space);
 	return(TRUE);
 }
 
@@ -2522,9 +2560,12 @@ fil_make_ibd_name(
 					TEMPORARY table */
 	ibool		is_temp)	/*!< in: TRUE if it is a dir path */
 {
+	char*	filename;
 	ulint	namelen		= strlen(name);
 	ulint	dirlen		= strlen(fil_path_to_mysql_datadir);
-	char*	filename	= mem_alloc(namelen + dirlen + sizeof "/.ibd");
+
+	filename = static_cast<char*>(
+		mem_alloc(namelen + dirlen + sizeof "/.ibd"));
 
 	if (is_temp) {
 		memcpy(filename, name, namelen);
@@ -2550,7 +2591,7 @@ UNIV_INTERN
 ibool
 fil_rename_tablespace(
 /*==================*/
-	const char*	old_name,	/*!< in: old table name in the standard
+	const char*	old_name_in,	/*!< in: old table name in the standard
 					databasename/tablename format of
 					InnoDB, or NULL if we do the rename
 					based on the space id only */
@@ -2563,23 +2604,21 @@ fil_rename_tablespace(
 	fil_space_t*	space;
 	fil_node_t*	node;
 	ulint		count		= 0;
-	char*		path;
-	ibool		old_name_was_specified		= TRUE;
+	char*		new_path;
+	char*		old_name;
 	char*		old_path;
+	const char*	not_given	= "(name not specified)";
 
 	ut_a(id != 0);
 
-	if (old_name == NULL) {
-		old_name = "(name not specified)";
-		old_name_was_specified = FALSE;
-	}
 retry:
 	count++;
 
 	if (!(count % 1000)) {
 		ut_print_timestamp(stderr);
 		fputs("  InnoDB: Warning: problems renaming ", stderr);
-		ut_print_filename(stderr, old_name);
+		ut_print_filename(stderr,
+				  old_name_in ? old_name_in : not_given);
 		fputs(" to ", stderr);
 		ut_print_filename(stderr, new_name);
 		fprintf(stderr, ", %lu iterations\n", (ulong) count);
@@ -2594,7 +2633,8 @@ retry:
 			"InnoDB: Error: cannot find space id %lu"
 			" in the tablespace memory cache\n"
 			"InnoDB: though the table ", (ulong) id);
-		ut_print_filename(stderr, old_name);
+		ut_print_filename(stderr,
+				  old_name_in ? old_name_in : not_given);
 		fputs(" in a rename operation should have that id\n", stderr);
 		mutex_exit(&fil_system->mutex);
 
@@ -2617,8 +2657,10 @@ retry:
 	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
 	node = UT_LIST_GET_FIRST(space->chain);
 
-	if (node->n_pending > 0 || node->n_pending_flushes > 0) {
-		/* There are pending i/o's or flushes, sleep for a while and
+	if (node->n_pending > 0 || node->n_pending_flushes > 0
+	    || node->being_extended) {
+		/* There are pending i/o's or flushes or the file is
+		currently being extended, sleep for a while and
 		retry */
 
 		mutex_exit(&fil_system->mutex);
@@ -2646,34 +2688,35 @@ retry:
 
 	/* Check that the old name in the space is right */
 
-	if (old_name_was_specified) {
+	if (old_name_in) {
+		old_name = mem_strdup(old_name_in);
 		old_path = fil_make_ibd_name(old_name, FALSE);
 
-		ut_a(strcmp(space->name, old_path) == 0);
+		ut_a(strcmp(space->name, old_name) == 0);
 		ut_a(strcmp(node->name, old_path) == 0);
 	} else {
-		old_path = mem_strdup(space->name);
+		old_name = mem_strdup(space->name);
+		old_path = mem_strdup(node->name);
 	}
 
 	/* Rename the tablespace and the node in the memory cache */
-	path = fil_make_ibd_name(new_name, FALSE);
-	success = fil_rename_tablespace_in_mem(space, node, path);
+	new_path = fil_make_ibd_name(new_name, FALSE);
+	success = fil_rename_tablespace_in_mem(
+		space, node, new_name, new_path);
 
 	if (success) {
-		success = os_file_rename(innodb_file_data_key, old_path, path);
+		success = os_file_rename(
+			innodb_file_data_key, old_path, new_path);
 
 		if (!success) {
 			/* We have to revert the changes we made
 			to the tablespace memory cache */
 
-			ut_a(fil_rename_tablespace_in_mem(space, node,
-							  old_path));
+			ut_a(fil_rename_tablespace_in_mem(
+					space, node, old_name, old_path));
 		}
 	}
 
-	mem_free(path);
-	mem_free(old_path);
-
 	space->stop_ios = FALSE;
 
 	mutex_exit(&fil_system->mutex);
@@ -2689,6 +2732,11 @@ retry:
 		mtr_commit(&mtr);
 	}
 #endif
+
+	mem_free(new_path);
+	mem_free(old_path);
+	mem_free(old_name);
+
 	return(success);
 }
 
@@ -2711,6 +2759,7 @@ fil_create_new_single_table_tablespace(
 	ibool		is_temp,	/*!< in: TRUE if a table created with
 					CREATE TEMPORARY TABLE */
 	ulint		flags,		/*!< in: tablespace flags */
+	ulint		flags2,		/*!< in: table flags2 */
 	ulint		size)		/*!< in: the initial size of the
 					tablespace file in pages,
 					must be >= FIL_IBD_FILE_INITIAL_SIZE */
@@ -2720,26 +2769,23 @@ fil_create_new_single_table_tablespace(
 	ulint		err;
 	byte*		buf2;
 	byte*		page;
-	ibool		success;
 	char*		path;
+	ibool		success;
 
 	ut_a(space_id > 0);
 	ut_a(space_id < SRV_LOG_SPACE_FIRST_ID);
 	ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
-	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
-	ROW_FORMAT=COMPACT
-	((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
-	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
-	format, the tablespace flags should equal
-	(table->flags & ~(~0 << DICT_TF_BITS)). */
-	ut_a(flags != DICT_TF_COMPACT);
-	ut_a(!(flags & (~0UL << DICT_TF_BITS)));
+	fsp_flags_validate(flags);
 
 	path = fil_make_ibd_name(tablename, is_temp);
 
-	file = os_file_create(innodb_file_data_key, path,
-			      OS_FILE_CREATE, OS_FILE_NORMAL,
-			      OS_DATA_FILE, &ret);
+	file = os_file_create(
+		innodb_file_data_key, path,
+		OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
+		OS_FILE_NORMAL,
+		OS_DATA_FILE,
+		&ret);
+
 	if (ret == FALSE) {
 		ut_print_timestamp(stderr);
 		fputs("  InnoDB: Error creating file ", stderr);
@@ -2782,7 +2828,7 @@ fil_create_new_single_table_tablespace(
 		return(DB_ERROR);
 	}
 
-	ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE, 0);
+	ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE);
 
 	if (!ret) {
 		err = DB_OUT_OF_FILE_SPACE;
@@ -2806,25 +2852,26 @@ error_exit2:
 	with zeros from the call of os_file_set_size(), until a buffer pool
 	flush would write to it. */
 
-	buf2 = ut_malloc(3 * UNIV_PAGE_SIZE);
+	buf2 = static_cast<byte*>(ut_malloc(3 * UNIV_PAGE_SIZE));
 	/* Align the memory for file i/o if we might have O_DIRECT set */
-	page = ut_align(buf2, UNIV_PAGE_SIZE);
+	page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 
 	memset(page, '\0', UNIV_PAGE_SIZE);
 
+	/* Add the UNIV_PAGE_SIZE to the table flags and write them to the
+	tablespace header. */
+	flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE);
 	fsp_header_init_fields(page, space_id, flags);
 	mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
 
-	if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+	if (!(fsp_flags_is_compressed(flags))) {
 		buf_flush_init_for_writing(page, NULL, 0);
-		ret = os_file_write(path, file, page, 0, 0, UNIV_PAGE_SIZE);
+		ret = os_file_write(path, file, page, 0, UNIV_PAGE_SIZE);
 	} else {
 		page_zip_des_t	page_zip;
 		ulint		zip_size;
 
-		zip_size = ((PAGE_ZIP_MIN_SIZE >> 1)
-			    << ((flags & DICT_TF_ZSSIZE_MASK)
-				>> DICT_TF_ZSSIZE_SHIFT));
+		zip_size = fsp_flags_get_zip_size(flags);
 
 		page_zip_set_size(&page_zip, zip_size);
 		page_zip.data = page + UNIV_PAGE_SIZE;
@@ -2834,7 +2881,7 @@ error_exit2:
 			page_zip.m_end = page_zip.m_nonempty =
 			page_zip.n_blobs = 0;
 		buf_flush_init_for_writing(page, &page_zip, 0);
-		ret = os_file_write(path, file, page_zip.data, 0, 0, zip_size);
+		ret = os_file_write(path, file, page_zip.data, 0, zip_size);
 	}
 
 	ut_free(buf2);
@@ -2860,7 +2907,7 @@ error_exit2:
 
 	os_file_close(file);
 
-	success = fil_space_create(path, space_id, flags, FIL_TABLESPACE);
+	success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE);
 
 	if (!success) {
 		err = DB_ERROR;
@@ -2907,7 +2954,7 @@ fil_reset_too_high_lsns(
 /*====================*/
 	const char*	name,		/*!< in: table name in the
 					databasename/tablename format */
-	ib_uint64_t	current_lsn)	/*!< in: reset lsn's if the lsn stamped
+	lsn_t		current_lsn)	/*!< in: reset lsn's if the lsn stamped
 					to FIL_PAGE_FILE_FLUSH_LSN in the
 					first page is too high */
 {
@@ -2915,10 +2962,10 @@ fil_reset_too_high_lsns(
 	char*		filepath;
 	byte*		page;
 	byte*		buf2;
-	ib_uint64_t	flush_lsn;
+	lsn_t		flush_lsn;
 	ulint		space_id;
-	ib_int64_t	file_size;
-	ib_int64_t	offset;
+	os_offset_t	file_size;
+	os_offset_t	offset;
 	ulint		zip_size;
 	ibool		success;
 	page_zip_des_t	page_zip;
@@ -2946,11 +2993,11 @@ fil_reset_too_high_lsns(
 
 	/* Read the first page of the tablespace */
 
-	buf2 = ut_malloc(3 * UNIV_PAGE_SIZE);
+	buf2 = static_cast<byte*>(ut_malloc(3 * UNIV_PAGE_SIZE));
 	/* Align the memory for file i/o if we might have O_DIRECT set */
-	page = ut_align(buf2, UNIV_PAGE_SIZE);
+	page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 
-	success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+	success = os_file_read(file, page, 0, UNIV_PAGE_SIZE);
 	if (!success) {
 
 		goto func_exit;
@@ -2980,8 +3027,8 @@ fil_reset_too_high_lsns(
 	fprintf(stderr,
 		"  InnoDB: Flush lsn in the tablespace file %lu"
 		" to be imported\n"
-		"InnoDB: is %llu, which exceeds current"
-		" system lsn %llu.\n"
+		"InnoDB: is " LSN_PF ", which exceeds current"
+		" system lsn " LSN_PF ".\n"
 		"InnoDB: We reset the lsn's in the file ",
 		(ulong) space_id,
 		flush_lsn, current_lsn);
@@ -2989,18 +3036,17 @@ fil_reset_too_high_lsns(
 	fputs(".\n", stderr);
 
 	ut_a(ut_is_2pow(zip_size));
-	ut_a(zip_size <= UNIV_PAGE_SIZE);
+	ut_a(zip_size <= UNIV_ZIP_SIZE_MAX);
 
 	/* Loop through all the pages in the tablespace and reset the lsn and
 	the page checksum if necessary */
 
-	file_size = os_file_get_size_as_iblonglong(file);
+	file_size = os_file_get_size(file);
+	ut_a(file_size != (os_offset_t) -1);
 
 	for (offset = 0; offset < file_size;
 	     offset += zip_size ? zip_size : UNIV_PAGE_SIZE) {
-		success = os_file_read(file, page,
-				       (ulint)(offset & 0xFFFFFFFFUL),
-				       (ulint)(offset >> 32),
+		success = os_file_read(file, page, offset,
 				       zip_size ? zip_size : UNIV_PAGE_SIZE);
 		if (!success) {
 
@@ -3015,16 +3061,13 @@ fil_reset_too_high_lsns(
 					page, &page_zip, current_lsn);
 				success = os_file_write(
 					filepath, file, page_zip.data,
-					(ulint) offset & 0xFFFFFFFFUL,
-					(ulint) (offset >> 32), zip_size);
+					offset, zip_size);
 			} else {
 				buf_flush_init_for_writing(
 					page, NULL, current_lsn);
 				success = os_file_write(
 					filepath, file, page,
-					(ulint)(offset & 0xFFFFFFFFUL),
-					(ulint)(offset >> 32),
-					UNIV_PAGE_SIZE);
+					offset, UNIV_PAGE_SIZE);
 			}
 
 			if (!success) {
@@ -3041,7 +3084,7 @@ fil_reset_too_high_lsns(
 	}
 
 	/* We now update the flush_lsn stamp at the start of the file */
-	success = os_file_read(file, page, 0, 0,
+	success = os_file_read(file, page, 0,
 			       zip_size ? zip_size : UNIV_PAGE_SIZE);
 	if (!success) {
 
@@ -3050,7 +3093,7 @@ fil_reset_too_high_lsns(
 
 	mach_write_to_8(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn);
 
-	success = os_file_write(filepath, file, page, 0, 0,
+	success = os_file_write(filepath, file, page, 0,
 				zip_size ? zip_size : UNIV_PAGE_SIZE);
 	if (!success) {
 
@@ -3088,7 +3131,7 @@ fil_open_single_table_tablespace(
 					accessing the first page of the file */
 	ulint		id,		/*!< in: space id */
 	ulint		flags,		/*!< in: tablespace flags */
-	const char*	name)		/*!< in: table name in the
+	const char*	tablename)	/*!< in: table name in the
 					databasename/tablename format */
 {
 	os_file_t	file;
@@ -3099,16 +3142,9 @@ fil_open_single_table_tablespace(
 	ulint		space_id;
 	ulint		space_flags;
 
-	filepath = fil_make_ibd_name(name, FALSE);
+	filepath = fil_make_ibd_name(tablename, FALSE);
 
-	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
-	ROW_FORMAT=COMPACT
-	((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
-	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
-	format, the tablespace flags should equal
-	(table->flags & ~(~0 << DICT_TF_BITS)). */
-	ut_a(flags != DICT_TF_COMPACT);
-	ut_a(!(flags & (~0UL << DICT_TF_BITS)));
+	fsp_flags_validate(flags);
 
 	file = os_file_create_simple_no_error_handling(
 		innodb_file_data_key, filepath, OS_FILE_OPEN,
@@ -3132,7 +3168,8 @@ fil_open_single_table_tablespace(
 		      " a temporary table #sql...,\n"
 		      "InnoDB: and MySQL removed the .ibd file for this.\n"
 		      "InnoDB: Please refer to\n"
-		      "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
+		      "InnoDB: " REFMAN
+		      "innodb-troubleshooting-datadict.html\n"
 		      "InnoDB: for how to resolve the issue.\n", stderr);
 
 		mem_free(filepath);
@@ -3148,11 +3185,11 @@ fil_open_single_table_tablespace(
 
 	/* Read the first page of the tablespace */
 
-	buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+	buf2 = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
 	/* Align the memory for file i/o if we might have O_DIRECT set */
-	page = ut_align(buf2, UNIV_PAGE_SIZE);
+	page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 
-	success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+	success = os_file_read(file, page, 0, UNIV_PAGE_SIZE);
 
 	/* We have to read the tablespace id and flags from the file. */
 
@@ -3161,8 +3198,7 @@ fil_open_single_table_tablespace(
 
 	ut_free(buf2);
 
-	if (UNIV_UNLIKELY(space_id != id
-			  || space_flags != (flags & ~(~0 << DICT_TF_BITS)))) {
+	if (UNIV_UNLIKELY(space_id != id || space_flags != flags)) {
 		ut_print_timestamp(stderr);
 
 		fputs("  InnoDB: Error: tablespace id and flags in file ",
@@ -3186,7 +3222,7 @@ fil_open_single_table_tablespace(
 	}
 
 skip_check:
-	success = fil_space_create(filepath, space_id, flags, FIL_TABLESPACE);
+	success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE);
 
 	if (!success) {
 		goto func_exit;
@@ -3216,8 +3252,10 @@ fil_make_ibbackup_old_name(
 	const char*	name)		/*!< in: original file name */
 {
 	static const char suffix[] = "_ibbackup_old_vers_";
+	char*	path;
 	ulint	len	= strlen(name);
-	char*	path	= mem_alloc(len + (15 + sizeof suffix));
+
+	path = static_cast<char*>(mem_alloc(len + (15 + sizeof suffix)));
 
 	memcpy(path, name, len);
 	memcpy(path + len, suffix, (sizeof suffix) - 1);
@@ -3228,7 +3266,7 @@ fil_make_ibbackup_old_name(
 
 /********************************************************************//**
 Opens an .ibd file and adds the associated single-table tablespace to the
-InnoDB fil0fil.c data structures. */
+InnoDB fil0fil.cc data structures. */
 static
 void
 fil_load_single_table_tablespace(
@@ -3239,23 +3277,31 @@ fil_load_single_table_tablespace(
 {
 	os_file_t	file;
 	char*		filepath;
+	char*		tablename;
 	ibool		success;
 	byte*		buf2;
 	byte*		page;
 	ulint		space_id;
 	ulint		flags;
-	ulint		size_low;
-	ulint		size_high;
-	ib_int64_t	size;
+	os_offset_t	size;
 #ifdef UNIV_HOTBACKUP
 	fil_space_t*	space;
 #endif
-	filepath = mem_alloc(strlen(dbname) + strlen(filename)
-			     + strlen(fil_path_to_mysql_datadir) + 3);
+	filepath = static_cast<char*>(
+		mem_alloc(
+			strlen(dbname)
+			+ strlen(filename)
+			+ strlen(fil_path_to_mysql_datadir) + 3));
 
 	sprintf(filepath, "%s/%s/%s", fil_path_to_mysql_datadir, dbname,
 		filename);
 	srv_normalize_path_for_win(filepath);
+
+	tablename = static_cast<char*>(
+		mem_alloc(strlen(dbname) + strlen(filename) + 2));
+	sprintf(tablename, "%s/%s", dbname, filename);
+	tablename[strlen(tablename) - strlen(".ibd")] = 0;
+
 #ifdef __WIN__
 # ifndef UNIV_HOTBACKUP
 	/* If lower_case_table_names is 0 or 2, then MySQL allows database
@@ -3299,6 +3345,7 @@ fil_load_single_table_tablespace(
 			"InnoDB: and force InnoDB to continue crash"
 			" recovery here.\n", filepath);
 
+		mem_free(tablename);
 		mem_free(filepath);
 
 		if (srv_force_recovery > 0) {
@@ -3314,9 +3361,9 @@ fil_load_single_table_tablespace(
 		exit(1);
 	}
 
-	success = os_file_get_size(file, &size_low, &size_high);
+	size = os_file_get_size(file);
 
-	if (!success) {
+	if (UNIV_UNLIKELY(size == (os_offset_t) -1)) {
 		/* The following call prints an error message */
 		os_file_get_last_error(TRUE);
 
@@ -3346,6 +3393,7 @@ fil_load_single_table_tablespace(
 			" crash recovery here.\n", filepath);
 
 		os_file_close(file);
+		mem_free(tablename);
 		mem_free(filepath);
 
 		if (srv_force_recovery > 0) {
@@ -3367,30 +3415,30 @@ fil_load_single_table_tablespace(
 	/* Every .ibd file is created >= 4 pages in size. Smaller files
 	cannot be ok. */
 
-	size = (((ib_int64_t)size_high) << 32) + (ib_int64_t)size_low;
 #ifndef UNIV_HOTBACKUP
 	if (size < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
 		fprintf(stderr,
-			"InnoDB: Error: the size of single-table tablespace"
-			" file %s\n"
-			"InnoDB: is only %lu %lu, should be at least %lu!",
+			"InnoDB: Error: the size of single-table"
+			" tablespace file %s\n"
+			"InnoDB: is only " UINT64PF
+			", should be at least %lu!\n",
 			filepath,
-			(ulong) size_high,
-			(ulong) size_low, (ulong) (4 * UNIV_PAGE_SIZE));
+			size, (ulong) (4 * UNIV_PAGE_SIZE));
 		os_file_close(file);
+		mem_free(tablename);
 		mem_free(filepath);
 
 		return;
 	}
 #endif
-	/* Read the first page of the tablespace if the size big enough */
+	/* Read the first page of the tablespace if the size is big enough */
 
-	buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+	buf2 = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
 	/* Align the memory for file i/o if we might have O_DIRECT set */
-	page = ut_align(buf2, UNIV_PAGE_SIZE);
+	page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 
 	if (size >= FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
-		success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+		success = os_file_read(file, page, 0, UNIV_PAGE_SIZE);
 
 		/* We have to read the tablespace id from the file */
 
@@ -3430,6 +3478,7 @@ fil_load_single_table_tablespace(
 		ut_a(os_file_rename(innodb_file_data_key, filepath, new_path));
 
 		ut_free(buf2);
+		mem_free(tablename);
 		mem_free(filepath);
 		mem_free(new_path);
 
@@ -3468,6 +3517,7 @@ fil_load_single_table_tablespace(
 		ut_a(os_file_rename(innodb_file_data_key, filepath, new_path));
 
 		ut_free(buf2);
+		mem_free(tablename);
 		mem_free(filepath);
 		mem_free(new_path);
 
@@ -3475,7 +3525,7 @@ fil_load_single_table_tablespace(
 	}
 	mutex_exit(&fil_system->mutex);
 #endif
-	success = fil_space_create(filepath, space_id, flags, FIL_TABLESPACE);
+	success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE);
 
 	if (!success) {
 
@@ -3500,6 +3550,7 @@ fil_load_single_table_tablespace(
 func_exit:
 	os_file_close(file);
 	ut_free(buf2);
+	mem_free(tablename);
 	mem_free(filepath);
 }
 
@@ -3574,7 +3625,7 @@ fil_load_single_table_tablespaces(void)
 		return(DB_ERROR);
 	}
 
-	dbpath = mem_alloc(dbpath_len);
+	dbpath = static_cast<char*>(mem_alloc(dbpath_len));
 
 	/* Scan all directories under the datadir. They are the database
 	directories of MySQL. */
@@ -3603,10 +3654,10 @@ fil_load_single_table_tablespaces(void)
 				mem_free(dbpath);
 			}
 
-			dbpath = mem_alloc(dbpath_len);
+			dbpath = static_cast<char*>(mem_alloc(dbpath_len));
 		}
-		sprintf(dbpath, "%s/%s", fil_path_to_mysql_datadir,
-			dbinfo.name);
+		ut_snprintf(dbpath, dbpath_len,
+			    "%s/%s", fil_path_to_mysql_datadir, dbinfo.name);
 		srv_normalize_path_for_win(dbpath);
 
 		dbdir = os_file_opendir(dbpath, FALSE);
@@ -3675,7 +3726,7 @@ next_datadir_item:
 /*******************************************************************//**
 Returns TRUE if a single-table tablespace does not exist in the memory cache,
 or is being deleted there.
-@return	TRUE if does not exist or is being\ deleted */
+@return	TRUE if does not exist or is being deleted */
 UNIV_INTERN
 ibool
 fil_tablespace_deleted_or_being_deleted_in_mem(
@@ -3744,10 +3795,7 @@ fil_space_for_table_exists_in_mem(
 /*==============================*/
 	ulint		id,		/*!< in: space id */
 	const char*	name,		/*!< in: table name in the standard
-					'databasename/tablename' format or
-					the dir path to a temp table */
-	ibool		is_temp,	/*!< in: TRUE if created with CREATE
-					TEMPORARY TABLE */
+					'databasename/tablename' format */
 	ibool		mark_space,	/*!< in: in crash recovery, at database
 					startup we mark all spaces which have
 					an associated table in the InnoDB
@@ -3760,16 +3808,13 @@ fil_space_for_table_exists_in_mem(
 					matching tablespace is not found from
 					memory */
 {
-	fil_space_t*	namespace;
+	fil_space_t*	fnamespace;
 	fil_space_t*	space;
-	char*		path;
 
 	ut_ad(fil_system);
 
 	mutex_enter(&fil_system->mutex);
 
-	path = fil_make_ibd_name(name, is_temp);
-
 	/* Look if there is a space with the same id */
 
 	space = fil_space_get_by_id(id);
@@ -3777,15 +3822,14 @@ fil_space_for_table_exists_in_mem(
 	/* Look if there is a space with the same name; the name is the
 	directory path from the datadir to the file */
 
-	namespace = fil_space_get_by_name(path);
-	if (space && space == namespace) {
+	fnamespace = fil_space_get_by_name(name);
+	if (space && space == fnamespace) {
 		/* Found */
 
 		if (mark_space) {
 			space->mark = TRUE;
 		}
 
-		mem_free(path);
 		mutex_exit(&fil_system->mutex);
 
 		return(TRUE);
@@ -3793,14 +3837,13 @@ fil_space_for_table_exists_in_mem(
 
 	if (!print_error_if_does_not_exist) {
 
-		mem_free(path);
 		mutex_exit(&fil_system->mutex);
 
 		return(FALSE);
 	}
 
 	if (space == NULL) {
-		if (namespace == NULL) {
+		if (fnamespace == NULL) {
 			ut_print_timestamp(stderr);
 			fputs("  InnoDB: Error: table ", stderr);
 			ut_print_filename(stderr, name);
@@ -3829,21 +3872,20 @@ fil_space_for_table_exists_in_mem(
 				"InnoDB: a tablespace of name %s and id %lu,"
 				" though. Have\n"
 				"InnoDB: you deleted or moved .ibd files?\n",
-				(ulong) id, namespace->name,
-				(ulong) namespace->id);
+				(ulong) id, fnamespace->name,
+				(ulong) fnamespace->id);
 		}
 error_exit:
 		fputs("InnoDB: Please refer to\n"
 		      "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
 		      "InnoDB: for how to resolve the issue.\n", stderr);
 
-		mem_free(path);
 		mutex_exit(&fil_system->mutex);
 
 		return(FALSE);
 	}
 
-	if (0 != strcmp(space->name, path)) {
+	if (0 != strcmp(space->name, name)) {
 		ut_print_timestamp(stderr);
 		fputs("  InnoDB: Error: table ", stderr);
 		ut_print_filename(stderr, name);
@@ -3855,19 +3897,18 @@ error_exit:
 			"InnoDB: Have you deleted or moved .ibd files?\n",
 			(ulong) id, space->name);
 
-		if (namespace != NULL) {
+		if (fnamespace != NULL) {
 			fputs("InnoDB: There is a tablespace"
 			      " with the right name\n"
 			      "InnoDB: ", stderr);
-			ut_print_filename(stderr, namespace->name);
+			ut_print_filename(stderr, fnamespace->name);
 			fprintf(stderr, ", but its id is %lu.\n",
-				(ulong) namespace->id);
+				(ulong) fnamespace->id);
 		}
 
 		goto error_exit;
 	}
 
-	mem_free(path);
 	mutex_exit(&fil_system->mutex);
 
 	return(FALSE);
@@ -3881,30 +3922,24 @@ static
 ulint
 fil_get_space_id_for_table(
 /*=======================*/
-	const char*	name)	/*!< in: table name in the standard
+	const char*	tablename)	/*!< in: table name in the standard
 				'databasename/tablename' format */
 {
-	fil_space_t*	namespace;
+	fil_space_t*	fnamespace;
 	ulint		id		= ULINT_UNDEFINED;
-	char*		path;
 
 	ut_ad(fil_system);
 
 	mutex_enter(&fil_system->mutex);
 
-	path = fil_make_ibd_name(name, FALSE);
-
-	/* Look if there is a space with the same name; the name is the
-	directory path to the file */
+	/* Look if there is a space with the same name. */
 
-	namespace = fil_space_get_by_name(path);
+	fnamespace = fil_space_get_by_name(tablename);
 
-	if (namespace) {
-		id = namespace->id;
+	if (fnamespace) {
+		id = fnamespace->id;
 	}
 
-	mem_free(path);
-
 	mutex_exit(&fil_system->mutex);
 
 	return(id);
@@ -3934,10 +3969,13 @@ fil_extend_space_to_desired_size(
 	ulint		buf_size;
 	ulint		start_page_no;
 	ulint		file_start_page_no;
-	ulint		offset_high;
-	ulint		offset_low;
 	ulint		page_size;
-	ibool		success		= TRUE;
+	ulint		pages_added;
+	ibool		success;
+
+retry:
+	pages_added = 0;
+	success = TRUE;
 
 	fil_mutex_enter_and_prepare_for_io(space_id);
 
@@ -3954,70 +3992,93 @@ fil_extend_space_to_desired_size(
 		return(TRUE);
 	}
 
-	page_size = dict_table_flags_to_zip_size(space->flags);
+	page_size = fsp_flags_get_zip_size(space->flags);
 	if (!page_size) {
 		page_size = UNIV_PAGE_SIZE;
 	}
 
 	node = UT_LIST_GET_LAST(space->chain);
 
+	if (!node->being_extended) {
+		/* Mark this node as undergoing extension. This flag
+		is used by other threads to wait for the extension
+		opereation to finish. */
+		node->being_extended = TRUE;
+	} else {
+		/* Another thread is currently extending the file. Wait
+		for it to finish.
+		It'd have been better to use event driven mechanism but
+		the entire module is peppered with polling stuff. */
+		mutex_exit(&fil_system->mutex);
+		os_thread_sleep(100000);
+		goto retry;
+	}
+
 	fil_node_prepare_for_io(node, fil_system, space);
 
+	/* At this point it is safe to release fil_system mutex. No
+	other thread can rename, delete or close the file because
+	we have set the node->being_extended flag. */
+	mutex_exit(&fil_system->mutex);
+
 	start_page_no = space->size;
 	file_start_page_no = space->size - node->size;
 
 	/* Extend at most 64 pages at a time */
 	buf_size = ut_min(64, size_after_extend - start_page_no) * page_size;
-	buf2 = mem_alloc(buf_size + page_size);
-	buf = ut_align(buf2, page_size);
+	buf2 = static_cast<byte*>(mem_alloc(buf_size + page_size));
+	buf = static_cast<byte*>(ut_align(buf2, page_size));
 
 	memset(buf, 0, buf_size);
 
 	while (start_page_no < size_after_extend) {
-		ulint	n_pages = ut_min(buf_size / page_size,
-					 size_after_extend - start_page_no);
+		ulint		n_pages
+			= ut_min(buf_size / page_size,
+				 size_after_extend - start_page_no);
 
-		offset_high = (start_page_no - file_start_page_no)
-			/ (4096 * ((1024 * 1024) / page_size));
-		offset_low  = ((start_page_no - file_start_page_no)
-			       % (4096 * ((1024 * 1024) / page_size)))
+		os_offset_t	offset
+			= ((os_offset_t) (start_page_no - file_start_page_no))
 			* page_size;
 #ifdef UNIV_HOTBACKUP
 		success = os_file_write(node->name, node->handle, buf,
-					offset_low, offset_high,
-					page_size * n_pages);
+					offset, page_size * n_pages);
 #else
 		success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
 				 node->name, node->handle, buf,
-				 offset_low, offset_high,
-				 page_size * n_pages,
+				 offset, page_size * n_pages,
 				 NULL, NULL);
 #endif
 		if (success) {
-			node->size += n_pages;
-			space->size += n_pages;
-
 			os_has_said_disk_full = FALSE;
 		} else {
 			/* Let us measure the size of the file to determine
 			how much we were able to extend it */
+			os_offset_t	size;
 
-			n_pages = ((ulint)
-				   (os_file_get_size_as_iblonglong(
-					   node->handle)
-				    / page_size)) - node->size;
+			size = os_file_get_size(node->handle);
+			ut_a(size != (os_offset_t) -1);
 
-			node->size += n_pages;
-			space->size += n_pages;
+			n_pages = ((ulint) (size / page_size))
+				- node->size - pages_added;
 
+			pages_added += n_pages;
 			break;
 		}
 
 		start_page_no += n_pages;
+		pages_added += n_pages;
 	}
 
 	mem_free(buf2);
 
+	mutex_enter(&fil_system->mutex);
+
+	ut_a(node->being_extended);
+
+	space->size += pages_added;
+	node->size += pages_added;
+	node->being_extended = FALSE;
+
 	fil_node_complete_io(node, fil_system, OS_FILE_WRITE);
 
 	*actual_size = space->size;
@@ -4075,7 +4136,7 @@ fil_extend_tablespaces_to_stored_len(void)
 					      mutex, because this is a
 					      single-threaded operation */
 		error = fil_read(TRUE, space->id,
-				 dict_table_flags_to_zip_size(space->flags),
+				 fsp_flags_get_zip_size(space->flags),
 				 0, 0, UNIV_PAGE_SIZE, buf, NULL);
 		ut_a(error == DB_SUCCESS);
 
@@ -4227,12 +4288,10 @@ fil_node_prepare_for_io(
 	if (node->open == FALSE) {
 		/* File is closed: open it */
 		ut_a(node->n_pending == 0);
-
 		fil_node_open_file(node, system, space);
 	}
 
-	if (node->n_pending == 0 && space->purpose == FIL_TABLESPACE
-	    && space->id != 0) {
+	if (node->n_pending == 0 && fil_space_belongs_in_lru(space)) {
 		/* The node is in the LRU list, remove it */
 
 		ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
@@ -4277,8 +4336,8 @@ fil_node_complete_io(
 		}
 	}
 
-	if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE
-	    && node->space->id != 0) {
+	if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) {
+
 		/* The node must be put back to the LRU list */
 		UT_LIST_ADD_FIRST(LRU, system->LRU, node);
 	}
@@ -4349,11 +4408,11 @@ fil_io(
 	ulint		mode;
 	fil_space_t*	space;
 	fil_node_t*	node;
-	ulint		offset_high;
-	ulint		offset_low;
 	ibool		ret;
 	ulint		is_log;
 	ulint		wake_later;
+	os_offset_t	offset;
+	ibool		ignore_nonexistent_pages;
 
 	is_log = type & OS_FILE_LOG;
 	type = type & ~OS_FILE_LOG;
@@ -4361,13 +4420,20 @@ fil_io(
 	wake_later = type & OS_AIO_SIMULATED_WAKE_LATER;
 	type = type & ~OS_AIO_SIMULATED_WAKE_LATER;
 
+	ignore_nonexistent_pages = type & BUF_READ_IGNORE_NONEXISTENT_PAGES;
+	type &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES;
+
 	ut_ad(byte_offset < UNIV_PAGE_SIZE);
 	ut_ad(!zip_size || !byte_offset);
 	ut_ad(ut_is_2pow(zip_size));
 	ut_ad(buf);
 	ut_ad(len > 0);
-#if (1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE
-# error "(1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE"
+	ut_ad(UNIV_PAGE_SIZE == (ulong)(1 << UNIV_PAGE_SIZE_SHIFT));
+#if (1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX
+# error "(1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX"
+#endif
+#if (1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN
+# error "(1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN"
 #endif
 	ut_ad(fil_validate_skip());
 #ifndef UNIV_HOTBACKUP
@@ -4427,6 +4493,12 @@ fil_io(
 
 	for (;;) {
 		if (UNIV_UNLIKELY(node == NULL)) {
+			if (ignore_nonexistent_pages) {
+				mutex_exit(&fil_system->mutex);
+				return(DB_ERROR);
+			}
+			/* else */
+
 			fil_report_invalid_page_access(
 				block_offset, space_id, space->name,
 				byte_offset, len, type);
@@ -4434,7 +4506,7 @@ fil_io(
 			ut_error;
 		}
 
-		if (space->id != 0 && node->size == 0) {
+		if (fil_is_user_tablespace_id(space->id) && node->size == 0) {
 			/* We do not know the size of a single-table tablespace
 			before we open the file */
 
@@ -4454,7 +4526,7 @@ fil_io(
 	fil_node_prepare_for_io(node, fil_system, space);
 
 	/* Check that at least the start offset is within the bounds of a
-	single-table tablespace */
+	single-table tablespace, including rollback tablespaces. */
 	if (UNIV_UNLIKELY(node->size <= block_offset)
 	    && space->id != 0 && space->purpose == FIL_TABLESPACE) {
 
@@ -4471,9 +4543,8 @@ fil_io(
 	/* Calculate the low 32 bits and the high 32 bits of the file offset */
 
 	if (!zip_size) {
-		offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT));
-		offset_low  = ((block_offset << UNIV_PAGE_SIZE_SHIFT)
-			       & 0xFFFFFFFFUL) + byte_offset;
+		offset = ((os_offset_t) block_offset << UNIV_PAGE_SIZE_SHIFT)
+			+ byte_offset;
 
 		ut_a(node->size - block_offset
 		     >= ((byte_offset + len + (UNIV_PAGE_SIZE - 1))
@@ -4488,8 +4559,7 @@ fil_io(
 		case 16384: zip_size_shift = 14; break;
 		default: ut_error;
 		}
-		offset_high = block_offset >> (32 - zip_size_shift);
-		offset_low = (block_offset << zip_size_shift & 0xFFFFFFFFUL)
+		offset = ((os_offset_t) block_offset << zip_size_shift)
 			+ byte_offset;
 		ut_a(node->size - block_offset
 		     >= (len + (zip_size - 1)) / zip_size);
@@ -4503,16 +4573,15 @@ fil_io(
 #ifdef UNIV_HOTBACKUP
 	/* In ibbackup do normal i/o, not aio */
 	if (type == OS_FILE_READ) {
-		ret = os_file_read(node->handle, buf, offset_low, offset_high,
-				   len);
+		ret = os_file_read(node->handle, buf, offset, len);
 	} else {
 		ret = os_file_write(node->name, node->handle, buf,
-				    offset_low, offset_high, len);
+				    offset, len);
 	}
 #else
 	/* Queue the aio request */
 	ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
-		     offset_low, offset_high, len, node, message);
+		     offset, len, node, message);
 #endif
 	ut_a(ret);
 
@@ -4536,7 +4605,7 @@ fil_io(
 /**********************************************************************//**
 Waits for an aio operation to complete. This function is used to write the
 handler for completed requests. The aio array of pending requests is divided
-into segments (see os0file.c for more info). The thread specifies which
+into segments (see os0file.cc for more info). The thread specifies which
 segment it wants to wait for. */
 UNIV_INTERN
 void
@@ -4595,10 +4664,10 @@ fil_aio_wait(
 
 	if (fil_node->space->purpose == FIL_TABLESPACE) {
 		srv_set_io_thread_op_info(segment, "complete io for buf page");
-		buf_page_io_complete(message);
+		buf_page_io_complete(static_cast<buf_page_t*>(message));
 	} else {
 		srv_set_io_thread_op_info(segment, "complete io for log");
-		log_io_complete(message);
+		log_io_complete(static_cast<log_group_t*>(message));
 	}
 }
 #endif /* UNIV_HOTBACKUP */
@@ -4745,7 +4814,8 @@ fil_flush_file_spaces(
 	traversed fil_system->unflushed_spaces and called UT_LIST_GET_NEXT()
 	on a space that was just removed from the list by fil_flush().
 	Thus, the space could be dropped and the memory overwritten. */
-	space_ids = mem_alloc(n_space_ids * sizeof *space_ids);
+	space_ids = static_cast<ulint*>(
+		mem_alloc(n_space_ids * sizeof *space_ids));
 
 	n_space_ids = 0;
 
@@ -4771,6 +4841,14 @@ fil_flush_file_spaces(
 	mem_free(space_ids);
 }
 
+/** Functor to validate the space list. */
+struct	Check {
+	void	operator()(const fil_node_t* elem)
+	{
+		ut_a(elem->open || !elem->n_pending);
+	}
+};
+
 /******************************************************************//**
 Checks the consistency of the tablespace cache.
 @return	TRUE if ok */
@@ -4790,16 +4868,19 @@ fil_validate(void)
 
 	for (i = 0; i < hash_get_n_cells(fil_system->spaces); i++) {
 
-		space = HASH_GET_FIRST(fil_system->spaces, i);
+		for (space = static_cast<fil_space_t*>(
+				HASH_GET_FIRST(fil_system->spaces, i));
+		     space != 0;
+		     space = static_cast<fil_space_t*>(
+			     	HASH_GET_NEXT(hash, space))) {
 
-		while (space != NULL) {
-			UT_LIST_VALIDATE(chain, fil_node_t, space->chain,
-					 ut_a(ut_list_node_313->open
-					      || !ut_list_node_313->n_pending));
+			UT_LIST_VALIDATE(
+				chain, fil_node_t, space->chain, Check());
 
-			fil_node = UT_LIST_GET_FIRST(space->chain);
+			for (fil_node = UT_LIST_GET_FIRST(space->chain);
+			     fil_node != 0;
+			     fil_node = UT_LIST_GET_NEXT(chain, fil_node)) {
 
-			while (fil_node != NULL) {
 				if (fil_node->n_pending > 0) {
 					ut_a(fil_node->open);
 				}
@@ -4807,25 +4888,22 @@ fil_validate(void)
 				if (fil_node->open) {
 					n_open++;
 				}
-				fil_node = UT_LIST_GET_NEXT(chain, fil_node);
 			}
-			space = HASH_GET_NEXT(hash, space);
 		}
 	}
 
 	ut_a(fil_system->n_open == n_open);
 
-	UT_LIST_VALIDATE(LRU, fil_node_t, fil_system->LRU, (void) 0);
+	UT_LIST_CHECK(LRU, fil_node_t, fil_system->LRU);
 
-	fil_node = UT_LIST_GET_FIRST(fil_system->LRU);
+	for (fil_node = UT_LIST_GET_FIRST(fil_system->LRU);
+	     fil_node != 0;
+	     fil_node = UT_LIST_GET_NEXT(LRU, fil_node)) {
 
-	while (fil_node != NULL) {
 		ut_a(fil_node->n_pending == 0);
+		ut_a(!fil_node->being_extended);
 		ut_a(fil_node->open);
-		ut_a(fil_node->space->purpose == FIL_TABLESPACE);
-		ut_a(fil_node->space->id != 0);
-
-		fil_node = UT_LIST_GET_NEXT(LRU, fil_node);
+		ut_a(fil_space_belongs_in_lru(fil_node->space));
 	}
 
 	mutex_exit(&fil_system->mutex);
@@ -4899,7 +4977,7 @@ fil_page_get_type(
 }
 
 /****************************************************************//**
-Initializes the tablespace memory cache. */
+Closes the tablespace memory cache. */
 UNIV_INTERN
 void
 fil_close(void)
diff --git a/storage/innobase/fsp/fsp0fsp.c b/storage/innobase/fsp/fsp0fsp.cc
index 6d00a1f1d4a..398dd24afed 100644
--- a/storage/innobase/fsp/fsp0fsp.c
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /******************************************************************//**
-@file fsp/fsp0fsp.c
+@file fsp/fsp0fsp.cc
 File space management
 
 Created 11/29/1995 Heikki Tuuri
@@ -48,129 +48,11 @@ Created 11/29/1995 Heikki Tuuri
 # include "log0log.h"
 #endif /* UNIV_HOTBACKUP */
 #include "dict0mem.h"
+#include "srv0start.h"
 
 
-/*			FILE SEGMENT INODE
-			==================
-
-Segment inode which is created for each segment in a tablespace. NOTE: in
-purge we assume that a segment having only one currently used page can be
-freed in a few steps, so that the freeing cannot fill the file buffer with
-bufferfixed file pages. */
-
-typedef	byte	fseg_inode_t;
-
-#define FSEG_INODE_PAGE_NODE	FSEG_PAGE_DATA
-					/* the list node for linking
-					segment inode pages */
-
-#define FSEG_ARR_OFFSET		(FSEG_PAGE_DATA + FLST_NODE_SIZE)
-/*-------------------------------------*/
-#define	FSEG_ID			0	/* 8 bytes of segment id: if this is 0,
-					it means that the header is unused */
-#define FSEG_NOT_FULL_N_USED	8
-					/* number of used segment pages in
-					the FSEG_NOT_FULL list */
-#define	FSEG_FREE		12
-					/* list of free extents of this
-					segment */
-#define	FSEG_NOT_FULL		(12 + FLST_BASE_NODE_SIZE)
-					/* list of partially free extents */
-#define	FSEG_FULL		(12 + 2 * FLST_BASE_NODE_SIZE)
-					/* list of full extents */
-#define	FSEG_MAGIC_N		(12 + 3 * FLST_BASE_NODE_SIZE)
-					/* magic number used in debugging */
-#define	FSEG_FRAG_ARR		(16 + 3 * FLST_BASE_NODE_SIZE)
-					/* array of individual pages
-					belonging to this segment in fsp
-					fragment extent lists */
-#define FSEG_FRAG_ARR_N_SLOTS	(FSP_EXTENT_SIZE / 2)
-					/* number of slots in the array for
-					the fragment pages */
-#define	FSEG_FRAG_SLOT_SIZE	4	/* a fragment page slot contains its
-					page number within space, FIL_NULL
-					means that the slot is not in use */
-/*-------------------------------------*/
-#define FSEG_INODE_SIZE					\
-	(16 + 3 * FLST_BASE_NODE_SIZE			\
-	 + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
-
-#define FSP_SEG_INODES_PER_PAGE(zip_size)		\
-	(((zip_size ? zip_size : UNIV_PAGE_SIZE)	\
-	  - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE)
-				/* Number of segment inodes which fit on a
-				single page */
-
-#define FSEG_MAGIC_N_VALUE	97937874
-
-#define	FSEG_FILLFACTOR		8	/* If this value is x, then if
-					the number of unused but reserved
-					pages in a segment is less than
-					reserved pages * 1/x, and there are
-					at least FSEG_FRAG_LIMIT used pages,
-					then we allow a new empty extent to
-					be added to the segment in
-					fseg_alloc_free_page. Otherwise, we
-					use unused pages of the segment. */
-
-#define FSEG_FRAG_LIMIT		FSEG_FRAG_ARR_N_SLOTS
-					/* If the segment has >= this many
-					used pages, it may be expanded by
-					allocating extents to the segment;
-					until that only individual fragment
-					pages are allocated from the space */
-
-#define	FSEG_FREE_LIST_LIMIT	40	/* If the reserved size of a segment
-					is at least this many extents, we
-					allow extents to be put to the free
-					list of the extent: at most
-					FSEG_FREE_LIST_MAX_LEN many */
-#define	FSEG_FREE_LIST_MAX_LEN	4
-
-
-/*			EXTENT DESCRIPTOR
-			=================
-
-File extent descriptor data structure: contains bits to tell which pages in
-the extent are free and which contain old tuple version to clean. */
-
-/*-------------------------------------*/
-#define	XDES_ID			0	/* The identifier of the segment
-					to which this extent belongs */
-#define XDES_FLST_NODE		8	/* The list node data structure
-					for the descriptors */
-#define	XDES_STATE		(FLST_NODE_SIZE + 8)
-					/* contains state information
-					of the extent */
-#define	XDES_BITMAP		(FLST_NODE_SIZE + 12)
-					/* Descriptor bitmap of the pages
-					in the extent */
-/*-------------------------------------*/
-
-#define	XDES_BITS_PER_PAGE	2	/* How many bits are there per page */
-#define	XDES_FREE_BIT		0	/* Index of the bit which tells if
-					the page is free */
-#define	XDES_CLEAN_BIT		1	/* NOTE: currently not used!
-					Index of the bit which tells if
-					there are old versions of tuples
-					on the page */
-/* States of a descriptor */
-#define	XDES_FREE		1	/* extent is in free list of space */
-#define	XDES_FREE_FRAG		2	/* extent is in free fragment list of
-					space */
-#define	XDES_FULL_FRAG		3	/* extent is in full fragment list of
-					space */
-#define	XDES_FSEG		4	/* extent belongs to a segment */
-
-/* File extent data structure size in bytes. */
-#define	XDES_SIZE							\
-	(XDES_BITMAP + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
-
-/* Offset of the descriptor array on a descriptor page */
-#define	XDES_ARR_OFFSET		(FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
-
 #ifndef UNIV_HOTBACKUP
-/* Flag to indicate if we have printed the tablespace full error. */
+/** Flag to indicate if we have printed the tablespace full error. */
 static ibool fsp_tbs_full_error_printed = FALSE;
 
 /**********************************************************************//**
@@ -183,7 +65,7 @@ fsp_free_extent(
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	ulint		page,	/*!< in: page offset in the extent */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Frees an extent of a segment to the space free list. */
 static
@@ -195,7 +77,7 @@ fseg_free_extent(
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	ulint		page,	/*!< in: page offset in the extent */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Calculates the number of pages reserved by a segment, and how
 many pages are currently used.
@@ -207,18 +89,20 @@ fseg_n_reserved_pages_low(
 	fseg_inode_t*	header,	/*!< in: segment inode */
 	ulint*		used,	/*!< out: number of pages used (not
 				more than reserved) */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /********************************************************************//**
 Marks a page used. The page must reside within the extents of the given
 segment. */
-static __attribute__((nonnull))
+static
 void
 fseg_mark_page_used(
 /*================*/
 	fseg_inode_t*	seg_inode,/*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
 	ulint		page,	/*!< in: page offset */
-	xdes_t*		descr, /* extent descriptor */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Returns the first extent descriptor for a segment. We think of the extent
 lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL
@@ -232,7 +116,7 @@ fseg_get_first_extent(
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Puts new extents to the free list if
 there are free extents above the free limit. If an extent happens
@@ -250,7 +134,7 @@ fsp_fill_free_list(
 	ulint		space,		/*!< in: space */
 	fsp_header_t*	header,		/*!< in/out: space header */
 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
-	__attribute__((nonnull));
+	UNIV_COLD __attribute__((nonnull));
 /**********************************************************************//**
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize file space
@@ -305,14 +189,14 @@ fsp_get_space_header(
 	ulint	id,	/*!< in: space id */
 	ulint	zip_size,/*!< in: compressed page size in bytes
 			or 0 for uncompressed pages */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	buf_block_t*	block;
 	fsp_header_t*	header;
 
 	ut_ad(ut_is_2pow(zip_size));
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
-	ut_ad(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+	ut_ad(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN);
 	ut_ad(id || !zip_size);
 
 	block = buf_page_get(id, zip_size, 0, RW_X_LATCH, mtr);
@@ -320,7 +204,7 @@ fsp_get_space_header(
 	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
 
 	ut_ad(id == mach_read_from_4(FSP_SPACE_ID + header));
-	ut_ad(zip_size == dict_table_flags_to_zip_size(
+	ut_ad(zip_size == fsp_flags_get_zip_size(
 		      mach_read_from_4(FSP_SPACE_FLAGS + header)));
 	return(header);
 }
@@ -336,7 +220,7 @@ xdes_get_bit(
 	ulint		bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
 	ulint		offset,	/*!< in: page offset within extent:
 				0 ... FSP_EXTENT_SIZE - 1 */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	index;
 	ulint	byte_index;
@@ -367,7 +251,7 @@ xdes_set_bit(
 	ulint	offset,	/*!< in: page offset within extent:
 			0 ... FSP_EXTENT_SIZE - 1 */
 	ibool	val,	/*!< in: bit value */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	index;
 	ulint	byte_index;
@@ -404,7 +288,7 @@ xdes_find_bit(
 	ulint	bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
 	ibool	val,	/*!< in: desired bit value */
 	ulint	hint,	/*!< in: hint of which bit position would be desirable */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 
@@ -441,7 +325,7 @@ xdes_find_bit_downward(
 	ulint	bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
 	ibool	val,	/*!< in: desired bit value */
 	ulint	hint,	/*!< in: hint of which bit position would be desirable */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 
@@ -474,7 +358,7 @@ ulint
 xdes_get_n_used(
 /*============*/
 	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 	ulint	count	= 0;
@@ -498,7 +382,7 @@ ibool
 xdes_is_free(
 /*=========*/
 	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	if (0 == xdes_get_n_used(descr, mtr)) {
 
@@ -516,7 +400,7 @@ ibool
 xdes_is_full(
 /*=========*/
 	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	if (FSP_EXTENT_SIZE == xdes_get_n_used(descr, mtr)) {
 
@@ -534,7 +418,7 @@ xdes_set_state(
 /*===========*/
 	xdes_t*	descr,	/*!< in/out: descriptor */
 	ulint	state,	/*!< in: state to set */
-	mtr_t*	mtr)	/*!< in: mtr handle */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ut_ad(descr && mtr);
 	ut_ad(state >= XDES_FREE);
@@ -552,7 +436,7 @@ ulint
 xdes_get_state(
 /*===========*/
 	const xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	state;
 
@@ -571,7 +455,7 @@ void
 xdes_init(
 /*======*/
 	xdes_t*	descr,	/*!< in: descriptor */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 
@@ -598,15 +482,25 @@ xdes_calc_descriptor_page(
 	ulint	offset)		/*!< in: page offset */
 {
 #ifndef DOXYGEN /* Doxygen gets confused of these */
-# if UNIV_PAGE_SIZE <= XDES_ARR_OFFSET \
-		+ (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE
+# if UNIV_PAGE_SIZE_MAX <= XDES_ARR_OFFSET				\
+			   + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX)	\
+			   * XDES_SIZE_MAX
 #  error
 # endif
-# if PAGE_ZIP_MIN_SIZE <= XDES_ARR_OFFSET \
-		+ (PAGE_ZIP_MIN_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE
+# if UNIV_ZIP_SIZE_MIN <= XDES_ARR_OFFSET				\
+			  + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE_MIN)	\
+			  * XDES_SIZE_MIN
 #  error
 # endif
 #endif /* !DOXYGEN */
+
+	ut_ad(UNIV_PAGE_SIZE > XDES_ARR_OFFSET
+	      + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE)
+	      * XDES_SIZE);
+	ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET
+	      + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE)
+	      * XDES_SIZE);
+
 	ut_ad(ut_is_2pow(zip_size));
 
 	if (!zip_size) {
@@ -641,10 +535,12 @@ xdes_calc_descriptor_index(
 
 /********************************************************************//**
 Gets pointer to a the extent descriptor of a page. The page where the extent
-descriptor resides is x-locked. This function no longer extends the data
-file.
+descriptor resides is x-locked. If the page offset is equal to the free limit
+of the space, adds new extents from above the free limit to the space free
+list, if not free limit == space size. This adding is necessary to make the
+descriptor defined, as they are uninitialized above the free limit.
 @return pointer to the extent descriptor, NULL if the page does not
-exist in the space or if the offset is >= the free limit */
+exist in the space or if the offset exceeds the free limit */
 UNIV_INLINE __attribute__((nonnull, warn_unused_result))
 xdes_t*
 xdes_get_descriptor_with_space_hdr(
@@ -671,13 +567,22 @@ xdes_get_descriptor_with_space_hdr(
 	/* Read free limit and space size */
 	limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT);
 	size  = mach_read_from_4(sp_header + FSP_SIZE);
-	zip_size = dict_table_flags_to_zip_size(
+	zip_size = fsp_flags_get_zip_size(
 		mach_read_from_4(sp_header + FSP_SPACE_FLAGS));
 
-	if ((offset >= size) || (offset >= limit)) {
+	/* If offset is >= size or > limit, return NULL */
+
+	if ((offset >= size) || (offset > limit)) {
+
 		return(NULL);
 	}
 
+	/* If offset is == limit, fill free list of the space. */
+
+	if (offset == limit) {
+		fsp_fill_free_list(FALSE, space, sp_header, mtr);
+	}
+
 	descr_page_no = xdes_calc_descriptor_page(zip_size, offset);
 
 	if (descr_page_no == 0) {
@@ -743,7 +648,7 @@ xdes_lst_get_descriptor(
 				or 0 for uncompressed pages */
 	fil_addr_t	lst_node,/*!< in: file address of the list node
 				contained in the descriptor */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	xdes_t*	descr;
 
@@ -788,7 +693,7 @@ fsp_init_file_page_low(
 	block->check_index_page_at_flush = FALSE;
 #endif /* !UNIV_HOTBACKUP */
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		memset(page, 0, UNIV_PAGE_SIZE);
 		memset(page_zip->data, 0, page_zip_get_size(page_zip));
 		mach_write_to_4(page + FIL_PAGE_OFFSET,
@@ -817,7 +722,7 @@ void
 fsp_init_file_page(
 /*===============*/
 	buf_block_t*	block,	/*!< in: pointer to a page */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fsp_init_file_page_low(block);
 
@@ -853,27 +758,33 @@ void
 fsp_init(void)
 /*==========*/
 {
+	/* FSP_EXTENT_SIZE must be a multiple of page & zip size */
+	ut_a(0 == (UNIV_PAGE_SIZE % FSP_EXTENT_SIZE));
+	ut_a(UNIV_PAGE_SIZE);
+
+#if UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX
+# error "UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX != 0"
+#endif
+#if UNIV_ZIP_SIZE_MIN % FSP_EXTENT_SIZE_MIN
+# error "UNIV_ZIP_SIZE_MIN % FSP_EXTENT_SIZE_MIN != 0"
+#endif
+
 	/* Does nothing at the moment */
 }
 
 /**********************************************************************//**
-Writes the space id and compressed page size to a tablespace header.
-This function is used past the buffer pool when we in fil0fil.c create
-a new single-table tablespace. */
+Writes the space id and flags to a tablespace header.  The flags contain
+row type, physical/compressed page size, and logical/uncompressed page
+size of the tablespace. */
 UNIV_INTERN
 void
 fsp_header_init_fields(
 /*===================*/
 	page_t*	page,		/*!< in/out: first page in the space */
 	ulint	space_id,	/*!< in: space id */
-	ulint	flags)		/*!< in: tablespace flags (FSP_SPACE_FLAGS):
-				0, or table->flags if newer than COMPACT */
+	ulint	flags)		/*!< in: tablespace flags (FSP_SPACE_FLAGS) */
 {
-	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
-	ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
-	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
-	format, the tablespace flags should equal table->flags. */
-	ut_a(flags != DICT_TF_COMPACT);
+	fsp_flags_validate(flags);
 
 	mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page,
 			space_id);
@@ -891,7 +802,7 @@ fsp_header_init(
 /*============*/
 	ulint	space,		/*!< in: space id */
 	ulint	size,		/*!< in: current size in blocks */
-	mtr_t*	mtr)		/*!< in: mini-transaction handle */
+	mtr_t*	mtr)		/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	header;
 	buf_block_t*	block;
@@ -903,7 +814,7 @@ fsp_header_init(
 
 	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
 
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 	block = buf_page_create(space, 0, zip_size, mtr);
 	buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
 	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
@@ -998,7 +909,7 @@ fsp_header_get_zip_size(
 {
 	ulint	flags = fsp_header_get_flags(page);
 
-	return(dict_table_flags_to_zip_size(flags));
+	return(fsp_flags_get_zip_size(flags));
 }
 
 #ifndef UNIV_HOTBACKUP
@@ -1008,9 +919,9 @@ UNIV_INTERN
 void
 fsp_header_inc_size(
 /*================*/
-	ulint	space,	/*!< in: space id */
-	ulint	size_inc,/*!< in: size increment in pages */
-	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+	ulint	space,		/*!< in: space id */
+	ulint	size_inc,	/*!< in: size increment in pages */
+	mtr_t*	mtr)		/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	header;
 	ulint		size;
@@ -1021,7 +932,7 @@ fsp_header_inc_size(
 	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
 
 	header = fsp_get_space_header(space,
-				      dict_table_flags_to_zip_size(flags),
+				      fsp_flags_get_zip_size(flags),
 				      mtr);
 
 	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
@@ -1031,38 +942,6 @@ fsp_header_inc_size(
 }
 
 /**********************************************************************//**
-Gets the current free limit of the system tablespace.  The free limit
-means the place of the first page which has never been put to the
-free list for allocation.  The space above that address is initialized
-to zero.  Sets also the global variable log_fsp_current_free_limit.
-@return	free limit in megabytes */
-UNIV_INTERN
-ulint
-fsp_header_get_free_limit(void)
-/*===========================*/
-{
-	fsp_header_t*	header;
-	ulint		limit;
-	mtr_t		mtr;
-
-	mtr_start(&mtr);
-
-	mtr_x_lock(fil_space_get_latch(0, NULL), &mtr);
-
-	header = fsp_get_space_header(0, 0, &mtr);
-
-	limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, &mtr);
-
-	limit /= ((1024 * 1024) / UNIV_PAGE_SIZE);
-
-	log_fsp_current_free_limit_set_and_checkpoint(limit);
-
-	mtr_commit(&mtr);
-
-	return(limit);
-}
-
-/**********************************************************************//**
 Gets the size of the system tablespace from the tablespace header.  If
 we do not have an auto-extending data file, this should be equal to
 the size of the data files.  If there is an auto-extending data file,
@@ -1094,7 +973,7 @@ fsp_header_get_tablespace_size(void)
 Tries to extend a single-table tablespace so that a page would fit in the
 data file.
 @return	TRUE if success */
-static __attribute__((nonnull, warn_unused_result))
+static UNIV_COLD __attribute__((nonnull, warn_unused_result))
 ibool
 fsp_try_extend_data_file_with_pages(
 /*================================*/
@@ -1126,7 +1005,7 @@ fsp_try_extend_data_file_with_pages(
 /***********************************************************************//**
 Tries to extend the last data file of a tablespace if it is auto-extending.
 @return	FALSE if not auto-extending */
-static __attribute__((nonnull))
+static UNIV_COLD __attribute__((nonnull))
 ibool
 fsp_try_extend_data_file(
 /*=====================*/
@@ -1168,7 +1047,7 @@ fsp_try_extend_data_file(
 	}
 
 	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
-	zip_size = dict_table_flags_to_zip_size(
+	zip_size = fsp_flags_get_zip_size(
 		mach_read_from_4(header + FSP_SPACE_FLAGS));
 
 	old_size = size;
@@ -1292,11 +1171,11 @@ fsp_fill_free_list(
 	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
 	limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr);
 
-	zip_size = dict_table_flags_to_zip_size(
+	zip_size = fsp_flags_get_zip_size(
 		mach_read_from_4(FSP_SPACE_FLAGS + header));
 	ut_a(ut_is_2pow(zip_size));
-	ut_a(zip_size <= UNIV_PAGE_SIZE);
-	ut_a(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+	ut_a(zip_size <= UNIV_ZIP_SIZE_MAX);
+	ut_a(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN);
 
 	if (space == 0 && srv_auto_extend_last_data_file
 	    && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
@@ -1329,15 +1208,6 @@ fsp_fill_free_list(
 		mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE,
 				 MLOG_4BYTES, mtr);
 
-		/* Update the free limit info in the log system and make
-		a checkpoint */
-		if (space == 0) {
-			ut_a(!zip_size);
-			log_fsp_current_free_limit_set_and_checkpoint(
-				(i + FSP_EXTENT_SIZE)
-				/ ((1024 * 1024) / UNIV_PAGE_SIZE));
-		}
-
 		if (UNIV_UNLIKELY(init_xdes)) {
 
 			buf_block_t*	block;
@@ -1387,13 +1257,6 @@ fsp_fill_free_list(
 							   mtr);
 		xdes_init(descr, mtr);
 
-#if UNIV_PAGE_SIZE % FSP_EXTENT_SIZE
-# error "UNIV_PAGE_SIZE % FSP_EXTENT_SIZE != 0"
-#endif
-#if PAGE_ZIP_MIN_SIZE % FSP_EXTENT_SIZE
-# error "PAGE_ZIP_MIN_SIZE % FSP_EXTENT_SIZE != 0"
-#endif
-
 		if (UNIV_UNLIKELY(init_xdes)) {
 
 			/* The first page in the extent is a descriptor page
@@ -1434,7 +1297,7 @@ fsp_alloc_free_extent(
 	ulint	hint,	/*!< in: hint of which extent would be desirable: any
 			page offset in the extent goes; the hint must not
 			be > FSP_FREE_LIMIT */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	header;
 	fil_addr_t	first;
@@ -1634,7 +1497,7 @@ fsp_alloc_free_page(
 			     hint % FSP_EXTENT_SIZE, mtr);
 	if (free == ULINT_UNDEFINED) {
 
-		ut_print_buf(stderr, ((byte*)descr) - 500, 1000);
+		ut_print_buf(stderr, ((byte*) descr) - 500, 1000);
 		putc('\n', stderr);
 
 		ut_error;
@@ -1667,7 +1530,6 @@ fsp_alloc_free_page(
 	}
 
 	fsp_alloc_from_free_frag(header, descr, free, mtr);
-
 	return(fsp_page_create(space, zip_size, page_no, mtr, init_mtr));
 }
 
@@ -1681,7 +1543,7 @@ fsp_free_page(
 	ulint	zip_size,/*!< in: compressed page size in bytes
 			or 0 for uncompressed pages */
 	ulint	page,	/*!< in: page offset */
-	mtr_t*	mtr)	/*!< in: mtr handle */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	header;
 	xdes_t*		descr;
@@ -1705,7 +1567,7 @@ fsp_free_page(
 			(ulong) page,
 			(ulong) state);
 		fputs("InnoDB: Dump of descriptor: ", stderr);
-		ut_print_buf(stderr, ((byte*)descr) - 50, 200);
+		ut_print_buf(stderr, ((byte*) descr) - 50, 200);
 		putc('\n', stderr);
 		/* Crash in debug version, so that we get a core dump
 		of this corruption. */
@@ -1726,7 +1588,7 @@ fsp_free_page(
 			"InnoDB: Error: File space extent descriptor"
 			" of page %lu says it is free\n"
 			"InnoDB: Dump of descriptor: ", (ulong) page);
-		ut_print_buf(stderr, ((byte*)descr) - 50, 200);
+		ut_print_buf(stderr, ((byte*) descr) - 50, 200);
 		putc('\n', stderr);
 		/* Crash in debug version, so that we get a core dump
 		of this corruption. */
@@ -1779,7 +1641,7 @@ fsp_free_extent(
 	ulint	zip_size,/*!< in: compressed page size in bytes
 			or 0 for uncompressed pages */
 	ulint	page,	/*!< in: page offset in the extent */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	header;
 	xdes_t*		descr;
@@ -1792,7 +1654,7 @@ fsp_free_extent(
 
 	if (xdes_get_state(descr, mtr) == XDES_FREE) {
 
-		ut_print_buf(stderr, (byte*)descr - 500, 1000);
+		ut_print_buf(stderr, (byte*) descr - 500, 1000);
 		putc('\n', stderr);
 
 		ut_error;
@@ -1815,7 +1677,7 @@ fsp_seg_inode_page_get_nth_inode(
 	ulint	zip_size __attribute__((unused)),
 			/*!< in: compressed page size, or 0 */
 	mtr_t*	mtr __attribute__((unused)))
-			/*!< in: mini-transaction handle */
+			/*!< in/out: mini-transaction */
 {
 	ut_ad(i < FSP_SEG_INODES_PER_PAGE(zip_size));
 	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
@@ -1832,7 +1694,7 @@ fsp_seg_inode_page_find_used(
 /*=========================*/
 	page_t*	page,	/*!< in: segment inode page */
 	ulint	zip_size,/*!< in: compressed page size, or 0 */
-	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		i;
 	fseg_inode_t*	inode;
@@ -1864,7 +1726,7 @@ fsp_seg_inode_page_find_free(
 	page_t*	page,	/*!< in: segment inode page */
 	ulint	i,	/*!< in: search forward starting from this index */
 	ulint	zip_size,/*!< in: compressed page size, or 0 */
-	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	fseg_inode_t*	inode;
 
@@ -1894,7 +1756,7 @@ ibool
 fsp_alloc_seg_inode_page(
 /*=====================*/
 	fsp_header_t*	space_header,	/*!< in: space header */
-	mtr_t*		mtr)		/*!< in: mini-transaction handle */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	fseg_inode_t*	inode;
 	buf_block_t*	block;
@@ -1906,7 +1768,7 @@ fsp_alloc_seg_inode_page(
 	ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET);
 
 	space = page_get_space_id(page_align(space_header));
-	zip_size = dict_table_flags_to_zip_size(
+	zip_size = fsp_flags_get_zip_size(
 		mach_read_from_4(FSP_SPACE_FLAGS + space_header));
 
 	block = fsp_alloc_free_page(space, zip_size, 0, mtr, mtr);
@@ -1947,7 +1809,7 @@ fseg_inode_t*
 fsp_alloc_seg_inode(
 /*================*/
 	fsp_header_t*	space_header,	/*!< in: space header */
-	mtr_t*		mtr)		/*!< in: mini-transaction handle */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	ulint		page_no;
 	buf_block_t*	block;
@@ -1972,7 +1834,7 @@ fsp_alloc_seg_inode(
 
 	page_no = flst_get_first(space_header + FSP_SEG_INODES_FREE, mtr).page;
 
-	zip_size = dict_table_flags_to_zip_size(
+	zip_size = fsp_flags_get_zip_size(
 		mach_read_from_4(FSP_SPACE_FLAGS + space_header));
 	block = buf_page_get(page_get_space_id(page_align(space_header)),
 			     zip_size, page_no, RW_X_LATCH, mtr);
@@ -2013,7 +1875,7 @@ fsp_free_seg_inode(
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	fseg_inode_t*	inode,	/*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	page_t*		page;
 	fsp_header_t*	space_header;
@@ -2062,7 +1924,7 @@ fseg_inode_try_get(
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fil_addr_t	inode_addr;
 	fseg_inode_t*	inode;
@@ -2095,7 +1957,7 @@ fseg_inode_get(
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fseg_inode_t*	inode
 		= fseg_inode_try_get(header, space, zip_size, mtr);
@@ -2112,7 +1974,8 @@ fseg_get_nth_frag_page_no(
 /*======================*/
 	fseg_inode_t*	inode,	/*!< in: segment inode */
 	ulint		n,	/*!< in: slot index */
-	mtr_t*		mtr __attribute__((unused))) /*!< in: mtr handle */
+	mtr_t*		mtr __attribute__((unused)))
+				/*!< in/out: mini-transaction */
 {
 	ut_ad(inode && mtr);
 	ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
@@ -2131,7 +1994,7 @@ fseg_set_nth_frag_page_no(
 	fseg_inode_t*	inode,	/*!< in: segment inode */
 	ulint		n,	/*!< in: slot index */
 	ulint		page_no,/*!< in: page number to set */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ut_ad(inode && mtr);
 	ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
@@ -2150,7 +2013,7 @@ ulint
 fseg_find_free_frag_page_slot(
 /*==========================*/
 	fseg_inode_t*	inode,	/*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 	ulint	page_no;
@@ -2177,7 +2040,7 @@ ulint
 fseg_find_last_used_frag_page_slot(
 /*===============================*/
 	fseg_inode_t*	inode,	/*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 	ulint	page_no;
@@ -2205,7 +2068,7 @@ ulint
 fseg_get_n_frag_pages(
 /*==================*/
 	fseg_inode_t*	inode,	/*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	i;
 	ulint	count	= 0;
@@ -2242,7 +2105,7 @@ fseg_create_general(
 			the inode and the other for the segment) then there is
 			no need to do the check for this individual
 			operation */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		flags;
 	ulint		zip_size;
@@ -2261,16 +2124,13 @@ fseg_create_general(
 	      <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	if (page != 0) {
 		block = buf_page_get(space, zip_size, page, RW_X_LATCH, mtr);
 		header = byte_offset + buf_block_get_frame(block);
 	}
 
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
-
 	mtr_x_lock(latch, mtr);
 
 	if (rw_lock_get_x_lock_count(latch) == 1) {
@@ -2370,7 +2230,7 @@ fseg_create(
 			will belong to the created segment */
 	ulint	byte_offset, /*!< in: byte offset of the created segment header
 			on the page */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	return(fseg_create_general(space, page, byte_offset, FALSE, mtr));
 }
@@ -2386,7 +2246,7 @@ fseg_n_reserved_pages_low(
 	fseg_inode_t*	inode,	/*!< in: segment inode */
 	ulint*		used,	/*!< out: number of pages used (not
 				more than reserved) */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	ret;
 
@@ -2415,7 +2275,7 @@ fseg_n_reserved_pages(
 /*==================*/
 	fseg_header_t*	header,	/*!< in: segment header */
 	ulint*		used,	/*!< out: number of pages used (<= reserved) */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		ret;
 	fseg_inode_t*	inode;
@@ -2426,10 +2286,7 @@ fseg_n_reserved_pages(
 
 	space = page_get_space_id(page_align(header));
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -2455,7 +2312,7 @@ fseg_fill_free_list(
 				or 0 for uncompressed pages */
 	ulint		hint,	/*!< in: hint which extent would be good as
 				the first extent */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	xdes_t*	descr;
 	ulint	i;
@@ -2522,7 +2379,7 @@ fseg_alloc_free_extent(
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	xdes_t*		descr;
 	ib_id_t		seg_id;
@@ -2798,11 +2655,11 @@ got_hinted_page:
 		ut_ad(xdes_get_bit(ret_descr, XDES_FREE_BIT,
 				   ret_page % FSP_EXTENT_SIZE, mtr) == TRUE);
 
-		fseg_mark_page_used(seg_inode, ret_page, ret_descr, mtr);
+		fseg_mark_page_used(seg_inode, space, zip_size, ret_page, mtr);
 	}
 
 	return(fsp_page_create(
-		       space, dict_table_flags_to_zip_size(
+		       space, fsp_flags_get_zip_size(
 			       mach_read_from_4(FSP_SPACE_FLAGS
 						+ space_header)),
 		       ret_page, mtr, init_mtr));
@@ -2833,7 +2690,7 @@ fseg_alloc_free_page_general(
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
-	mtr_t*		mtr,	/*!< in/out: mini-transaction handle */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
 	mtr_t*		init_mtr)/*!< in/out: mtr or another mini-transaction
 				in which the page should be initialized.
 				If init_mtr!=mtr, but the page is already
@@ -2851,10 +2708,7 @@ fseg_alloc_free_page_general(
 
 	latch = fil_space_get_latch(space, &flags);
 
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -2899,9 +2753,9 @@ fsp_reserve_free_pages(
 	ulint		space,		/*!< in: space id, must be != 0 */
 	fsp_header_t*	space_header,	/*!< in: header of that space,
 					x-latched */
-	ulint		size,		/*!< in: size of the tablespace in pages,
-					must be < FSP_EXTENT_SIZE / 2 */
-	mtr_t*		mtr)		/*!< in: mtr */
+	ulint		size,		/*!< in: size of the tablespace in
+					pages, must be < FSP_EXTENT_SIZE/2 */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	xdes_t*	descr;
 	ulint	n_used;
@@ -2960,7 +2814,7 @@ fsp_reserve_free_extents(
 	ulint	space,	/*!< in: space id */
 	ulint	n_ext,	/*!< in: number of extents to reserve */
 	ulint	alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
-	mtr_t*	mtr)	/*!< in: mtr */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	fsp_header_t*	space_header;
 	rw_lock_t*	latch;
@@ -2979,10 +2833,7 @@ fsp_reserve_free_extents(
 	*n_reserved = n_ext;
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -3084,8 +2935,6 @@ fsp_get_available_space_in_free_extents(
 	rw_lock_t*	latch;
 	mtr_t		mtr;
 
-	ut_ad(!mutex_own(&kernel_mutex));
-
 	/* The convoluted mutex acquire is to overcome latching order
 	issues: The problem is that the fil_mutex is at a lower level
 	than the tablespace latch and the buffer pool mutex. We have to
@@ -3098,7 +2947,7 @@ fsp_get_available_space_in_free_extents(
 	However, there is one further complication, we release the fil_mutex
 	when we need to invalidate the the pages in the buffer pool and we
 	reacquire the fil_mutex when deleting and freeing the tablespace
-	instance in fil0fil.c. Here we need to account for that situation
+	instance in fil0fil.cc. Here we need to account for that situation
 	too. */
 
 	mutex_enter(&dict_sys->mutex);
@@ -3121,7 +2970,7 @@ fsp_get_available_space_in_free_extents(
 	by another thread. However, the tablespace pages can still be freed
 	from the buffer pool. We need to check for that again. */
 
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, &mtr);
 
@@ -3204,21 +3053,27 @@ fsp_get_available_space_in_free_extents(
 /********************************************************************//**
 Marks a page used. The page must reside within the extents of the given
 segment. */
-static __attribute__((nonnull))
+static
 void
 fseg_mark_page_used(
 /*================*/
 	fseg_inode_t*	seg_inode,/*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
 	ulint		page,	/*!< in: page offset */
-	xdes_t*		descr, /* extent descriptor */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
+	xdes_t*	descr;
 	ulint	not_full_n_used;
 
+	ut_ad(seg_inode && mtr);
 	ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
 	ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
 	      == FSEG_MAGIC_N_VALUE);
 
+	descr = xdes_get_descriptor(space, zip_size, page, mtr);
+
 	ut_ad(mtr_read_ulint(seg_inode + FSEG_ID, MLOG_4BYTES, mtr)
 	      == mtr_read_ulint(descr + XDES_ID, MLOG_4BYTES, mtr));
 
@@ -3266,7 +3121,7 @@ fseg_free_page_low(
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	ulint		page,	/*!< in: page offset */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	xdes_t*	descr;
 	ulint	not_full_n_used;
@@ -3401,7 +3256,7 @@ fseg_free_page(
 	fseg_header_t*	seg_header, /*!< in: segment header */
 	ulint		space,	/*!< in: space id */
 	ulint		page,	/*!< in: page offset */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		flags;
 	ulint		zip_size;
@@ -3409,10 +3264,7 @@ fseg_free_page(
 	rw_lock_t*	latch;
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -3436,7 +3288,7 @@ fseg_free_extent(
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	ulint		page,	/*!< in: a page in the extent */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	first_page_in_extent;
 	xdes_t*	descr;
@@ -3511,7 +3363,7 @@ fseg_free_step(
 				resides on the first page of the frag list
 				of the segment, this pointer becomes obsolete
 				after the last freeing step */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		n;
 	ulint		page;
@@ -3527,10 +3379,7 @@ fseg_free_step(
 	header_page = page_get_page_no(page_align(header));
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -3596,7 +3445,7 @@ fseg_free_step_not_header(
 /*======================*/
 	fseg_header_t*	header,	/*!< in: segment header which must reside on
 				the first fragment page of the segment */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint		n;
 	ulint		page;
@@ -3611,10 +3460,7 @@ fseg_free_step_not_header(
 	space = page_get_space_id(page_align(header));
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
-
-	ut_ad(!mutex_own(&kernel_mutex)
-	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	mtr_x_lock(latch, mtr);
 
@@ -3664,7 +3510,7 @@ fseg_get_first_extent(
 	ulint		space,	/*!< in: space id */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fil_addr_t	first;
 	xdes_t*		descr;
@@ -3706,7 +3552,7 @@ ibool
 fseg_validate_low(
 /*==============*/
 	fseg_inode_t*	inode, /*!< in: segment inode */
-	mtr_t*		mtr2)	/*!< in: mtr */
+	mtr_t*		mtr2)	/*!< in/out: mini-transaction */
 {
 	ulint		space;
 	ib_id_t		seg_id;
@@ -3737,7 +3583,7 @@ fseg_validate_low(
 
 		mtr_start(&mtr);
 		mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
-		zip_size = dict_table_flags_to_zip_size(flags);
+		zip_size = fsp_flags_get_zip_size(flags);
 
 		descr = xdes_lst_get_descriptor(space, zip_size,
 						node_addr, &mtr);
@@ -3760,7 +3606,7 @@ fseg_validate_low(
 
 		mtr_start(&mtr);
 		mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
-		zip_size = dict_table_flags_to_zip_size(flags);
+		zip_size = fsp_flags_get_zip_size(flags);
 
 		descr = xdes_lst_get_descriptor(space, zip_size,
 						node_addr, &mtr);
@@ -3786,7 +3632,7 @@ fseg_validate_low(
 
 		mtr_start(&mtr);
 		mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
-		zip_size = dict_table_flags_to_zip_size(flags);
+		zip_size = fsp_flags_get_zip_size(flags);
 
 		descr = xdes_lst_get_descriptor(space, zip_size,
 						node_addr, &mtr);
@@ -3813,7 +3659,7 @@ ibool
 fseg_validate(
 /*==========*/
 	fseg_header_t*	header, /*!< in: segment header */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fseg_inode_t*	inode;
 	ibool		ret;
@@ -3824,7 +3670,7 @@ fseg_validate(
 	space = page_get_space_id(page_align(header));
 
 	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	inode = fseg_inode_get(header, space, zip_size, mtr);
 
@@ -3841,7 +3687,7 @@ void
 fseg_print_low(
 /*===========*/
 	fseg_inode_t*	inode, /*!< in: segment inode */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	space;
 	ulint	n_used;
@@ -3890,7 +3736,7 @@ void
 fseg_print(
 /*=======*/
 	fseg_header_t*	header, /*!< in: segment header */
-	mtr_t*		mtr)	/*!< in: mtr */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
 {
 	fseg_inode_t*	inode;
 	ulint		space;
@@ -3900,7 +3746,7 @@ fseg_print(
 	space = page_get_space_id(page_align(header));
 
 	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	inode = fseg_inode_get(header, space, zip_size, mtr);
 
@@ -3940,10 +3786,10 @@ fsp_validate(
 	ulint		seg_inode_len_full;
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 	ut_a(ut_is_2pow(zip_size));
-	ut_a(zip_size <= UNIV_PAGE_SIZE);
-	ut_a(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+	ut_a(zip_size <= UNIV_ZIP_SIZE_MAX);
+	ut_a(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN);
 
 	/* Start first a mini-transaction mtr2 to lock out all other threads
 	from the fsp system */
@@ -4190,7 +4036,7 @@ fsp_print(
 	mtr_t		mtr2;
 
 	latch = fil_space_get_latch(space, &flags);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	/* Start first a mini-transaction mtr2 to lock out all other threads
 	from the fsp system */
diff --git a/storage/innobase/fts/Makefile.query b/storage/innobase/fts/Makefile.query
new file mode 100644
index 00000000000..12dcd833064
--- /dev/null
+++ b/storage/innobase/fts/Makefile.query
@@ -0,0 +1,32 @@
+LEX=flex
+YACC=bison
+PREFIX=fts
+
+all:	fts0pars.cc fts0blex.cc fts0tlex.cc
+
+fts0par.cc: fts0pars.y
+fts0blex.cc: fts0blex.l
+fts0tlex.cc: fts0tlex.l
+
+.l.cc:
+	$(LEX) -P$(subst lex,,$*) -o $*.cc --header-file=../include/$*.h $<
+
+.y.cc:
+	$(YACC) -p $(PREFIX) -o $*.cc -d $<
+	mv $*.h ../include
+LEX=flex
+YACC=bison
+PREFIX=fts
+
+all:	fts0pars.cc fts0blex.cc fts0tlex.cc
+
+fts0par.cc: fts0pars.y
+fts0blex.cc: fts0blex.l
+fts0tlex.cc: fts0tlex.l
+
+.l.cc:
+	$(LEX) -P$(subst lex,,$*) -o $*.cc --header-file=../include/$*.h $<
+
+.y.cc:
+	$(YACC) -p $(PREFIX) -o $*.cc -d $<
+	mv $*.h ../include
diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc
new file mode 100644
index 00000000000..c01c43a021f
--- /dev/null
+++ b/storage/innobase/fts/fts0ast.cc
@@ -0,0 +1,416 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0ast.cc
+Full Text Search parser helper file.
+
+Created 2007/3/16 Sunny Bains.
+***********************************************************************/
+
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/******************************************************************//**
+Create an empty fts_ast_node_t.
+@return Create a new node */
+static
+fts_ast_node_t*
+fts_ast_node_create(void)
+/*=====================*/
+{
+	fts_ast_node_t*	node;
+
+	node = (fts_ast_node_t*) ut_malloc(sizeof(*node));
+	memset(node, 0x0, sizeof(*node));
+
+	return(node);
+}
+
+/******************************************************************//**
+Create a operator fts_ast_node_t.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+	void*		arg,			/*!< in: ast state instance */
+	fts_ast_oper_t	oper)			/*!< in: ast operator */
+{
+	fts_ast_node_t*	node = fts_ast_node_create();
+
+	node->type = FTS_AST_OPER;
+	node->oper = oper;
+
+	fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+	return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the ptr and is responsible
+for free'ing it
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+	void*		arg,			/*!< in: ast state instance */
+	const char*	ptr)			/*!< in: ast term string */
+{
+	ulint		len = strlen(ptr);
+	fts_ast_node_t*	node = fts_ast_node_create();
+
+	node->type = FTS_AST_TERM;
+
+	node->term.ptr = static_cast<byte*>(ut_malloc(len + 1));
+	memcpy(node->term.ptr, ptr, len + 1);
+
+	fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+	return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the ptr and is responsible
+for free'ing it.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+	void*		arg,			/*!< in: ast state instance */
+	const char*	ptr)			/*!< in: ast text string */
+{
+	/*!< We ignore the actual quotes "" */
+	ulint		len = strlen(ptr) - 2;
+	fts_ast_node_t*	node = fts_ast_node_create();
+
+	node->type = FTS_AST_TEXT;
+	node->text.ptr = static_cast<byte*>(ut_malloc(len + 1));
+
+	/*!< Skip copying the first quote */
+	memcpy(node->text.ptr, ptr + 1, len);
+	node->text.ptr[len] = 0;
+	node->text.distance = ULINT_UNDEFINED;
+
+	fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+	return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the expr and is responsible
+for free'ing it.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+	void*		arg,			/*!< in: ast state instance */
+	fts_ast_node_t*	expr)			/*!< in: ast expr instance */
+{
+	fts_ast_node_t*	node = fts_ast_node_create();
+
+	node->type = FTS_AST_LIST;
+	node->list.head = node->list.tail = expr;
+
+	fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+	return(node);
+}
+
+/******************************************************************//**
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+	void*		arg,			/*!< in: ast state instance */
+	fts_ast_node_t*	expr)			/*!< in: ast expr instance */
+{
+	fts_ast_node_t*	node = fts_ast_node_create();
+
+	node->type = FTS_AST_SUBEXP_LIST;
+	node->list.head = node->list.tail = expr;
+
+	fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+	return(node);
+}
+
+/******************************************************************//**
+Free an expr list node elements. */
+static
+void
+fts_ast_free_list(
+/*==============*/
+	fts_ast_node_t*	node)			/*!< in: ast node to free */
+{
+	ut_a(node->type == FTS_AST_LIST
+	     || node->type == FTS_AST_SUBEXP_LIST);
+
+	for (node = node->list.head;
+	     node != NULL;
+	     node = fts_ast_free_node(node)) {
+
+		/*!< No op */
+	}
+}
+
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+	fts_ast_node_t*	node)			/*!< in: the node to free */
+{
+	fts_ast_node_t*	next_node;
+
+	switch (node->type) {
+	case FTS_AST_TEXT:
+		if (node->text.ptr) {
+			ut_free(node->text.ptr);
+			node->text.ptr = NULL;
+		}
+		break;
+
+	case FTS_AST_TERM:
+		if (node->term.ptr) {
+			ut_free(node->term.ptr);
+			node->term.ptr = NULL;
+		}
+		break;
+
+	case FTS_AST_LIST:
+	case FTS_AST_SUBEXP_LIST:
+		fts_ast_free_list(node);
+		node->list.head = node->list.tail = NULL;
+		break;
+
+	case FTS_AST_OPER:
+		break;
+
+	default:
+		ut_error;
+	}
+
+	/*!< Get next node before freeing the node itself */
+	next_node = node->next;
+
+	ut_free(node);
+
+	return(next_node);
+}
+
+/******************************************************************//**
+This AST takes ownership of the expr and is responsible
+for free'ing it.
+@return in param "list" */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+	fts_ast_node_t*	node,			/*!< in: list instance */
+	fts_ast_node_t*	elem)			/*!< in: node to add to list */
+{
+	if (!elem) {
+		return(NULL);
+	}
+
+	ut_a(!elem->next);
+	ut_a(node->type == FTS_AST_LIST
+	     || node->type == FTS_AST_SUBEXP_LIST);
+
+	if (!node->list.head) {
+		ut_a(!node->list.tail);
+
+		node->list.head = node->list.tail = elem;
+	} else {
+		ut_a(node->list.tail);
+
+		node->list.tail->next = elem;
+		node->list.tail = elem;
+	}
+
+	return(node);
+}
+
+/******************************************************************//**
+For tracking node allocations, in case there is an error during
+parsing. */
+UNIV_INTERN
+void
+fts_ast_state_add_node(
+/*===================*/
+	fts_ast_state_t*state,			/*!< in: ast instance */
+	fts_ast_node_t*	node)			/*!< in: node to add to ast */
+{
+	if (!state->list.head) {
+		ut_a(!state->list.tail);
+
+		state->list.head = state->list.tail = node;
+	} else {
+		state->list.tail->next_alloc = node;
+		state->list.tail = node;
+	}
+}
+
+/******************************************************************//**
+Set the wildcard attribute of a term. */
+UNIV_INTERN
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+	fts_ast_node_t*	node)			/*!< in/out: set attribute of
+						a term node */
+{
+	ut_a(node->type == FTS_AST_TERM);
+	ut_a(!node->term.wildcard);
+
+	node->term.wildcard = TRUE;
+}
+
+/******************************************************************//**
+Set the proximity attribute of a text node. */
+UNIV_INTERN
+void
+fts_ast_term_set_distance(
+/*======================*/
+	fts_ast_node_t*	node,			/*!< in/out: text node */
+	ulint		distance)		/*!< in: the text proximity
+						distance */
+{
+	ut_a(node->type == FTS_AST_TEXT);
+	ut_a(node->text.distance == ULINT_UNDEFINED);
+
+	node->text.distance = distance;
+}
+
+/******************************************************************//**
+Free node and expr allocations. */
+UNIV_INTERN
+void
+fts_ast_state_free(
+/*===============*/
+	fts_ast_state_t*state)			/*!< in: ast state to free */
+{
+	fts_ast_node_t*	node = state->list.head;
+
+	/* Free the nodes that were allocated during parsing. */
+	while (node) {
+		fts_ast_node_t*	next = node->next_alloc;
+
+		if (node->type == FTS_AST_TEXT && node->text.ptr) {
+			ut_free(node->text.ptr);
+			node->text.ptr = NULL;
+		} else if (node->type == FTS_AST_TERM && node->term.ptr) {
+			ut_free(node->term.ptr);
+			node->term.ptr = NULL;
+		}
+
+		ut_free(node);
+		node = next;
+	}
+
+	state->root = state->list.head = state->list.tail = NULL;
+}
+
+/******************************************************************//**
+Print an ast node. */
+UNIV_INTERN
+void
+fts_ast_node_print(
+/*===============*/
+	fts_ast_node_t*	node)			/*!< in: ast node to print */
+{
+	switch (node->type) {
+	case FTS_AST_TEXT:
+		printf("TEXT: %s\n", node->text.ptr);
+		break;
+
+	case FTS_AST_TERM:
+		printf("TERM: %s\n", node->term.ptr);
+		break;
+
+	case FTS_AST_LIST:
+		printf("LIST: ");
+		node = node->list.head;
+
+		while (node) {
+			fts_ast_node_print(node);
+			node = node->next;
+		}
+		break;
+
+	case FTS_AST_SUBEXP_LIST:
+		printf("SUBEXP_LIST: ");
+		node = node->list.head;
+
+		while (node) {
+			fts_ast_node_print(node);
+			node = node->next;
+		}
+	case FTS_AST_OPER:
+		printf("OPER: %d\n", node->oper);
+		break;
+
+	default:
+		ut_error;
+	}
+}
+
+/******************************************************************//**
+Traverse the AST - in-order traversal.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+ulint
+fts_ast_visit(
+/*==========*/
+	fts_ast_oper_t		oper,		/*!< in: current operator */
+	fts_ast_node_t*		node,		/*!< in: current root node */
+	fts_ast_callback	visitor,	/*!< in: callback function */
+	void*			arg)		/*!< in: arg for callback */
+{
+	ulint			error = DB_SUCCESS;
+
+	ut_a(node->type == FTS_AST_LIST
+	     || node->type == FTS_AST_SUBEXP_LIST);
+
+	for (node = node->list.head;
+	     node && error == DB_SUCCESS;
+	     node = node->next) {
+
+		if (node->type == FTS_AST_LIST) {
+			error = fts_ast_visit(oper, node, visitor, arg);
+		} else if (node->type == FTS_AST_SUBEXP_LIST) {
+			error = fts_ast_visit_sub_exp(node, visitor, arg);
+		} else if (node->type == FTS_AST_OPER) {
+			oper = node->oper;
+		} else {
+			visitor(oper, node, arg);
+		}
+	}
+
+	return(error);
+}
diff --git a/storage/innobase/fts/fts0blex.cc b/storage/innobase/fts/fts0blex.cc
new file mode 100644
index 00000000000..b3350010db0
--- /dev/null
+++ b/storage/innobase/fts/fts0blex.cc
@@ -0,0 +1,1955 @@
+#include "univ.i"
+#line 2 "fts0blex.cc"
+
+#line 4 "fts0blex.cc"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif	/* defined (__STDC__) */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an unsigned
+ * integer for use as an array index.  If the signed char is negative,
+ * we want to instead treat it as an 8-bit unsigned char, hence the
+ * double cast.
+ */
+#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Enter a start condition.  This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN yyg->yy_start = 1 + 2 *
+
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state.  The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START ((yyg->yy_start - 1) / 2)
+#define YYSTATE YY_START
+
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE fts0brestart(yyin ,yyscanner )
+
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE   ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+    #define YY_LESS_LINENO(n)
+
+/* Return all but the first "n" matched characters back to the input stream. */
+#define yyless(n) \
+	do \
+		{ \
+		/* Undo effects of setting up yytext. */ \
+        int yyless_macro_arg = (n); \
+        YY_LESS_LINENO(yyless_macro_arg);\
+		*yy_cp = yyg->yy_hold_char; \
+		YY_RESTORE_YY_MORE_OFFSET \
+		yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+		YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+		} \
+	while ( 0 )
+
+#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner )
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	int yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+	/* When an EOF's been seen but there's still some text to process
+	 * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+	 * shouldn't try reading from the input source any more.  We might
+	 * still have a bunch of tokens to match, though, because of
+	 * possible backing-up.
+	 *
+	 * When we actually see the EOF, we change the status to "new"
+	 * (via fts0brestart()), so that the user can continue scanning by
+	 * just pointing yyin at a new input file.
+	 */
+#define YY_BUFFER_EOF_PENDING 2
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \
+                          ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \
+                          : NULL)
+
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top]
+
+void fts0brestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0b_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0b_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0bpop_buffer_state (yyscan_t yyscanner );
+
+static void fts0bensure_buffer_stack (yyscan_t yyscanner );
+static void fts0b_load_buffer_state (yyscan_t yyscanner );
+static void fts0b_init_buffer (YY_BUFFER_STATE b,FILE *file ,yyscan_t yyscanner );
+
+#define YY_FLUSH_BUFFER fts0b_flush_buffer(YY_CURRENT_BUFFER ,yyscanner)
+
+YY_BUFFER_STATE fts0b_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0balloc (yy_size_t , yyscan_t yyscanner __attribute__((unused)) );
+void *fts0brealloc (void *,yy_size_t , yyscan_t yyscanner __attribute__((unused)) );
+void fts0bfree (void * , yyscan_t yyscanner __attribute__((unused)) );
+
+#define yy_new_buffer fts0b_create_buffer
+
+#define yy_set_interactive(is_interactive) \
+	{ \
+	if ( ! YY_CURRENT_BUFFER ){ \
+        fts0bensure_buffer_stack (yyscanner); \
+		YY_CURRENT_BUFFER_LVALUE =    \
+            fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+	} \
+	YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+	}
+
+#define yy_set_bol(at_bol) \
+	{ \
+	if ( ! YY_CURRENT_BUFFER ){\
+        fts0bensure_buffer_stack (yyscanner); \
+		YY_CURRENT_BUFFER_LVALUE =    \
+            fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+	} \
+	YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+	}
+
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define fts0bwrap(n) 1
+#define YY_SKIP_YYWRAP
+
+typedef unsigned char YY_CHAR;
+
+typedef int yy_state_type;
+
+#define yytext_ptr yytext_r
+
+static yy_state_type yy_get_previous_state (yyscan_t yyscanner );
+static yy_state_type yy_try_NUL_trans (yy_state_type current_state  ,yyscan_t yyscanner);
+static int yy_get_next_buffer (yyscan_t yyscanner );
+static void yy_fatal_error (yyconst char msg[] , yyscan_t yyscanner __attribute__((unused)) );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ */
+#define YY_DO_BEFORE_ACTION \
+	yyg->yytext_ptr = yy_bp; \
+	yyleng = (size_t) (yy_cp - yy_bp); \
+	yyg->yy_hold_char = *yy_cp; \
+	*yy_cp = '\0'; \
+	yyg->yy_c_buf_p = yy_cp;
+
+#define YY_NUM_RULES 7
+#define YY_END_OF_BUFFER 8
+/* This struct is not used in this scanner,
+   but its presence is necessary. */
+struct yy_trans_info
+	{
+	flex_int32_t yy_verify;
+	flex_int32_t yy_nxt;
+	};
+static yyconst flex_int16_t yy_accept[18] =
+    {   0,
+        4,    4,    8,    4,    1,    6,    1,    7,    2,    3,
+        4,    1,    1,    0,    5,    3,    0
+    } ;
+
+static yyconst flex_int32_t yy_ec[256] =
+    {   0,
+        1,    1,    1,    1,    1,    1,    1,    1,    2,    3,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    4,    1,    5,    1,    1,    1,    1,    1,    6,
+        6,    6,    6,    1,    6,    1,    1,    7,    7,    7,
+        7,    7,    7,    7,    7,    7,    7,    1,    1,    6,
+        1,    6,    1,    6,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    6,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1
+    } ;
+
+static yyconst flex_int32_t yy_meta[8] =
+    {   0,
+        1,    2,    3,    4,    5,    5,    1
+    } ;
+
+static yyconst flex_int16_t yy_base[21] =
+    {   0,
+        0,    0,   21,    0,    6,   22,    0,   13,   22,    7,
+        0,    0,    0,    4,   22,    0,   22,   10,   11,   15
+    } ;
+
+static yyconst flex_int16_t yy_def[21] =
+    {   0,
+       17,    1,   17,   18,   18,   17,   19,   20,   17,   18,
+       18,    5,   19,   20,   17,   10,    0,   17,   17,   17
+    } ;
+
+static yyconst flex_int16_t yy_nxt[30] =
+    {   0,
+        4,    5,    6,    7,    8,    9,   10,   12,   15,   13,
+       11,   11,   13,   16,   13,   14,   14,   15,   14,   14,
+       17,    3,   17,   17,   17,   17,   17,   17,   17
+    } ;
+
+static yyconst flex_int16_t yy_chk[30] =
+    {   0,
+        1,    1,    1,    1,    1,    1,    1,    5,   14,    5,
+       18,   18,   19,   10,   19,   20,   20,    8,   20,   20,
+        3,   17,   17,   17,   17,   17,   17,   17,   17
+    } ;
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0
+#define YY_RESTORE_YY_MORE_OFFSET
+#line 1 "fts0blex.l"
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**
+ * @file fts/fts0blex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+#line 27 "fts0blex.l"
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner)
+
+#define YY_NO_INPUT 1
+#line 480 "fts0blex.cc"
+
+#define INITIAL 0
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+/* Holds the entire state of the reentrant scanner. */
+struct yyguts_t
+    {
+
+    /* User-defined. Not touched by flex. */
+    YY_EXTRA_TYPE yyextra_r;
+
+    /* The rest are the same as the globals declared in the non-reentrant scanner. */
+    FILE *yyin_r, *yyout_r;
+    size_t yy_buffer_stack_top; /**< index of top of stack. */
+    size_t yy_buffer_stack_max; /**< capacity of stack. */
+    YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */
+    char yy_hold_char;
+    int yy_n_chars;
+    int yyleng_r;
+    char *yy_c_buf_p;
+    int yy_init;
+    int yy_start;
+    int yy_did_buffer_switch_on_eof;
+    int yy_start_stack_ptr;
+    int yy_start_stack_depth;
+    int *yy_start_stack;
+    yy_state_type yy_last_accepting_state;
+    char* yy_last_accepting_cpos;
+
+    int yylineno_r;
+    int yy_flex_debug_r;
+
+    char *yytext_r;
+    int yy_more_flag;
+    int yy_more_len;
+
+    }; /* end struct yyguts_t */
+
+static int yy_init_globals (yyscan_t yyscanner );
+
+int fts0blex_init (yyscan_t* scanner);
+
+int fts0blex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int fts0blex_destroy (yyscan_t yyscanner );
+
+int fts0bget_debug (yyscan_t yyscanner );
+
+void fts0bset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner );
+
+void fts0bset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0bget_in (yyscan_t yyscanner );
+
+void fts0bset_in  (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0bget_out (yyscan_t yyscanner );
+
+void fts0bset_out  (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0bget_leng (yyscan_t yyscanner );
+
+char *fts0bget_text (yyscan_t yyscanner );
+
+int fts0bget_lineno (yyscan_t yyscanner );
+
+void fts0bset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int fts0bwrap (yyscan_t yyscanner );
+#else
+extern int fts0bwrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int , yyscan_t yyscanner __attribute__((unused)));
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * , yyscan_t yyscanner __attribute__((unused)));
+#endif
+
+#ifndef YY_NO_INPUT
+
+#ifdef __cplusplus
+static int yyinput (yyscan_t yyscanner );
+#else
+static int input (yyscan_t yyscanner );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0)
+#endif
+
+/* Gets input and stuffs it into "buf".  number of characters read, or YY_NULL,
+ * is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+		{ \
+		int c = '*'; \
+		size_t n; \
+		for ( n = 0; n < max_size && \
+			     (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+			buf[n] = (char) c; \
+		if ( c == '\n' ) \
+			buf[n++] = (char) c; \
+		if ( c == EOF && ferror( yyin ) ) \
+			YY_FATAL_ERROR( "input in flex scanner failed" ); \
+		result = n; \
+		} \
+	else \
+		{ \
+		errno=0; \
+		while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \
+			{ \
+			if( errno != EINTR) \
+				{ \
+				YY_FATAL_ERROR( "input in flex scanner failed" ); \
+				break; \
+				} \
+			errno=0; \
+			clearerr(yyin); \
+			} \
+		}\
+\
+
+#endif
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner)
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0blex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0blex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK break;
+#endif
+
+#define YY_RULE_SETUP \
+	YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ */
+YY_DECL
+{
+	register yy_state_type yy_current_state;
+	register char *yy_cp, *yy_bp;
+	register int yy_act;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+#line 43 "fts0blex.l"
+
+
+#line 707 "fts0blex.cc"
+
+	if ( !yyg->yy_init )
+		{
+		yyg->yy_init = 1;
+
+#ifdef YY_USER_INIT
+		YY_USER_INIT;
+#endif
+
+		if ( ! yyg->yy_start )
+			yyg->yy_start = 1;	/* first start state */
+
+		if ( ! yyin )
+			yyin = stdin;
+
+		if ( ! yyout )
+			yyout = stdout;
+
+		if ( ! YY_CURRENT_BUFFER ) {
+			fts0bensure_buffer_stack (yyscanner);
+			YY_CURRENT_BUFFER_LVALUE =
+				fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+		}
+
+		fts0b_load_buffer_state(yyscanner );
+		}
+
+	while ( 1 )		/* loops until end-of-file is reached */
+		{
+		yy_cp = yyg->yy_c_buf_p;
+
+		/* Support of yytext. */
+		*yy_cp = yyg->yy_hold_char;
+
+		/* yy_bp points to the position in yy_ch_buf of the start of
+		 * the current run.
+		 */
+		yy_bp = yy_cp;
+
+		yy_current_state = yyg->yy_start;
+yy_match:
+		do
+			{
+			register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)];
+			if ( yy_accept[yy_current_state] )
+				{
+				yyg->yy_last_accepting_state = yy_current_state;
+				yyg->yy_last_accepting_cpos = yy_cp;
+				}
+			while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+				{
+				yy_current_state = (int) yy_def[yy_current_state];
+				if ( yy_current_state >= 18 )
+					yy_c = yy_meta[(unsigned int) yy_c];
+				}
+			yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+			++yy_cp;
+			}
+		while ( yy_current_state != 17 );
+		yy_cp = yyg->yy_last_accepting_cpos;
+		yy_current_state = yyg->yy_last_accepting_state;
+
+yy_find_action:
+		yy_act = yy_accept[yy_current_state];
+
+		YY_DO_BEFORE_ACTION;
+
+do_action:	/* This label is used only to access EOF actions. */
+
+		switch ( yy_act )
+	{ /* beginning of action switch */
+			case 0: /* must back up */
+			/* undo the effects of YY_DO_BEFORE_ACTION */
+			*yy_cp = yyg->yy_hold_char;
+			yy_cp = yyg->yy_last_accepting_cpos;
+			yy_current_state = yyg->yy_last_accepting_state;
+			goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 45 "fts0blex.l"
+/* Ignore whitespace */ ;
+	YY_BREAK
+case 2:
+YY_RULE_SETUP
+#line 47 "fts0blex.l"
+{
+	val->oper = fts0bget_text(yyscanner)[0];
+
+	return(val->oper);
+}
+	YY_BREAK
+case 3:
+YY_RULE_SETUP
+#line 53 "fts0blex.l"
+{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_NUMB);
+}
+	YY_BREAK
+case 4:
+YY_RULE_SETUP
+#line 59 "fts0blex.l"
+{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_TERM);
+}
+	YY_BREAK
+case 5:
+YY_RULE_SETUP
+#line 65 "fts0blex.l"
+{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_TEXT);
+}
+	YY_BREAK
+case 6:
+/* rule 6 can match eol */
+YY_RULE_SETUP
+#line 71 "fts0blex.l"
+
+	YY_BREAK
+case 7:
+YY_RULE_SETUP
+#line 73 "fts0blex.l"
+ECHO;
+	YY_BREAK
+#line 838 "fts0blex.cc"
+case YY_STATE_EOF(INITIAL):
+	yyterminate();
+
+	case YY_END_OF_BUFFER:
+		{
+		/* Amount of text matched not including the EOB char. */
+		int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1;
+
+		/* Undo the effects of YY_DO_BEFORE_ACTION. */
+		*yy_cp = yyg->yy_hold_char;
+		YY_RESTORE_YY_MORE_OFFSET
+
+		if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+			{
+			/* We're scanning a new file or input source.  It's
+			 * possible that this happened because the user
+			 * just pointed yyin at a new source and called
+			 * fts0blex().  If so, then we have to assure
+			 * consistency between YY_CURRENT_BUFFER and our
+			 * globals.  Here is the right place to do so, because
+			 * this is the first action (other than possibly a
+			 * back-up) that will match for the new input source.
+			 */
+			yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+			YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+			YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+			}
+
+		/* Note that here we test for yy_c_buf_p "<=" to the position
+		 * of the first EOB in the buffer, since yy_c_buf_p will
+		 * already have been incremented past the NUL character
+		 * (since all states make transitions on EOB to the
+		 * end-of-buffer state).  Contrast this with the test
+		 * in input().
+		 */
+		if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+			{ /* This was really a NUL. */
+			yy_state_type yy_next_state;
+
+			yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text;
+
+			yy_current_state = yy_get_previous_state( yyscanner );
+
+			/* Okay, we're now positioned to make the NUL
+			 * transition.  We couldn't have
+			 * yy_get_previous_state() go ahead and do it
+			 * for us because it doesn't know how to deal
+			 * with the possibility of jamming (and we don't
+			 * want to build jamming into it because then it
+			 * will run more slowly).
+			 */
+
+			yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner);
+
+			yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+
+			if ( yy_next_state )
+				{
+				/* Consume the NUL. */
+				yy_cp = ++yyg->yy_c_buf_p;
+				yy_current_state = yy_next_state;
+				goto yy_match;
+				}
+
+			else
+				{
+				yy_cp = yyg->yy_last_accepting_cpos;
+				yy_current_state = yyg->yy_last_accepting_state;
+				goto yy_find_action;
+				}
+			}
+
+		else switch ( yy_get_next_buffer( yyscanner ) )
+			{
+			case EOB_ACT_END_OF_FILE:
+				{
+				yyg->yy_did_buffer_switch_on_eof = 0;
+
+				if ( fts0bwrap(yyscanner ) )
+					{
+					/* Note: because we've taken care in
+					 * yy_get_next_buffer() to have set up
+					 * yytext, we can now set up
+					 * yy_c_buf_p so that if some total
+					 * hoser (like flex itself) wants to
+					 * call the scanner after we return the
+					 * YY_NULL, it'll still work - another
+					 * YY_NULL will get returned.
+					 */
+					yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ;
+
+					yy_act = YY_STATE_EOF(YY_START);
+					goto do_action;
+					}
+
+				else
+					{
+					if ( ! yyg->yy_did_buffer_switch_on_eof )
+						YY_NEW_FILE;
+					}
+				break;
+				}
+
+			case EOB_ACT_CONTINUE_SCAN:
+				yyg->yy_c_buf_p =
+					yyg->yytext_ptr + yy_amount_of_matched_text;
+
+				yy_current_state = yy_get_previous_state( yyscanner );
+
+				yy_cp = yyg->yy_c_buf_p;
+				yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+				goto yy_match;
+
+			case EOB_ACT_LAST_MATCH:
+				yyg->yy_c_buf_p =
+				&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars];
+
+				yy_current_state = yy_get_previous_state( yyscanner );
+
+				yy_cp = yyg->yy_c_buf_p;
+				yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+				goto yy_find_action;
+			}
+		break;
+		}
+
+	default:
+		YY_FATAL_ERROR(
+			"fatal flex scanner internal error--no action found" );
+	} /* end of action switch */
+		} /* end of scanning one token */
+} /* end of fts0blex */
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Returns a code representing an action:
+ *	EOB_ACT_LAST_MATCH -
+ *	EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ *	EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+	register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+	register char *source = yyg->yytext_ptr;
+	register int number_to_move, i;
+	int ret_val;
+
+	if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] )
+		YY_FATAL_ERROR(
+		"fatal flex scanner internal error--end of buffer missed" );
+
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+		{ /* Don't try to fill the buffer, so this is an EOF. */
+		if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 )
+			{
+			/* We matched a single character, the EOB, so
+			 * treat this as a final EOF.
+			 */
+			return EOB_ACT_END_OF_FILE;
+			}
+
+		else
+			{
+			/* We matched some text prior to the EOB, first
+			 * process it.
+			 */
+			return EOB_ACT_LAST_MATCH;
+			}
+		}
+
+	/* Try to read more data. */
+
+	/* First move last chars to start of buffer. */
+	number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr) - 1;
+
+	for ( i = 0; i < number_to_move; ++i )
+		*(dest++) = *(source++);
+
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+		/* don't do the read, it's not guaranteed to return an EOF,
+		 * just force an EOF
+		 */
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0;
+
+	else
+		{
+			int num_to_read =
+			YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
+
+		while ( num_to_read <= 0 )
+			{ /* Not enough room in the buffer - grow it. */
+
+			/* just a shorter name for the current buffer */
+			YY_BUFFER_STATE b = YY_CURRENT_BUFFER;
+
+			int yy_c_buf_p_offset =
+				(int) (yyg->yy_c_buf_p - b->yy_ch_buf);
+
+			if ( b->yy_is_our_buffer )
+				{
+				int new_size = b->yy_buf_size * 2;
+
+				if ( new_size <= 0 )
+					b->yy_buf_size += b->yy_buf_size / 8;
+				else
+					b->yy_buf_size *= 2;
+
+				b->yy_ch_buf = (char*)
+					/* Include room in for 2 EOB chars. */
+					fts0brealloc((void*) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner );
+				}
+			else
+				/* Can't grow it, we don't own it. */
+				b->yy_ch_buf = 0;
+
+			if ( ! b->yy_ch_buf )
+				YY_FATAL_ERROR(
+				"fatal error - scanner input buffer overflow" );
+
+			yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset];
+
+			num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size -
+						number_to_move - 1;
+
+			}
+
+		if ( num_to_read > YY_READ_BUF_SIZE )
+			num_to_read = YY_READ_BUF_SIZE;
+
+		/* Read in more data. */
+		YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+			yyg->yy_n_chars, (size_t) num_to_read );
+
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	if ( yyg->yy_n_chars == 0 )
+		{
+		if ( number_to_move == YY_MORE_ADJ )
+			{
+			ret_val = EOB_ACT_END_OF_FILE;
+			fts0brestart(yyin  ,yyscanner);
+			}
+
+		else
+			{
+			ret_val = EOB_ACT_LAST_MATCH;
+			YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+				YY_BUFFER_EOF_PENDING;
+			}
+		}
+
+	else
+		ret_val = EOB_ACT_CONTINUE_SCAN;
+
+	if ((yy_size_t) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+		/* Extend the array by 50%, plus the number we really need. */
+		yy_size_t new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1);
+		YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char*) fts0brealloc((void*) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner );
+		if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+			YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+	}
+
+	yyg->yy_n_chars += number_to_move;
+	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR;
+	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR;
+
+	yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+	return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+    static yy_state_type yy_get_previous_state (yyscan_t yyscanner)
+{
+	register yy_state_type yy_current_state;
+	register char *yy_cp;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	yy_current_state = yyg->yy_start;
+
+	for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp )
+		{
+		register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1);
+		if ( yy_accept[yy_current_state] )
+			{
+			yyg->yy_last_accepting_state = yy_current_state;
+			yyg->yy_last_accepting_cpos = yy_cp;
+			}
+		while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+			{
+			yy_current_state = (int) yy_def[yy_current_state];
+			if ( yy_current_state >= 18 )
+				yy_c = yy_meta[(unsigned int) yy_c];
+			}
+		yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+		}
+
+	return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ *
+ * synopsis
+ *	next_state = yy_try_NUL_trans( current_state );
+ */
+    static yy_state_type yy_try_NUL_trans  (yy_state_type yy_current_state , yyscan_t yyscanner)
+{
+	register int yy_is_jam;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; /* This var may be unused depending upon options. */
+	register char *yy_cp = yyg->yy_c_buf_p;
+
+	register YY_CHAR yy_c = 1;
+	if ( yy_accept[yy_current_state] )
+		{
+		yyg->yy_last_accepting_state = yy_current_state;
+		yyg->yy_last_accepting_cpos = yy_cp;
+		}
+	while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+		{
+		yy_current_state = (int) yy_def[yy_current_state];
+		if ( yy_current_state >= 18 )
+			yy_c = yy_meta[(unsigned int) yy_c];
+		}
+	yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+	yy_is_jam = (yy_current_state == 17);
+
+	return yy_is_jam ? 0 : yy_current_state;
+}
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+    static int yyinput (yyscan_t yyscanner)
+#else
+    static int input  (yyscan_t yyscanner)
+#endif
+
+{
+	int c;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	*yyg->yy_c_buf_p = yyg->yy_hold_char;
+
+	if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR )
+		{
+		/* yy_c_buf_p now points to the character we want to return.
+		 * If this occurs *before* the EOB characters, then it's a
+		 * valid NUL; if not, then we've hit the end of the buffer.
+		 */
+		if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+			/* This was really a NUL. */
+			*yyg->yy_c_buf_p = '\0';
+
+		else
+			{ /* need more input */
+			int offset = yyg->yy_c_buf_p - yyg->yytext_ptr;
+			++yyg->yy_c_buf_p;
+
+			switch ( yy_get_next_buffer( yyscanner ) )
+				{
+				case EOB_ACT_LAST_MATCH:
+					/* This happens because yy_g_n_b()
+					 * sees that we've accumulated a
+					 * token and flags that we need to
+					 * try matching the token before
+					 * proceeding.  But for input(),
+					 * there's no matching to consider.
+					 * So convert the EOB_ACT_LAST_MATCH
+					 * to EOB_ACT_END_OF_FILE.
+					 */
+
+					/* Reset buffer status. */
+					fts0brestart(yyin ,yyscanner);
+
+					/*FALLTHROUGH*/
+
+				case EOB_ACT_END_OF_FILE:
+					{
+					if ( fts0bwrap(yyscanner ) )
+						return EOF;
+
+					if ( ! yyg->yy_did_buffer_switch_on_eof )
+						YY_NEW_FILE;
+#ifdef __cplusplus
+					return yyinput(yyscanner);
+#else
+					return input(yyscanner);
+#endif
+					}
+
+				case EOB_ACT_CONTINUE_SCAN:
+					yyg->yy_c_buf_p = yyg->yytext_ptr + offset;
+					break;
+				}
+			}
+		}
+
+	c = *(unsigned char*) yyg->yy_c_buf_p;	/* cast for 8-bit char's */
+	*yyg->yy_c_buf_p = '\0';	/* preserve yytext */
+	yyg->yy_hold_char = *++yyg->yy_c_buf_p;
+
+	return c;
+}
+#endif	/* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * @param input_file A readable stream.
+ * @param yyscanner The scanner object.
+ * @note This function does not reset the start condition to @c INITIAL .
+ */
+    void fts0brestart  (FILE * input_file , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	if ( ! YY_CURRENT_BUFFER ){
+        fts0bensure_buffer_stack (yyscanner);
+		YY_CURRENT_BUFFER_LVALUE =
+            fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+	}
+
+	fts0b_init_buffer(YY_CURRENT_BUFFER,input_file ,yyscanner);
+	fts0b_load_buffer_state(yyscanner );
+}
+
+/** Switch to a different input buffer.
+ * @param new_buffer The new input buffer.
+ * @param yyscanner The scanner object.
+ */
+    void fts0b_switch_to_buffer  (YY_BUFFER_STATE  new_buffer , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	/* TODO. We should be able to replace this entire function body
+	 * with
+	 *		fts0bpop_buffer_state();
+	 *		fts0bpush_buffer_state(new_buffer);
+     */
+	fts0bensure_buffer_stack (yyscanner);
+	if ( YY_CURRENT_BUFFER == new_buffer )
+		return;
+
+	if ( YY_CURRENT_BUFFER )
+		{
+		/* Flush out information for old buffer. */
+		*yyg->yy_c_buf_p = yyg->yy_hold_char;
+		YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	YY_CURRENT_BUFFER_LVALUE = new_buffer;
+	fts0b_load_buffer_state(yyscanner );
+
+	/* We don't actually know whether we did this switch during
+	 * EOF (fts0bwrap()) processing, but the only time this flag
+	 * is looked at is after fts0bwrap() is called, so it's safe
+	 * to go ahead and always set it.
+	 */
+	yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+static void fts0b_load_buffer_state  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+	yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+	yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+	yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+	yyg->yy_hold_char = *yyg->yy_c_buf_p;
+}
+
+/** Allocate and initialize an input buffer state.
+ * @param file A readable stream.
+ * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
+ * @param yyscanner The scanner object.
+ * @return the allocated buffer state.
+ */
+    YY_BUFFER_STATE fts0b_create_buffer  (FILE * file, int  size , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE b;
+
+	b = (YY_BUFFER_STATE) fts0balloc(sizeof( struct yy_buffer_state ) ,yyscanner );
+	if ( ! b )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0b_create_buffer()" );
+
+	b->yy_buf_size = size;
+
+	/* yy_ch_buf has to be 2 characters longer than the size given because
+	 * we need to put in 2 end-of-buffer characters.
+	 */
+	b->yy_ch_buf = (char*) fts0balloc(b->yy_buf_size + 2 ,yyscanner );
+	if ( ! b->yy_ch_buf )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0b_create_buffer()" );
+
+	b->yy_is_our_buffer = 1;
+
+	fts0b_init_buffer(b,file ,yyscanner);
+
+	return b;
+}
+
+/** Destroy the buffer.
+ * @param b a buffer created with fts0b_create_buffer()
+ * @param yyscanner The scanner object.
+ */
+    void fts0b_delete_buffer (YY_BUFFER_STATE  b , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	if ( ! b )
+		return;
+
+	if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+		YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+	if ( b->yy_is_our_buffer )
+		fts0bfree((void*) b->yy_ch_buf ,yyscanner );
+
+	fts0bfree((void*) b ,yyscanner );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a fts0brestart() or at EOF.
+ */
+    static void fts0b_init_buffer  (YY_BUFFER_STATE  b, FILE * file , yyscan_t yyscanner)
+
+{
+	int oerrno = errno;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	fts0b_flush_buffer(b ,yyscanner);
+
+	b->yy_input_file = file;
+	b->yy_fill_buffer = 1;
+
+    /* If b is the current buffer, then fts0b_init_buffer was _probably_
+     * called from fts0brestart() or through yy_get_next_buffer.
+     * In that case, we don't want to reset the lineno or column.
+     */
+    if (b != YY_CURRENT_BUFFER){
+        b->yy_bs_lineno = 1;
+        b->yy_bs_column = 0;
+    }
+
+        b->yy_is_interactive = 0;
+
+	errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ * @param yyscanner The scanner object.
+ */
+    void fts0b_flush_buffer (YY_BUFFER_STATE  b , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+	if ( ! b )
+		return;
+
+	b->yy_n_chars = 0;
+
+	/* We always need two end-of-buffer characters.  The first causes
+	 * a transition to the end-of-buffer state.  The second causes
+	 * a jam in that state.
+	 */
+	b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+	b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+	b->yy_buf_pos = &b->yy_ch_buf[0];
+
+	b->yy_at_bol = 1;
+	b->yy_buffer_status = YY_BUFFER_NEW;
+
+	if ( b == YY_CURRENT_BUFFER )
+		fts0b_load_buffer_state(yyscanner );
+}
+
+/** Pushes the new state onto the stack. The new state becomes
+ *  the current state. This function will allocate the stack
+ *  if necessary.
+ *  @param new_buffer The new state.
+ *  @param yyscanner The scanner object.
+ */
+void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+	if (new_buffer == NULL)
+		return;
+
+	fts0bensure_buffer_stack(yyscanner);
+
+	/* This block is copied from fts0b_switch_to_buffer. */
+	if ( YY_CURRENT_BUFFER )
+		{
+		/* Flush out information for old buffer. */
+		*yyg->yy_c_buf_p = yyg->yy_hold_char;
+		YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	/* Only push if top exists. Otherwise, replace top. */
+	if (YY_CURRENT_BUFFER)
+		yyg->yy_buffer_stack_top++;
+	YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+	/* copied from fts0b_switch_to_buffer. */
+	fts0b_load_buffer_state(yyscanner );
+	yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ *  The next element becomes the new top.
+ *  @param yyscanner The scanner object.
+ */
+void fts0bpop_buffer_state (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+	if (!YY_CURRENT_BUFFER)
+		return;
+
+	fts0b_delete_buffer(YY_CURRENT_BUFFER ,yyscanner);
+	YY_CURRENT_BUFFER_LVALUE = NULL;
+	if (yyg->yy_buffer_stack_top > 0)
+		--yyg->yy_buffer_stack_top;
+
+	if (YY_CURRENT_BUFFER) {
+		fts0b_load_buffer_state(yyscanner );
+		yyg->yy_did_buffer_switch_on_eof = 1;
+	}
+}
+
+/* Allocates the stack if it does not exist.
+ *  Guarantees space for at least one push.
+ */
+static void fts0bensure_buffer_stack (yyscan_t yyscanner)
+{
+	int num_to_alloc;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	if (!yyg->yy_buffer_stack) {
+
+		/* First allocation is just for 2 elements, since we don't know if this
+		 * scanner will even need a stack. We use 2 instead of 1 to avoid an
+		 * immediate realloc on the next call.
+         */
+		num_to_alloc = 1;
+		yyg->yy_buffer_stack = (struct yy_buffer_state**) fts0balloc
+								(num_to_alloc * sizeof(struct yy_buffer_state*)
+								, yyscanner);
+		if ( ! yyg->yy_buffer_stack )
+			YY_FATAL_ERROR( "out of dynamic memory in fts0bensure_buffer_stack()" );
+
+		memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+
+		yyg->yy_buffer_stack_max = num_to_alloc;
+		yyg->yy_buffer_stack_top = 0;
+		return;
+	}
+
+	if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){
+
+		/* Increase the buffer to prepare for a possible push. */
+		int grow_size = 8 /* arbitrary grow size */;
+
+		num_to_alloc = yyg->yy_buffer_stack_max + grow_size;
+		yyg->yy_buffer_stack = (struct yy_buffer_state**) fts0brealloc
+								(yyg->yy_buffer_stack,
+								num_to_alloc * sizeof(struct yy_buffer_state*)
+								, yyscanner);
+		if ( ! yyg->yy_buffer_stack )
+			YY_FATAL_ERROR( "out of dynamic memory in fts0bensure_buffer_stack()" );
+
+		/* zero only the new slots.*/
+		memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*));
+		yyg->yy_buffer_stack_max = num_to_alloc;
+	}
+}
+
+/** Setup the input buffer state to scan directly from a user-specified character buffer.
+ * @param base the character buffer
+ * @param size the size in bytes of the character buffer
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE fts0b_scan_buffer  (char * base, yy_size_t  size , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE b;
+
+	if ( size < 2 ||
+	     base[size-2] != YY_END_OF_BUFFER_CHAR ||
+	     base[size-1] != YY_END_OF_BUFFER_CHAR )
+		/* They forgot to leave room for the EOB's. */
+		return 0;
+
+	b = (YY_BUFFER_STATE) fts0balloc(sizeof( struct yy_buffer_state ) ,yyscanner );
+	if ( ! b )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0b_scan_buffer()" );
+
+	b->yy_buf_size = size - 2;	/* "- 2" to take care of EOB's */
+	b->yy_buf_pos = b->yy_ch_buf = base;
+	b->yy_is_our_buffer = 0;
+	b->yy_input_file = 0;
+	b->yy_n_chars = b->yy_buf_size;
+	b->yy_is_interactive = 0;
+	b->yy_at_bol = 1;
+	b->yy_fill_buffer = 0;
+	b->yy_buffer_status = YY_BUFFER_NEW;
+
+	fts0b_switch_to_buffer(b ,yyscanner );
+
+	return b;
+}
+
+/** Setup the input buffer state to scan a string. The next call to fts0blex() will
+ * scan from a @e copy of @a str.
+ * @param yystr a NUL-terminated string to scan
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ * @note If you want to scan bytes that may contain NUL values, then use
+ *       fts0b_scan_bytes() instead.
+ */
+YY_BUFFER_STATE fts0b_scan_string (yyconst char * yystr , yyscan_t yyscanner)
+{
+
+	return fts0b_scan_bytes(yystr,strlen(yystr) ,yyscanner);
+}
+
+/** Setup the input buffer state to scan the given bytes. The next call to fts0blex() will
+ * scan from a @e copy of @a bytes.
+ * @param yybytes the byte buffer to scan
+ * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes.
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE fts0b_scan_bytes  (yyconst char * yybytes, int  _yybytes_len , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE b;
+	char *buf;
+	yy_size_t n;
+	int i;
+
+	/* Get memory for full buffer, including space for trailing EOB's. */
+	n = _yybytes_len + 2;
+	buf = (char*) fts0balloc(n ,yyscanner );
+	if ( ! buf )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0b_scan_bytes()" );
+
+	for ( i = 0; i < _yybytes_len; ++i )
+		buf[i] = yybytes[i];
+
+	buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR;
+
+	b = fts0b_scan_buffer(buf,n ,yyscanner);
+	if ( ! b )
+		YY_FATAL_ERROR( "bad buffer in fts0b_scan_bytes()" );
+
+	/* It's okay to grow etc. this buffer, and we should throw it
+	 * away when we're done.
+	 */
+	b->yy_is_our_buffer = 1;
+
+	return b;
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yy_fatal_error (yyconst char* msg ,  yyscan_t yyscanner __attribute__((unused)))
+{
+    	(void) fprintf( stderr, "%s\n", msg );
+	exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+	do \
+		{ \
+		/* Undo effects of setting up yytext. */ \
+        int yyless_macro_arg = (n); \
+        YY_LESS_LINENO(yyless_macro_arg);\
+		yytext[yyleng] = yyg->yy_hold_char; \
+		yyg->yy_c_buf_p = yytext + yyless_macro_arg; \
+		yyg->yy_hold_char = *yyg->yy_c_buf_p; \
+		*yyg->yy_c_buf_p = '\0'; \
+		yyleng = yyless_macro_arg; \
+		} \
+	while ( 0 )
+
+/* Accessor  methods (get/set functions) to struct members. */
+
+/** Get the user-defined data for this scanner.
+ * @param yyscanner The scanner object.
+ */
+YY_EXTRA_TYPE fts0bget_extra  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yyextra;
+}
+
+/** Get the current line number.
+ * @param yyscanner The scanner object.
+ */
+int fts0bget_lineno  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+        if (! YY_CURRENT_BUFFER)
+            return 0;
+
+    return yylineno;
+}
+
+/** Get the current column number.
+ * @param yyscanner The scanner object.
+ */
+int fts0bget_column  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+        if (! YY_CURRENT_BUFFER)
+            return 0;
+
+    return yycolumn;
+}
+
+/** Get the input stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0bget_in  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yyin;
+}
+
+/** Get the output stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0bget_out  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yyout;
+}
+
+/** Get the length of the current token.
+ * @param yyscanner The scanner object.
+ */
+int fts0bget_leng  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yyleng;
+}
+
+/** Get the current token.
+ * @param yyscanner The scanner object.
+ */
+
+char *fts0bget_text  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yytext;
+}
+
+/** Set the user-defined data. This data is never touched by the scanner.
+ * @param user_defined The data to be associated with this scanner.
+ * @param yyscanner The scanner object.
+ */
+void fts0bset_extra (YY_EXTRA_TYPE  user_defined , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    yyextra = user_defined ;
+}
+
+/** Set the current line number.
+ * @param line_number
+ * @param yyscanner The scanner object.
+ */
+void fts0bset_lineno (int  line_number , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+        /* lineno is only valid if an input buffer exists. */
+        if (! YY_CURRENT_BUFFER )
+           yy_fatal_error( "fts0bset_lineno called with no buffer" , yyscanner);
+
+    yylineno = line_number;
+}
+
+/** Set the current column.
+ * @param line_number
+ * @param yyscanner The scanner object.
+ */
+void fts0bset_column (int  column_no , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+        /* column is only valid if an input buffer exists. */
+        if (! YY_CURRENT_BUFFER )
+           yy_fatal_error( "fts0bset_column called with no buffer" , yyscanner);
+
+    yycolumn = column_no;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param in_str A readable stream.
+ * @param yyscanner The scanner object.
+ * @see fts0b_switch_to_buffer
+ */
+void fts0bset_in (FILE *  in_str , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    yyin = in_str ;
+}
+
+void fts0bset_out (FILE *  out_str , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    yyout = out_str ;
+}
+
+int fts0bget_debug  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yy_flex_debug;
+}
+
+void fts0bset_debug (int  bdebug , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    yy_flex_debug = bdebug ;
+}
+
+/* Accessor methods for yylval and yylloc */
+
+/* User-visible API */
+
+/* fts0blex_init is special because it creates the scanner itself, so it is
+ * the ONLY reentrant function that doesn't take the scanner as the last argument.
+ * That's why we explicitly handle the declaration, instead of using our macros.
+ */
+
+int fts0blex_init(yyscan_t* ptr_yy_globals)
+
+{
+    if (ptr_yy_globals == NULL){
+        errno = EINVAL;
+        return 1;
+    }
+
+    *ptr_yy_globals = (yyscan_t) fts0balloc ( sizeof( struct yyguts_t ), NULL );
+
+    if (*ptr_yy_globals == NULL){
+        errno = ENOMEM;
+        return 1;
+    }
+
+    /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */
+    memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+    return yy_init_globals ( *ptr_yy_globals );
+}
+
+/* fts0blex_init_extra has the same functionality as fts0blex_init, but follows the
+ * convention of taking the scanner as the last argument. Note however, that
+ * this is a *pointer* to a scanner, as it will be allocated by this call (and
+ * is the reason, too, why this function also must handle its own declaration).
+ * The user defined value in the first argument will be available to fts0balloc in
+ * the yyextra field.
+ */
+
+int fts0blex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals )
+
+{
+    struct yyguts_t dummy_yyguts;
+
+    fts0bset_extra (yy_user_defined, &dummy_yyguts);
+
+    if (ptr_yy_globals == NULL){
+        errno = EINVAL;
+        return 1;
+    }
+
+    *ptr_yy_globals = (yyscan_t) fts0balloc ( sizeof( struct yyguts_t ), &dummy_yyguts );
+
+    if (*ptr_yy_globals == NULL){
+        errno = ENOMEM;
+        return 1;
+    }
+
+    /* By setting to 0xAA, we expose bugs in
+    yy_init_globals. Leave at 0x00 for releases. */
+    memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+    fts0bset_extra (yy_user_defined, *ptr_yy_globals);
+
+    return yy_init_globals ( *ptr_yy_globals );
+}
+
+static int yy_init_globals (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    /* Initialization is the same as for the non-reentrant scanner.
+     * This function is called from fts0blex_destroy(), so don't allocate here.
+     */
+
+    yyg->yy_buffer_stack = 0;
+    yyg->yy_buffer_stack_top = 0;
+    yyg->yy_buffer_stack_max = 0;
+    yyg->yy_c_buf_p = (char*) 0;
+    yyg->yy_init = 0;
+    yyg->yy_start = 0;
+
+    yyg->yy_start_stack_ptr = 0;
+    yyg->yy_start_stack_depth = 0;
+    yyg->yy_start_stack =  NULL;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+    yyin = stdin;
+    yyout = stdout;
+#else
+    yyin = (FILE*) 0;
+    yyout = (FILE*) 0;
+#endif
+
+    /* For future reference: Set errno on error, since we are called by
+     * fts0blex_init()
+     */
+    return 0;
+}
+
+/* fts0blex_destroy is for both reentrant and non-reentrant scanners. */
+int fts0blex_destroy  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+    /* Pop the buffer stack, destroying each element. */
+	while(YY_CURRENT_BUFFER){
+		fts0b_delete_buffer(YY_CURRENT_BUFFER ,yyscanner );
+		YY_CURRENT_BUFFER_LVALUE = NULL;
+		fts0bpop_buffer_state(yyscanner);
+	}
+
+	/* Destroy the stack itself. */
+	fts0bfree(yyg->yy_buffer_stack ,yyscanner);
+	yyg->yy_buffer_stack = NULL;
+
+    /* Destroy the start condition stack. */
+        fts0bfree(yyg->yy_start_stack ,yyscanner );
+        yyg->yy_start_stack = NULL;
+
+    /* Reset the globals. This is important in a non-reentrant scanner so the next time
+     * fts0blex() is called, initialization will occur. */
+    yy_init_globals( yyscanner);
+
+    /* Destroy the main struct (reentrant only). */
+    fts0bfree ( yyscanner , yyscanner );
+    yyscanner = NULL;
+    return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n ,  yyscan_t yyscanner __attribute__((unused)))
+{
+	register int i;
+	for ( i = 0; i < n; ++i )
+		s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * s ,  yyscan_t yyscanner __attribute__((unused)))
+{
+	register int n;
+	for ( n = 0; s[n]; ++n )
+		;
+
+	return n;
+}
+#endif
+
+void *fts0balloc (yy_size_t  size ,  yyscan_t yyscanner __attribute__((unused)))
+{
+	return (void*) malloc( size );
+}
+
+void *fts0brealloc  (void * ptr, yy_size_t  size ,  yyscan_t yyscanner __attribute__((unused)))
+{
+	/* The cast to (char*) in the following accommodates both
+	 * implementations that use char* generic pointers, and those
+	 * that use void* generic pointers.  It works with the latter
+	 * because both ANSI C and C++ allow castless assignment from
+	 * any pointer type to void*, and deal with argument conversions
+	 * as though doing an assignment.
+	 */
+	return (void*) realloc( (char*) ptr, size );
+}
+
+void fts0bfree (void * ptr ,  yyscan_t yyscanner __attribute__((unused)))
+{
+	free( (char*) ptr );	/* see fts0brealloc() for (char*) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 73 "fts0blex.l"
+
+
+
diff --git a/storage/innobase/fts/fts0blex.l b/storage/innobase/fts/fts0blex.l
new file mode 100644
index 00000000000..b84b0cea294
--- /dev/null
+++ b/storage/innobase/fts/fts0blex.l
@@ -0,0 +1,73 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0blex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner)
+
+%}
+
+%option noinput
+%option nounput
+%option noyywrap
+%option nostdinit
+%option reentrant
+%option never-interactive
+
+%%
+
+[\t ]+	/* Ignore whitespace */ ;
+
+[*()+\-<>~@]		{
+	val->oper = fts0bget_text(yyscanner)[0];
+
+	return(val->oper);
+}
+
+[0-9]+			{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_NUMB);
+}
+
+[^" \n*()+\-<>~@]*		{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_TERM);
+}
+
+\"[^\"\n]*\"		{
+	val->token = strdup(fts0bget_text(yyscanner));
+
+	return(FTS_TEXT);
+}
+
+\n
+
+%%
diff --git a/storage/innobase/fts/fts0config.cc b/storage/innobase/fts/fts0config.cc
new file mode 100644
index 00000000000..3f849ef183c
--- /dev/null
+++ b/storage/innobase/fts/fts0config.cc
@@ -0,0 +1,562 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0config.cc
+Full Text Search configuration table.
+
+Created 2007/5/9 Sunny Bains
+***********************************************************************/
+
+#include "trx0roll.h"
+#include "row0sel.h"
+
+#include "fts0priv.h"
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+/******************************************************************//**
+Callback function for fetching the config value.
+@return always returns TRUE */
+static
+ibool
+fts_config_fetch_value(
+/*===================*/
+	void*		row,			/*!< in: sel_node_t* */
+	void*		user_arg)		/*!< in: pointer to
+						 ib_vector_t */
+{
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	fts_string_t*	value = static_cast<fts_string_t*>(user_arg);
+
+	dfield_t*	dfield = que_node_get_val(node->select_list);
+	dtype_t*	type = dfield_get_type(dfield);
+	ulint		len = dfield_get_len(dfield);
+	void*		data = dfield_get_data(dfield);
+
+	ut_a(dtype_get_mtype(type) == DATA_VARCHAR);
+
+	if (len != UNIV_SQL_NULL) {
+		ulint	max_len = ut_min(value->f_len - 1, len);
+
+		memcpy(value->f_str, data, max_len);
+		value->f_len = max_len;
+		value->f_str[value->f_len] = '\0';
+	}
+
+	return(TRUE);
+}
+
+/******************************************************************//**
+Get value from the config table. The caller must ensure that enough
+space is allocated for value to hold the column contents.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_get_value(
+/*=================*/
+	trx_t*		trx,			/*!< transaction */
+	fts_table_t*	fts_table,		/*!< in: the indexed
+						FTS table */
+	const char*	name,			/*!< in: get config value for
+						this parameter name */
+	fts_string_t*	value)			/*!< out: value read from
+						config table */
+{
+	pars_info_t*	info;
+	que_t*		graph;
+	ulint		error;
+	ulint		name_len = strlen(name);
+
+	info = pars_info_create();
+
+	*value->f_str = '\0';
+	ut_a(value->f_len > 0);
+
+	pars_info_bind_function(info, "my_func", fts_config_fetch_value,
+				value);
+
+	/* The len field of value must be set to the max bytes that
+	it can hold. On a successful read, the len field will be set
+	to the actual number of bytes copied to value. */
+	pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len);
+
+	fts_table->suffix = "CONFIG";
+
+	graph = fts_parse_sql(
+		fts_table,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS SELECT value FROM %s"
+		" WHERE key = :name;\n"
+		"BEGIN\n"
+		""
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	trx->op_info = "getting FTS config value";
+
+	error = fts_eval_sql(trx, graph);
+
+	mutex_enter(&dict_sys->mutex);
+	que_graph_free(graph);
+	mutex_exit(&dict_sys->mutex);
+
+	return(error);
+}
+
+/*********************************************************************//**
+Create the config table name for retrieving index specific value.
+@return index config parameter name */
+UNIV_INTERN
+char*
+fts_config_create_index_param_name(
+/*===============================*/
+	const char*		param,		/*!< in: base name of param */
+	const dict_index_t*	index)		/*!< in: index for config */
+{
+	ulint		len;
+	char*		name;
+
+	/* The format of the config name is: name_<index_id>. */
+	len = strlen(param);
+
+	/* Caller is responsible for deleting name. */
+	name = static_cast<char*>(ut_malloc(
+		len + FTS_AUX_MIN_TABLE_ID_LENGTH + 2));
+	strcpy(name, param);
+	name[len] = '_';
+
+	fts_write_object_id(index->id, name + len + 1);
+
+	return(name);
+}
+
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_get_index_value(
+/*=======================*/
+	trx_t*		trx,			/*!< transaction */
+	dict_index_t*	index,			/*!< in: index */
+	const char*	param,			/*!< in: get config value for
+						this parameter name */
+	fts_string_t*	value)			/*!< out: value read from
+						config table */
+{
+	char*		name;
+	ulint		error;
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE,
+			   index->table);
+
+	/* We are responsible for free'ing name. */
+	name = fts_config_create_index_param_name(param, index);
+
+	error = fts_config_get_value(trx, &fts_table, name, value);
+
+	ut_free(name);
+
+	return(error);
+}
+
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_set_value(
+/*=================*/
+	trx_t*		trx,			/*!< transaction */
+	fts_table_t*	fts_table,		/*!< in: the indexed
+						FTS table */
+	const char*	name,			/*!< in: get config value for
+						this parameter name */
+	const fts_string_t*
+			value)			/*!< in: value to update */
+{
+	pars_info_t*	info;
+	que_t*		graph;
+	ulint		error;
+	undo_no_t	undo_no;
+	undo_no_t	n_rows_updated;
+	ulint		name_len = strlen(name);
+
+	info = pars_info_create();
+
+	pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len);
+	pars_info_bind_varchar_literal(info, "value",
+				       value->f_str, value->f_len);
+
+	fts_table->suffix = "CONFIG";
+
+	graph = fts_parse_sql(
+		fts_table, info,
+		"BEGIN UPDATE %s SET value = :value WHERE key = :name;");
+
+	trx->op_info = "setting FTS config value";
+
+	undo_no = trx->undo_no;
+
+	error = fts_eval_sql(trx, graph);
+
+	fts_que_graph_free_check_lock(fts_table, NULL, graph);
+
+	n_rows_updated = trx->undo_no - undo_no;
+
+	/* Check if we need to do an insert. */
+	if (n_rows_updated == 0) {
+		info = pars_info_create();
+
+		pars_info_bind_varchar_literal(
+			info, "name", (byte*) name, name_len);
+
+		pars_info_bind_varchar_literal(
+			info, "value", value->f_str, value->f_len);
+
+		graph = fts_parse_sql(
+			fts_table, info,
+			"BEGIN\n"
+			"INSERT INTO %s VALUES(:name, :value);");
+
+		trx->op_info = "inserting FTS config value";
+
+		error = fts_eval_sql(trx, graph);
+
+		fts_que_graph_free_check_lock(fts_table, NULL, graph);
+	}
+
+	return(error);
+}
+
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_set_index_value(
+/*=======================*/
+	trx_t*		trx,			/*!< transaction */
+	dict_index_t*	index,			/*!< in: index */
+	const char*	param,			/*!< in: get config value for
+						this parameter name */
+	fts_string_t*	value)			/*!< out: value read from
+						config table */
+{
+	char*		name;
+	ulint		error;
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE,
+			   index->table);
+
+	/* We are responsible for free'ing name. */
+	name = fts_config_create_index_param_name(param, index);
+
+	error = fts_config_set_value(trx, &fts_table, name, value);
+
+	ut_free(name);
+
+	return(error);
+}
+
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+ulint
+fts_config_get_index_ulint(
+/*=======================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	const char*	name,			/*!< in: param name */
+	ulint*		int_value)		/*!< out: value */
+{
+	ulint		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+	error = fts_config_get_index_value(trx, index, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "  InnoDB: Error: (%lu) reading `%s'\n",
+			error, name);
+	} else {
+		*int_value = strtoul((char*) value.f_str, NULL, 10);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+ulint
+fts_config_set_index_ulint(
+/*=======================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	const char*	name,			/*!< in: param name */
+	ulint		int_value)		/*!< in: value */
+{
+	ulint		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+	// FIXME: Get rid of snprintf
+	ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+	value.f_len = ut_snprintf(
+		(char*) value.f_str, FTS_MAX_INT_LEN, "%lu", int_value);
+
+	error = fts_config_set_index_value(trx, index, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "  InnoDB: Error: (%lu) writing `%s'\n",
+			error, name);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+ulint
+fts_config_get_ulint(
+/*=================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table,		/*!< in: the indexed
+						FTS table */
+	const char*	name,			/*!< in: param name */
+	ulint*		int_value)		/*!< out: value */
+{
+	ulint		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+	error = fts_config_get_value(trx, fts_table, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "  InnoDB: Error: (%lu) reading `%s'\n",
+			error, name);
+	} else {
+		*int_value = strtoul((char*) value.f_str, NULL, 10);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+ulint
+fts_config_set_ulint(
+/*=================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table,		/*!< in: the indexed
+						FTS table */
+	const char*	name,			/*!< in: param name */
+	ulint		int_value)		/*!< in: value */
+{
+	ulint		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+	// FIXME: Get rid of snprintf
+	ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+	value.f_len = snprintf(
+		(char*) value.f_str, FTS_MAX_INT_LEN, "%lu", int_value);
+
+	error = fts_config_set_value(trx, fts_table, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "  InnoDB: Error: (%lu) writing `%s'\n",
+			error, name);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
+/******************************************************************//**
+Increment the value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_increment_value(
+/*=======================*/
+	trx_t*		trx,			/*!< transaction */
+	fts_table_t*	fts_table,		/*!< in: the indexed
+						FTS table */
+	const char*	name,			/*!< in: increment config value
+						for this parameter name */
+	ulint		delta)			/*!< in: increment by this
+						much */
+{
+	ulint		error;
+	fts_string_t	value;
+	que_t*		graph = NULL;
+	ulint		name_len = strlen(name);
+	pars_info_t*	info = pars_info_create();
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+	*value.f_str = '\0';
+
+	pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len);
+
+	pars_info_bind_function(
+		info, "my_func", fts_config_fetch_value, &value);
+
+	fts_table->suffix = "CONFIG";
+
+	graph = fts_parse_sql(
+		fts_table, info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS SELECT value FROM %s"
+		" WHERE key = :name FOR UPDATE;\n"
+		"BEGIN\n"
+		""
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	trx->op_info = "read  FTS config value";
+
+	error = fts_eval_sql(trx, graph);
+
+	fts_que_graph_free_check_lock(fts_table, NULL, graph);
+
+	if (UNIV_UNLIKELY(error == DB_SUCCESS)) {
+		ulint		int_value;
+
+		int_value = strtoul((char*) value.f_str, NULL, 10);
+
+		int_value += delta;
+
+		ut_a(FTS_MAX_CONFIG_VALUE_LEN > FTS_MAX_INT_LEN);
+
+		// FIXME: Get rid of snprintf
+		value.f_len = snprintf(
+			(char*) value.f_str, FTS_MAX_INT_LEN, "%lu", int_value);
+
+		fts_config_set_value(trx, fts_table, name, &value);
+	}
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "  InnoDB: Error: (%lu) "
+			"while incrementing %s.\n", error, name);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
+/******************************************************************//**
+Increment the per index value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_increment_index_value(
+/*=============================*/
+	trx_t*		trx,			/*!< transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	const char*	param,			/*!< in: increment config value
+						for this parameter name */
+	ulint		delta)			/*!< in: increment by this
+						much */
+{
+	char*		name;
+	ulint		error;
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE,
+			   index->table);
+
+	/* We are responsible for free'ing name. */
+	name = fts_config_create_index_param_name(param, index);
+
+	error = fts_config_increment_value(trx, &fts_table, name, delta);
+
+	ut_free(name);
+
+	return(error);
+}
+
diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc
new file mode 100644
index 00000000000..f716b980501
--- /dev/null
+++ b/storage/innobase/fts/fts0fts.cc
@@ -0,0 +1,6287 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0fts.cc
+Full Text Search interface
+***********************************************************************/
+
+#include "trx0roll.h"
+#include "row0mysql.h"
+#include "row0upd.h"
+#include "dict0types.h"
+#include "row0sel.h"
+
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#include "dict0priv.h"
+#include "dict0stats.h"
+#include "btr0pcur.h"
+#include "row0row.h"
+#include "ha_prototypes.h"
+
+#ifdef UNIV_NONINL
+#include "fts0priv.ic"
+#endif
+
+#define FTS_MAX_ID_LEN	32
+
+/** Column name from the FTS config table */
+#define FTS_MAX_CACHE_SIZE_IN_MB	"cache_size_in_mb"
+
+/** This is maximum FTS cache for each table and would be
+a configurable variable */
+UNIV_INTERN ulong	fts_max_cache_size;
+
+/** Variable specifying the maximum FTS max token size */
+UNIV_INTERN ulong	fts_max_token_size;
+
+/** Variable specifying the minimum FTS max token size */
+UNIV_INTERN ulong	fts_min_token_size;
+
+
+// FIXME: testing
+ib_time_t elapsed_time = 0;
+ulint n_nodes = 0;
+
+typedef struct fts_schema_struct fts_schema_t;
+typedef struct fts_sys_table_struct fts_sys_table_t;
+
+/** Error condition reported by fts_utf8_decode() */
+const ulint UTF8_ERROR = 0xFFFFFFFF;
+
+/** The cache size permissible lower limit (1K) */
+static const ulint FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB = 1;
+
+/** The cache size permissible upper limit (1G) */
+static const ulint FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB = 1024;
+
+/** Time to sleep after DEADLOCK error before retrying operation. */
+static const ulint FTS_DEADLOCK_RETRY_WAIT = 100000;
+
+#ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t	fts_cache_rw_lock_key;
+UNIV_INTERN mysql_pfs_key_t	fts_cache_init_rw_lock_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t	fts_delete_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	fts_optimize_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	fts_bg_threads_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	fts_doc_id_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** variable to record innodb_fts_internal_tbl_name for information
+schema table INNODB_FTS_INSERTED etc. */
+UNIV_INTERN char* fts_internal_tbl_name		= NULL;
+
+/** InnoDB default stopword list:
+There are different versions of stopwords, the stop words listed
+below comes from "Google Stopword" list. Reference:
+http://meta.wikimedia.org/wiki/Stop_word_list/google_stop_word_list.
+The final version of InnoDB default stopword list is still pending
+for decision */
+const char *fts_default_stopword[] =
+{
+	"a",
+	"about",
+	"an",
+	"are",
+	"as",
+	"at",
+	"be",
+	"by",
+	"com",
+	"de",
+	"en",
+	"for",
+	"from",
+	"how",
+	"i",
+	"in",
+	"is",
+	"it",
+	"la",
+	"of",
+	"on",
+	"or",
+	"that",
+	"the",
+	"this",
+	"to",
+	"was",
+	"what",
+	"when",
+	"where",
+	"who",
+	"will",
+	"with",
+	"und",
+	"the",
+	"www",
+	NULL
+};
+
+/** For storing table info when checking for orphaned tables. */
+struct fts_sys_table_struct {
+	table_id_t	id;		/*!< Table id */
+	table_id_t	parent_id;	/*!< Parent table id */
+	table_id_t	index_id;	/*!< Table FT index id */
+	char*		name;		/*!< Name of the table */
+};
+
+/** SQL statements for creating the ancillary common FTS tables. */
+static const char* fts_create_common_tables_sql = {
+	"BEGIN\n"
+	""
+	"CREATE TABLE %s_ADDED (\n"
+	"  doc_id BIGINT UNSIGNED\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND ON %s_ADDED(doc_id);\n"
+	""
+	"CREATE TABLE %s_DELETED (\n"
+	"  doc_id BIGINT UNSIGNED\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND ON %s_DELETED(doc_id);\n"
+	""
+	"CREATE TABLE %s_DELETED_CACHE (\n"
+	"  doc_id BIGINT UNSIGNED\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND "
+		"ON %s_DELETED_CACHE(doc_id);\n"
+	""
+	"CREATE TABLE %s_BEING_DELETED (\n"
+	"  doc_id BIGINT UNSIGNED\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND "
+		"ON %s_BEING_DELETED(doc_id);\n"
+	""
+	"CREATE TABLE %s_BEING_DELETED_CACHE (\n"
+	"  doc_id BIGINT UNSIGNED\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND "
+		"ON %s_BEING_DELETED_CACHE(doc_id);\n"
+	""
+	"CREATE TABLE %s_CONFIG (\n"
+	"  key CHAR(50),\n"
+	"  value CHAR(50) NOT NULL\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND ON %s_CONFIG(key);\n"
+	""
+	"CREATE TABLE %s_STOPWORDS (\n"
+	"  word CHAR\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND ON %s_STOPWORDS(word);\n",
+};
+
+/** Template for creating the FTS auxiliary index specific tables. */
+static const char* fts_create_index_tables_sql = {
+	"BEGIN\n"
+	""
+	"CREATE TABLE %s_DOC_ID (\n"
+	"   doc_id BIGINT UNSIGNED,\n"
+	"   word_count INTEGER UNSIGNED NOT NULL\n"
+	") COMPACT;\n"
+	"CREATE UNIQUE CLUSTERED INDEX IND ON %s_DOC_ID(doc_id);\n"
+};
+
+/** Template for creating the ancillary FTS tables word index tables. */
+static const char* fts_create_index_sql = {
+	"BEGIN\n"
+	""
+	"CREATE UNIQUE CLUSTERED INDEX FTS_INDEX_TABLE_IND "
+		"ON %s(word, first_doc_id);\n"
+};
+
+/** FTS auxiliary table suffixes that are common to all FT indexes. */
+static const char* fts_common_tables[] = {
+	"ADDED",
+	"BEING_DELETED",
+	"BEING_DELETED_CACHE",
+	"CONFIG",
+	"DELETED",
+	"DELETED_CACHE",
+	"STOPWORDS",
+	NULL
+};
+
+/** FTS auxiliary INDEX split intervals. */
+const  fts_index_selector_t fts_index_selector[] = {
+	{ 9, "INDEX_1" },
+	{ 65, "INDEX_2" },
+	{ 70, "INDEX_3" },
+	{ 75, "INDEX_4" },
+	{ 80, "INDEX_5" },
+	{ 85, "INDEX_6" },
+	{  0 , NULL	 }
+};
+
+/** Default config values for FTS indexes on a table. */
+static const char* fts_config_table_insert_values_sql =
+	"BEGIN\n"
+	"\n"
+	"INSERT INTO %s VALUES('"
+		FTS_MAX_CACHE_SIZE_IN_MB "', '256');\n"
+	""
+	"INSERT INTO %s VALUES('"
+		FTS_OPTIMIZE_LIMIT_IN_SECS  "', '180');\n"
+	""
+	"INSERT INTO %s VALUES ('"
+		FTS_SYNCED_DOC_ID "', '1');\n"
+	""
+	"INSERT INTO %s VALUES ('"
+		FTS_TOTAL_DELETED_COUNT "', '0');\n"
+	"" /* Note: 0 == FTS_TABLE_STATE_RUNNING */
+	"INSERT INTO %s VALUES ('"
+		FTS_TABLE_STATE "', '0');\n";
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_sync(
+/*=====*/
+	fts_sync_t*	sync);		/*!< in: sync state */
+
+/****************************************************************//**
+Release all resources help by the words rb tree e.g., the node ilist. */
+static
+void
+fts_words_free(
+/*===========*/
+	ib_rbt_t*	words);		/*!< in: rb tree of words */
+#ifdef FTS_CACHE_SIZE_DEBUG
+/****************************************************************//**
+Read the max cache size parameter from the config table. */
+static
+void
+fts_update_max_cache_size(
+/*======================*/
+	fts_sync_t*	sync);		/*!< in: sync state */
+#endif
+
+/*********************************************************************//**
+This function fetches the document just inserted right before
+we commit the transaction, and tokenize the inserted text data
+and insert into FTS auxiliary table and its cache.
+@return TRUE if successful */
+static
+ulint
+fts_add_doc_by_id(
+/*==============*/
+	fts_trx_table_t*ftt,		/*!< in: FTS trx table */
+	doc_id_t	doc_id,		/*!< in: doc id */
+	ib_vector_t*	fts_indexes __attribute__((unused)));
+					/*!< in: affected fts indexes */
+/****************************************************************//**
+Check whether a particular word (term) exists in the FTS index.
+@return DB_SUCCESS if all went fine */
+static
+ulint
+fts_is_word_in_index(
+/*=================*/
+	trx_t*		trx,		/*!< in: FTS query state */
+	que_t**		graph,		/*!< out: Query graph */
+	fts_table_t*	fts_table,	/*!< in: table instance */
+	const fts_string_t* word,	/*!< in: the word to check */
+	ibool*		found);		/*!< out: TRUE if exists */
+
+/********************************************************************
+Check if we should stop. */
+UNIV_INLINE
+ibool
+fts_is_stop_signalled(
+/*==================*/
+	fts_t*		fts)			/*!< in: fts instance */
+{
+	ibool		stop_signalled = FALSE;
+
+	mutex_enter(&fts->bg_threads_mutex);
+
+	if (fts->fts_status & BG_THREAD_STOP) {
+
+		stop_signalled = TRUE;
+	}
+
+	mutex_exit(&fts->bg_threads_mutex);
+
+	return(stop_signalled);
+}
+
+/****************************************************************//**
+This function loads the default InnoDB stopword list */
+static
+void
+fts_load_default_stopword(
+/*======================*/
+	fts_stopword_t*		stopword_info)	/*!< in: stopword info */
+{
+	fts_string_t		str;
+	mem_heap_t*		heap;
+	ib_alloc_t*		allocator;
+	ib_rbt_t*		stop_words;
+
+	allocator = stopword_info->heap;
+	heap = static_cast<mem_heap_t*>(allocator->arg);
+	stop_words = stopword_info->cached_stopword;
+
+	str.f_n_char = 0;
+
+	for (ulint i = 0; fts_default_stopword[i]; ++i) {
+		char*			word;
+		fts_tokenizer_word_t	new_word;
+
+		/* We are going to duplicate the value below. */
+		word = const_cast<char*>(fts_default_stopword[i]);
+
+		new_word.nodes = ib_vector_create(
+			allocator, sizeof(fts_node_t), 4);
+
+		str.f_len = ut_strlen(word);
+		str.f_str = reinterpret_cast<byte*>(word);
+
+		fts_utf8_string_dup(&new_word.text, &str, heap);
+
+		rbt_insert(stop_words, &new_word, &new_word);
+	}
+
+	stopword_info->status = STOPWORD_FROM_DEFAULT;
+}
+
+/****************************************************************//**
+Callback function to read a single stopword value.
+@return Always return TRUE */
+static
+ibool
+fts_read_stopword(
+/*==============*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to ib_vector_t */
+{
+	ib_alloc_t*	allocator;
+	fts_stopword_t*	stopword_info;
+	sel_node_t*	sel_node;
+	que_node_t*	exp;
+	ib_rbt_t*	stop_words;
+	dfield_t*	dfield;
+	fts_string_t	str;
+	mem_heap_t*	heap;
+	ib_rbt_bound_t	parent;
+
+	sel_node = static_cast<sel_node_t*>(row);
+	stopword_info = static_cast<fts_stopword_t*>(user_arg);
+
+	stop_words = stopword_info->cached_stopword;
+	allocator =  static_cast<ib_alloc_t*>(stopword_info->heap);
+	heap = static_cast<mem_heap_t*>(allocator->arg);
+
+	exp = sel_node->select_list;
+
+	/* We only need to read the first column */
+	dfield = que_node_get_val(exp);
+
+	str.f_n_char = 0;
+	str.f_str = static_cast<byte*>(dfield_get_data(dfield));
+	str.f_len = dfield_get_len(dfield);
+
+	/* Only create new node if it is a value not already existed */
+	if (str.f_len != UNIV_SQL_NULL
+	    && rbt_search(stop_words, &parent, &str) != 0) {
+
+		fts_tokenizer_word_t	new_word;
+
+		new_word.nodes = ib_vector_create(
+			allocator, sizeof(fts_node_t), 4);
+
+		new_word.text.f_str = static_cast<byte*>(
+			 mem_heap_alloc(heap, str.f_len + 1));
+
+		memcpy(new_word.text.f_str, str.f_str, str.f_len);
+
+		new_word.text.f_n_char = 0;
+		new_word.text.f_len = str.f_len;
+		new_word.text.f_str[str.f_len] = 0;
+
+		rbt_insert(stop_words, &new_word, &new_word);
+	}
+
+	return(TRUE);
+}
+
+/******************************************************************//**
+Load user defined stopword from designated user table
+@return TRUE if load operation is successful */
+static
+ibool
+fts_load_user_stopword(
+/*===================*/
+	fts_t*		fts,			/*!< in: FTS struct */
+	const char*	stopword_table_name,	/*!< in: Stopword table
+						name */
+	fts_stopword_t*	stopword_info)		/*!< in: Stopword info */
+{
+	pars_info_t*	info;
+	que_t*		graph;
+	ulint		error = DB_SUCCESS;
+	ibool		ret = TRUE;
+	trx_t*		trx;
+	ibool		has_lock = fts->fts_status & TABLE_DICT_LOCKED;
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "Load user stopword table into FTS cache";
+
+	if (!has_lock) {
+		mutex_enter(&dict_sys->mutex);
+	}
+
+	/* Validate the user table existence and in the right
+	format */
+	if (!fts_valid_stopword_table(stopword_table_name)) {
+		ret = FALSE;
+		goto cleanup;
+	}
+
+	info = pars_info_create();
+
+	pars_info_bind_id(info, TRUE, "table_stopword", stopword_table_name);
+
+	pars_info_bind_function(info, "my_func", fts_read_stopword,
+				stopword_info);
+
+	graph = fts_parse_sql_no_dict_lock(
+		NULL,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT value "
+		" FROM $table_stopword;\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	for (;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+			stopword_info->status = STOPWORD_USER_TABLE;
+			break;
+		} else {
+
+			fts_sql_rollback(trx);
+
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, "  InnoDB: Warning: lock wait "
+					"timeout reading user stopword table. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, "  InnoDB: Error: %lu "
+					"while reading user stopword table.\n",
+					error);
+				ret = FALSE;
+				break;
+			}
+		}
+	}
+
+	que_graph_free(graph);
+
+cleanup:
+	if (!has_lock) {
+		mutex_exit(&dict_sys->mutex);
+	}
+
+	trx_free_for_background(trx);
+	return(ret);
+}
+
+/******************************************************************//**
+Initialize the index cache. */
+static
+void
+fts_index_cache_init(
+/*=================*/
+	ib_alloc_t*		allocator,	/*!< in: the allocator to use */
+	fts_index_cache_t*	index_cache)	/*!< in: index cache */
+{
+	ulint			i;
+
+	ut_a(index_cache->words == NULL);
+
+	index_cache->words = rbt_create_arg_cmp(
+		sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp,
+		index_cache->charset);
+
+	ut_a(index_cache->doc_stats == NULL);
+
+	index_cache->doc_stats = ib_vector_create(
+		allocator, sizeof(fts_doc_stats_t), 4);
+
+	for (i = 0; fts_index_selector[i].value; ++i) {
+		ut_a(index_cache->ins_graph[i] == NULL);
+		ut_a(index_cache->sel_graph[i] == NULL);
+	}
+}
+
+/*********************************************************************//**
+Initialize FTS cache. */
+UNIV_INTERN
+void
+fts_cache_init(
+/*===========*/
+	fts_cache_t*	cache)		/*!< in: cache to initialize */
+{
+	ulint		i;
+
+	/* Just to make sure */
+	ut_a(cache->sync_heap->arg == NULL);
+
+	cache->sync_heap->arg = mem_heap_create(1024);
+
+	cache->total_size = 0;
+
+	cache->deleted_doc_ids = ib_vector_create(
+		cache->sync_heap, sizeof(fts_update_t), 4);
+
+	/* Reset the cache data for all the FTS indexes. */
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		fts_index_cache_init(cache->sync_heap, index_cache);
+	}
+}
+
+/****************************************************************//**
+Create a FTS cache. */
+UNIV_INTERN
+fts_cache_t*
+fts_cache_create(
+/*=============*/
+	dict_table_t*	table)	/*!< in: table owns the FTS cache */
+{
+	mem_heap_t*	heap;
+	fts_cache_t*	cache;
+
+	heap = static_cast<mem_heap_t*>(mem_heap_create(512));
+
+	cache = static_cast<fts_cache_t*>(
+		mem_heap_zalloc(heap, sizeof(*cache)));
+
+	cache->cache_heap = heap;
+
+	rw_lock_create(fts_cache_rw_lock_key, &cache->lock, SYNC_FTS_CACHE);
+
+	rw_lock_create(
+		fts_cache_init_rw_lock_key, &cache->init_lock,
+		SYNC_FTS_CACHE_INIT);
+
+	mutex_create(
+		fts_delete_mutex_key, &cache->deleted_lock, SYNC_FTS_OPTIMIZE);
+
+	mutex_create(
+		fts_optimize_mutex_key, &cache->optimize_lock,
+		SYNC_FTS_OPTIMIZE);
+
+	mutex_create(
+		fts_doc_id_mutex_key, &cache->doc_id_lock, SYNC_FTS_OPTIMIZE);
+
+	/* This is the heap used to create the cache itself. */
+	cache->self_heap = ib_heap_allocator_create(heap);
+
+	/* This is a transient heap, used for storing sync data. */
+	cache->sync_heap = ib_heap_allocator_create(heap);
+	cache->sync_heap->arg = NULL;
+
+	cache->sync = static_cast<fts_sync_t*>(
+		mem_heap_zalloc(heap, sizeof(fts_sync_t)));
+
+	cache->sync->table = table;
+
+	/* Create the index cache vector that will hold the inverted indexes. */
+	cache->indexes = ib_vector_create(
+		cache->self_heap, sizeof(fts_index_cache_t), 2);
+
+	fts_cache_init(cache);
+
+	/* Create stopword RB tree. The stopword tree will
+	remain in cache for the duration of FTS cache's lifetime */
+	cache->stopword_info.cached_stopword = rbt_create(
+		sizeof(fts_tokenizer_word_t), fts_utf8_string_cmp);
+
+	cache->stopword_info.heap = cache->self_heap;
+
+	cache->stopword_info.status = STOPWORD_NOT_INIT;
+
+	return(cache);
+}
+
+/*******************************************************************//**
+Add a newly create index into FTS cache */
+UNIV_INTERN
+void
+fts_add_index(
+/*==========*/
+	dict_index_t*	index,		/*!< FTS index to be added */
+	dict_table_t*	table)		/*!< table */
+{
+	fts_t*			fts = table->fts;
+	fts_cache_t*		cache;
+	fts_index_cache_t*	index_cache;
+
+	ut_ad(fts);
+	cache = table->fts->cache;
+
+	rw_lock_x_lock(&cache->init_lock);
+
+	ib_vector_push(fts->indexes, &index);
+
+	index_cache = (fts_index_cache_t*) fts_find_index_cache(cache, index);
+
+	if (!index_cache) {
+		/* Add new index cache structure */
+		index_cache = fts_cache_index_cache_create(table, index);
+	}
+
+	rw_lock_x_unlock(&cache->init_lock);
+}
+
+/*******************************************************************//**
+recalibrate get_doc structure after index_cache in cache->indexes changed */
+static
+void
+fts_reset_get_doc(
+/*==============*/
+	fts_cache_t*	cache)	/*!< in: FTS index cache */
+{
+	fts_get_doc_t*  get_doc;
+	ulint		i;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX));
+#endif
+	ib_vector_reset(cache->get_docs);
+
+	for (i = 0; i < ib_vector_size(cache->indexes); i++) {
+		fts_index_cache_t*	ind_cache;
+
+		ind_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_push(cache->get_docs, NULL));
+
+		memset(get_doc, 0x0, sizeof(*get_doc));
+
+		get_doc->index_cache = ind_cache;
+	}
+
+	ut_ad(ib_vector_size(cache->get_docs)
+	      == ib_vector_size(cache->indexes));
+}
+
+/*******************************************************************//**
+Check an index is in the table->indexes list
+@return TRUE if it exists */
+static
+ibool
+fts_in_dict_index(
+/*==============*/
+	dict_table_t*	table,		/*!< in: Table */
+	dict_index_t*	index_check)	/*!< in: index to be checked */
+{
+	dict_index_t*	index;
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (index == index_check) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Check an index is in the fts->cache->indexes list
+@return TRUE if it exists */
+static
+ibool
+fts_in_index_cache(
+/*===============*/
+	dict_table_t*	table,	/*!< in: Table */
+	dict_index_t*	index)	/*!< in: index to be checked */
+{
+	ulint	i;
+
+	for (i = 0; i < ib_vector_size(table->fts->cache->indexes); i++) {
+		fts_index_cache_t*      index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(table->fts->cache->indexes, i));
+
+		if (index_cache->index == index) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Check indexes in the fts->indexes is also present in index cache and
+table->indexes list
+@return TRUE if all indexes match */
+UNIV_INTERN
+ibool
+fts_check_cached_index(
+/*===================*/
+	dict_table_t*	table)	/*!< in: Table where indexes are dropped */
+{
+	ulint	i;
+
+	if (!table->fts || !table->fts->cache) {
+		return(TRUE);
+	}
+
+	ut_a(ib_vector_size(table->fts->indexes)
+	      == ib_vector_size(table->fts->cache->indexes));
+
+	for (i = 0; i < ib_vector_size(table->fts->indexes); i++) {
+		dict_index_t*	index;
+
+		index = static_cast<dict_index_t*>(
+			ib_vector_getp(table->fts->indexes, i));
+
+		if (!fts_in_index_cache(table, index)) {
+			return(FALSE);
+		}
+
+		if (!fts_in_dict_index(table, index)) {
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fts_drop_index(
+/*===========*/
+	dict_table_t*	table,	/*!< in: Table where indexes are dropped */
+	dict_index_t*	index,	/*!< in: Index to be dropped */
+	trx_t*		trx)	/*!< in: Transaction for the drop */
+{
+	ib_vector_t*	indexes = table->fts->indexes;
+	ulint		err = DB_SUCCESS;
+
+	ut_a(indexes);
+
+	if ((ib_vector_size(indexes) == 1
+	    && (index == static_cast<dict_index_t*>(
+			ib_vector_getp(table->fts->indexes, 0))))
+	   || ib_vector_is_empty(indexes)) {
+
+		/* If we are dropping the only FTS index of the table,
+		remove it from optimize thread */
+		fts_optimize_remove_table(table);
+
+		DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS);
+
+		/* If Doc ID column is not added internally by FTS index,
+		we can drop all FTS auxiliary tables. Otherwise, we will
+		need to keep some common table such as CONFIG table, so
+		as to keep track of incrementing Doc IDs */
+		if (!DICT_TF2_FLAG_IS_SET(
+			table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+			err = fts_drop_tables(trx, table);
+
+			err = fts_drop_index_tables(trx, index);
+
+			fts_free(table);
+
+			return(err);
+		}
+
+		fts_cache_clear(table->fts->cache, TRUE);
+		fts_cache_destroy(table->fts->cache);
+		table->fts->cache = fts_cache_create(table);
+	} else {
+		fts_cache_t*            cache = table->fts->cache;
+		fts_index_cache_t*      index_cache;
+
+		rw_lock_x_lock(&cache->init_lock);
+
+		index_cache = (fts_index_cache_t*) fts_find_index_cache(
+			cache, index);
+
+		if (index_cache->words) {
+			fts_words_free(index_cache->words);
+			rbt_free(index_cache->words);
+		}
+
+		ib_vector_remove(cache->indexes, *(void**) index_cache);
+
+		if (cache->get_docs) {
+			fts_reset_get_doc(cache);
+		}
+
+		rw_lock_x_unlock(&cache->init_lock);
+	}
+
+	err = fts_drop_index_tables(trx, index);
+
+	ib_vector_remove(indexes, (const void*) index);
+
+	return(err);
+}
+
+/****************************************************************//**
+Free the query graph but check whether dict_sys->mutex is already
+held */
+UNIV_INTERN
+void
+fts_que_graph_free_check_lock(
+/*==========================*/
+	fts_table_t*		fts_table,	/*!< in: FTS table */
+	const fts_index_cache_t*index_cache,	/*!< in: FTS index cache */
+	que_t*			graph)		/*!< in: query graph */
+{
+	ibool	has_dict = FALSE;
+
+	if (fts_table && fts_table->table) {
+		ut_ad(fts_table->table->fts);
+
+		has_dict = fts_table->table->fts->fts_status
+			 & TABLE_DICT_LOCKED;
+	} else if (index_cache) {
+		ut_ad(index_cache->index->table->fts);
+
+		has_dict = index_cache->index->table->fts->fts_status
+			 & TABLE_DICT_LOCKED;
+	}
+
+	if (!has_dict) {
+		mutex_enter(&dict_sys->mutex);
+	}
+
+	que_graph_free(graph);
+
+	if (!has_dict) {
+		mutex_exit(&dict_sys->mutex);
+	}
+}
+
+/****************************************************************//**
+Create an FTS index cache. */
+UNIV_INTERN
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+	dict_index_t*		index)		/*!< in: FTS index */
+{
+	CHARSET_INFO*		charset = NULL;
+	dict_field_t*		field;
+	ulint			prtype;
+
+	field = dict_index_get_nth_field(index, 0);
+	prtype = field->col->prtype;
+
+	charset = innobase_get_fts_charset(
+		(int) (prtype & DATA_MYSQL_TYPE_MASK),
+		(uint) dtype_get_charset_coll(prtype));
+
+#ifdef FTS_DEBUG
+	/* Set up charset info for this index. Please note all
+	field of the FTS index should have the same charset */
+	for (i = 1; i < index->n_fields; i++) {
+		CHARSET_INFO*   fld_charset;
+
+		field = dict_index_get_nth_field(index, i);
+		prtype = field->col->prtype;
+
+		fld_charset = innobase_get_fts_charset(
+			(int)(prtype & DATA_MYSQL_TYPE_MASK),
+			(uint) dtype_get_charset_coll(prtype));
+
+		/* All FTS columns should have the same charset */
+		if (charset) {
+			ut_a(charset == fld_charset);
+		} else {
+			charset = fld_charset;
+		}
+	}
+#endif
+
+	return(charset);
+
+}
+/****************************************************************//**
+Create an FTS index cache.
+@return Index Cache */
+UNIV_INTERN
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+	dict_table_t*		table,		/*!< in: table with FTS index */
+	dict_index_t*		index)		/*!< in: FTS index */
+{
+	ulint			n_bytes;
+	fts_index_cache_t*	index_cache;
+	fts_cache_t*		cache = table->fts->cache;
+
+	ut_a(cache != NULL);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX));
+#endif
+
+	/* Must not already exist in the cache vector. */
+	ut_a(fts_find_index_cache(cache, index) == NULL);
+
+	index_cache = static_cast<fts_index_cache_t*>(
+		ib_vector_push(cache->indexes, NULL));
+
+	memset(index_cache, 0x0, sizeof(*index_cache));
+
+	index_cache->index = index;
+
+	index_cache->charset = fts_index_get_charset(index);
+
+	n_bytes = sizeof(que_t*) * sizeof(fts_index_selector);
+
+	index_cache->ins_graph = static_cast<que_t**>(
+		mem_heap_zalloc(static_cast<mem_heap_t*>(
+			cache->self_heap->arg), n_bytes));
+
+	index_cache->sel_graph = static_cast<que_t**>(
+		mem_heap_zalloc(static_cast<mem_heap_t*>(
+			cache->self_heap->arg), n_bytes));
+
+	fts_index_cache_init(cache->sync_heap, index_cache);
+
+	if (cache->get_docs) {
+		fts_reset_get_doc(cache);
+	}
+
+	return(index_cache);
+}
+
+/****************************************************************//**
+Release all resources help by the words rb tree e.g., the node ilist. */
+static
+void
+fts_words_free(
+/*===========*/
+	ib_rbt_t*	words)			/*!< in: rb tree of words */
+{
+	const ib_rbt_node_t*	rbt_node;
+
+	/* Free the resources held by a word. */
+	for (rbt_node = rbt_first(words);
+	     rbt_node != NULL;
+	     rbt_node = rbt_first(words)) {
+
+		ulint			i;
+		fts_tokenizer_word_t*	word;
+
+		word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+		/* Free the ilists of this word. */
+		for (i = 0; i < ib_vector_size(word->nodes); ++i) {
+
+			fts_node_t* fts_node = static_cast<fts_node_t*>(
+				ib_vector_get(word->nodes, i));
+
+			ut_free(fts_node->ilist);
+			fts_node->ilist = NULL;
+		}
+
+		/* NOTE: We are responsible for free'ing the node */
+		ut_free(rbt_remove_node(words, rbt_node));
+	}
+}
+
+/*********************************************************************//**
+Clear cache. If the shutdown flag is TRUE then the cache can contain
+data that needs to be freed. For regular clear as part of normal
+working we assume the caller has freed all resources. */
+UNIV_INTERN
+void
+fts_cache_clear(
+/*============*/
+	fts_cache_t*	cache,		/*!< in: cache */
+	ibool		free_words)	/*!< in: TRUE if free in memory
+					word cache. */
+{
+	ulint		i;
+
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		ulint			j;
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		if (free_words) {
+			fts_words_free(index_cache->words);
+		}
+
+		ut_a(rbt_empty(index_cache->words));
+
+		rbt_free(index_cache->words);
+
+		index_cache->words = NULL;
+
+		for (j = 0; fts_index_selector[j].value; ++j) {
+
+			if (index_cache->ins_graph[j] != NULL) {
+
+				fts_que_graph_free_check_lock(
+					NULL, index_cache,
+					index_cache->ins_graph[j]);
+
+				index_cache->ins_graph[j] = NULL;
+			}
+
+			if (index_cache->sel_graph[j] != NULL) {
+
+				fts_que_graph_free_check_lock(
+					NULL, index_cache,
+					index_cache->sel_graph[j]);
+
+				index_cache->sel_graph[j] = NULL;
+			}
+		}
+
+		index_cache->doc_stats = NULL;
+	}
+
+	mem_heap_free(static_cast<mem_heap_t*>(cache->sync_heap->arg));
+	cache->sync_heap->arg = NULL;
+
+	cache->total_size = 0;
+	cache->deleted_doc_ids = NULL;
+}
+
+/*********************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index cache else NULL */
+UNIV_INLINE
+fts_index_cache_t*
+fts_get_index_cache(
+/*================*/
+	fts_cache_t*		cache,		/*!< in: cache to search */
+	const dict_index_t*	index)		/*!< in: index to search for */
+{
+	ulint			i;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own((rw_lock_t*) &cache->lock, RW_LOCK_EX)
+	      || rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_EX));
+#endif
+
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		if (index_cache->index == index) {
+
+			return(index_cache);
+		}
+	}
+
+	return(NULL);
+}
+
+#ifdef FTS_DEBUG
+/*********************************************************************//**
+Search the index cache for a get_doc structure.
+@return the fts_get_doc_t item else NULL */
+static
+fts_get_doc_t*
+fts_get_index_get_doc(
+/*==================*/
+	fts_cache_t*		cache,		/*!< in: cache to search */
+	const dict_index_t*	index)		/*!< in: index to search for */
+{
+	ulint			i;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_EX));
+#endif
+
+	for (i = 0; i < ib_vector_size(cache->get_docs); ++i) {
+		fts_get_doc_t*	get_doc;
+
+		get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_get(cache->get_docs, i));
+
+		if (get_doc->index_cache->index == index) {
+
+			return(get_doc);
+		}
+	}
+
+	return(NULL);
+}
+#endif
+
+/**********************************************************************//**
+Free the FTS cache. */
+UNIV_INTERN
+void
+fts_cache_destroy(
+/*==============*/
+	fts_cache_t*	cache)			/*!< in: cache*/
+{
+	rw_lock_free(&cache->lock);
+	rw_lock_free(&cache->init_lock);
+	mutex_free(&cache->optimize_lock);
+	mutex_free(&cache->deleted_lock);
+	mutex_free(&cache->doc_id_lock);
+	rbt_free(cache->stopword_info.cached_stopword);
+
+	if (cache->sync_heap->arg) {
+		mem_heap_free(static_cast<mem_heap_t*>(cache->sync_heap->arg));
+	}
+
+	mem_heap_free(cache->cache_heap);
+}
+
+/**********************************************************************//**
+Find an existing word, or if not found, create one and return it.
+@return specified word token */
+static
+fts_tokenizer_word_t*
+fts_tokenizer_word_get(
+/*===================*/
+	fts_cache_t*	cache,			/*!< in: cache */
+	fts_index_cache_t*
+			index_cache,		/*!< in: index cache */
+	fts_string_t*	text)			/*!< in: node text */
+{
+	fts_tokenizer_word_t*	word;
+	ib_rbt_bound_t		parent;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX));
+#endif
+
+	/* If it is a stopword, do not index it */
+	if (rbt_search(cache->stopword_info.cached_stopword,
+		       &parent, text) == 0) {
+
+		return NULL;
+	}
+
+	/* Check if we found a match, if not then add word to tree. */
+	if (rbt_search(index_cache->words, &parent, text) != 0) {
+		mem_heap_t*		heap;
+		fts_tokenizer_word_t	new_word;
+
+		heap = static_cast<mem_heap_t*>(cache->sync_heap->arg);
+
+		new_word.nodes = ib_vector_create(
+			cache->sync_heap, sizeof(fts_node_t), 4);
+
+		fts_utf8_string_dup(&new_word.text, text, heap);
+
+		parent.last = rbt_add_node(
+			index_cache->words, &parent, &new_word);
+
+		/* Take into account the RB tree memory use and the vector. */
+		cache->total_size += sizeof(new_word)
+			+ sizeof(ib_rbt_node_t)
+			+ text->f_len
+			+ (sizeof(fts_node_t) * 4)
+			+ sizeof(*new_word.nodes);
+
+		ut_ad(rbt_validate(index_cache->words));
+	}
+
+	word = rbt_value(fts_tokenizer_word_t, parent.last);
+
+	return(word);
+}
+
+/**********************************************************************//**
+Add the given doc_id/word positions to the given node's ilist. */
+UNIV_INTERN
+void
+fts_cache_node_add_positions(
+/*=========================*/
+	fts_cache_t*	cache,		/*!< in: cache */
+	fts_node_t*	node,		/*!< in: word node */
+	doc_id_t	doc_id,		/*!< in: doc id */
+	ib_vector_t*	positions)	/*!< in: fts_token_t::positions */
+{
+	ulint		i;
+	byte*		ptr;
+	byte*		ilist;
+	ulint		enc_len;
+	ulint		last_pos;
+	byte*		ptr_start;
+	ulint		doc_id_delta;
+
+#ifdef UNIV_SYNC_DEBUG
+	if (cache) {
+		ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX));
+	}
+#endif
+	ut_ad(doc_id >= node->last_doc_id);
+
+	/* Calculate the space required to store the ilist. */
+	doc_id_delta = (ulint)(doc_id - node->last_doc_id);
+	enc_len = fts_get_encoded_len(doc_id_delta);
+
+	last_pos = 0;
+	for (i = 0; i < ib_vector_size(positions); i++) {
+		ulint	pos = *(static_cast<ulint*>(
+			ib_vector_get(positions, i)));
+
+		ut_ad(last_pos == 0 || pos > last_pos);
+
+		enc_len += fts_get_encoded_len(pos - last_pos);
+		last_pos = pos;
+	}
+
+	/* The 0x00 byte at the end of the token positions list. */
+	enc_len++;
+
+	if ((node->ilist_size_alloc - node->ilist_size) >= enc_len) {
+		/* No need to allocate more space, we can fit in the new
+		data at the end of the old one. */
+		ilist = NULL;
+		ptr = node->ilist + node->ilist_size;
+	} else {
+		ulint	new_size = node->ilist_size + enc_len;
+
+		/* Over-reserve space by a fixed size for small lengths and
+		by 20% for lengths >= 48 bytes. */
+		if (new_size < 16) {
+			new_size = 16;
+		} else if (new_size < 32) {
+			new_size = 32;
+		} else if (new_size < 48) {
+			new_size = 48;
+		} else {
+			new_size = (ulint)(1.2 * new_size);
+		}
+
+		ilist = static_cast<byte*>(ut_malloc(new_size));
+		ptr = ilist + node->ilist_size;
+
+		node->ilist_size_alloc = new_size;
+	}
+
+	ptr_start = ptr;
+
+	/* Encode the new fragment. */
+	ptr += fts_encode_int(doc_id_delta, ptr);
+
+	last_pos = 0;
+	for (i = 0; i < ib_vector_size(positions); i++) {
+		ulint	pos = *(static_cast<ulint*>(
+			 ib_vector_get(positions, i)));
+
+		ptr += fts_encode_int(pos - last_pos, ptr);
+		last_pos = pos;
+	}
+
+	*ptr++ = 0;
+
+	ut_a(enc_len == (ulint)(ptr - ptr_start));
+
+	if (ilist) {
+		/* Copy old ilist to the start of the new one and switch the
+		new one into place in the node. */
+		if (node->ilist_size > 0) {
+			memcpy(ilist, node->ilist, node->ilist_size);
+			ut_free(node->ilist);
+		}
+
+		node->ilist = ilist;
+	}
+
+	node->ilist_size += enc_len;
+
+	if (cache) {
+		cache->total_size += enc_len;
+	}
+
+	if (node->first_doc_id == FTS_NULL_DOC_ID) {
+		node->first_doc_id = doc_id;
+	}
+
+	node->last_doc_id = doc_id;
+	++node->doc_count;
+}
+
+/**********************************************************************//**
+Add document to the cache. */
+static
+void
+fts_cache_add_doc(
+/*==============*/
+	fts_cache_t*	cache,			/*!< in: cache */
+	fts_index_cache_t*
+			index_cache,		/*!< in: index cache */
+	doc_id_t	doc_id,			/*!< in: doc id to add */
+	ib_rbt_t*	tokens)			/*!< in: document tokens */
+{
+	const ib_rbt_node_t*	node;
+	ulint			n_words;
+	fts_doc_stats_t*	doc_stats;
+
+	if (!tokens) {
+		return;
+	}
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX));
+#endif
+
+	n_words = rbt_size(tokens);
+
+	for (node = rbt_first(tokens); node; node = rbt_first(tokens)) {
+
+		fts_tokenizer_word_t*	word;
+		fts_node_t*		fts_node = NULL;
+		fts_token_t*		token = rbt_value(fts_token_t, node);
+
+		/* Find and/or add token to the cache. */
+		word = fts_tokenizer_word_get(
+			cache, index_cache, &token->text);
+
+		if (!word) {
+			ut_free(rbt_remove_node(tokens, node));
+			continue;
+		}
+
+		if (ib_vector_size(word->nodes) > 0) {
+			fts_node = static_cast<fts_node_t*>(
+				ib_vector_last(word->nodes));
+		}
+
+		if (fts_node == NULL
+		    || fts_node->ilist_size > FTS_ILIST_MAX_SIZE
+		    || doc_id < fts_node->last_doc_id) {
+
+			fts_node = static_cast<fts_node_t*>(
+				ib_vector_push(word->nodes, NULL));
+
+			memset(fts_node, 0x0, sizeof(*fts_node));
+
+			cache->total_size += sizeof(*fts_node);
+		}
+
+		fts_cache_node_add_positions(
+			cache, fts_node, doc_id, token->positions);
+
+		ut_free(rbt_remove_node(tokens, node));
+	}
+
+	ut_a(rbt_empty(tokens));
+
+	/* Add to doc ids processed so far. */
+	doc_stats = static_cast<fts_doc_stats_t*>(
+		ib_vector_push(index_cache->doc_stats, NULL));
+
+	doc_stats->doc_id = doc_id;
+	doc_stats->word_count = n_words;
+
+	/* Add the doc stats memory usage too. */
+	cache->total_size += sizeof(*doc_stats);
+
+	if (doc_id > cache->sync->max_doc_id) {
+		cache->sync->max_doc_id = doc_id;
+	}
+}
+
+/****************************************************************//**
+Drops a table. If the table can't be found we return a SUCCESS code.
+@return DB_SUCCESS or error code */
+static
+ulint
+fts_drop_table(
+/*===========*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	table_name)		/*!< in: table to drop */
+{
+	ulint		error = DB_SUCCESS;
+
+	/* Check that the table exists in our data dictionary. */
+	if (dict_table_get_low(table_name)) {
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Dropping %s\n", table_name);
+#endif
+
+		error = row_drop_table_for_mysql(table_name, trx, TRUE);
+
+		/* We only return the status of the last error. */
+		if (error != DB_SUCCESS) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Error: (%lu) dropping "
+				"FTS index table %s\n", error, table_name);
+		}
+	} else {
+		ut_print_timestamp(stderr);
+
+		/* FIXME: Should provide appropriate error return code
+		rather than printing message indiscriminately. */
+		fprintf(stderr, "  InnoDB: %s not found.\n",
+			table_name);
+	}
+
+	return(error);
+}
+
+/****************************************************************//**
+Drops the common ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
+@return DB_SUCCESS or error code */
+static
+ulint
+fts_drop_common_tables(
+/*===================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table)		/*!< in: table with an FTS
+						index */
+{
+	ulint		i;
+	ulint		error = DB_SUCCESS;
+
+	for (i = 0; fts_common_tables[i] != NULL; ++i) {
+		ulint	err;
+		char*	table_name;
+
+		fts_table->suffix = fts_common_tables[i];
+
+		table_name = fts_get_table_name(fts_table);
+
+		err = fts_drop_table(trx, table_name);
+
+		/* We only return the status of the last error. */
+		if (err != DB_SUCCESS) {
+			error = err;
+		}
+
+		mem_free(table_name);
+	}
+
+	return(error);
+}
+
+/****************************************************************//**
+Since we do a horizontal split on the index table, we need to drop the
+all the split tables.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_drop_index_split_tables(
+/*========================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index)			/*!< in: fts instance */
+
+{
+	ulint		i;
+	fts_table_t	fts_table;
+	ulint		error = DB_SUCCESS;
+
+	FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+
+	for (i = 0; fts_index_selector[i].value; ++i) {
+		ulint	err;
+		char*	table_name;
+
+		fts_table.suffix = fts_get_suffix(i);
+
+		table_name = fts_get_table_name(&fts_table);
+
+		err = fts_drop_table(trx, table_name);
+
+		/* We only return the status of the last error. */
+		if (err != DB_SUCCESS) {
+			error = err;
+		}
+
+		mem_free(table_name);
+	}
+
+	return(error);
+}
+
+/****************************************************************//**
+Drops FTS auxiliary tables for an FTS index
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_drop_index_tables(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index)		/*!< in: Index to drop */
+{
+	ulint			err;
+	ulint			error = DB_SUCCESS;
+	fts_table_t		fts_table;
+	ulint			j;
+
+	static const char*	index_tables[] = {
+		"DOC_ID",
+		NULL
+	};
+
+	err = fts_drop_index_split_tables(trx, index);
+
+	/* We only return the status of the last error. */
+	if (err != DB_SUCCESS) {
+		error = err;
+	}
+
+	FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+
+	for (j = 0; index_tables[j] != NULL; ++j) {
+		ulint	err;
+		char*	table_name;
+
+		fts_table.suffix = index_tables[j];
+
+		table_name = fts_get_table_name(&fts_table);
+
+		err = fts_drop_table(trx, table_name);
+
+		/* We only return the status of the last error. */
+		if (err != DB_SUCCESS) {
+			error = err;
+		}
+
+		mem_free(table_name);
+	}
+
+	return(error);
+}
+
+/****************************************************************//**
+Drops FTS ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
+@return DB_SUCCESS or error code */
+static
+ulint
+fts_drop_all_index_tables(
+/*======================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_t*		fts)			/*!< in: fts instance */
+{
+	ulint		i;
+	ulint		error = DB_SUCCESS;
+
+	for (i = 0; i < ib_vector_size(fts->indexes); ++i) {
+		ulint		err;
+		dict_index_t*	index;
+
+		index = static_cast<dict_index_t*>(
+			ib_vector_getp(fts->indexes, i));
+
+		err = fts_drop_index_tables(trx, index);
+
+		if (err != DB_SUCCESS) {
+			error = err;
+		}
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Drops the ancillary tables needed for supporting an FTS index on a
+given table. row_mysql_lock_data_dictionary must have been called before
+this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_drop_tables(
+/*============*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	table)		/*!< in: table has the FTS index */
+{
+	ulint		error;
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+	error = fts_drop_common_tables(trx, &fts_table);
+
+	if (error == DB_SUCCESS) {
+		error = fts_drop_all_index_tables(trx, table->fts);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Prepare the SQL, so that all '%s' are replaced by the common prefix.
+@return sql string, use mem_free() to free the memory */
+static
+char*
+fts_prepare_sql(
+/*============*/
+	fts_table_t*	fts_table,	/*!< in: table name info */
+	const char*	my_template)	/*!< in: sql template */
+{
+	char*		sql;
+	char*		name_prefix;
+
+	name_prefix = fts_get_table_name_prefix(fts_table);
+	sql = ut_strreplace(my_template, "%s", name_prefix);
+	mem_free(name_prefix);
+
+	return(sql);
+}
+
+/*********************************************************************//**
+Creates the common ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
+@return DB_SUCCESS if succeed */
+UNIV_INTERN
+ulint
+fts_create_common_tables(
+/*=====================*/
+	trx_t*		trx,			/*!< in: transaction */
+	const dict_table_t* table,		/*!< in: table with FTS index */
+	const char*	name,			/*!< in: table name normalized.*/
+	ibool		skip_doc_id_index)	/*!< in: Skip index on doc id */
+
+{
+	char*		sql;
+	ulint		error;
+	que_t*		graph;
+	fts_table_t	fts_table;
+	mem_heap_t*	heap = mem_heap_create(1024);
+
+	FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+	error = fts_drop_common_tables(trx, &fts_table);
+
+	if (error != DB_SUCCESS) {
+
+		goto func_exit;
+	}
+
+	/* Create the FTS tables that are common to an FTS index. */
+	sql = fts_prepare_sql(&fts_table, fts_create_common_tables_sql);
+	graph = fts_parse_sql_no_dict_lock(NULL, NULL, sql);
+	mem_free(sql);
+
+	error = fts_eval_sql(trx, graph);
+
+	que_graph_free(graph);
+
+	if (error != DB_SUCCESS) {
+
+		goto func_exit;
+	}
+
+	/* Write the default settings to the config table. */
+	fts_table.suffix = "CONFIG";
+	graph = fts_parse_sql_no_dict_lock(
+		&fts_table, NULL, fts_config_table_insert_values_sql);
+
+	error = fts_eval_sql(trx, graph);
+
+	que_graph_free(graph);
+
+	if (error != DB_SUCCESS || skip_doc_id_index) {
+
+		goto func_exit;
+	}
+
+	/* Create the FTS DOC_ID index on the hidden column. Currently this
+	is common for any FT index created on the table. */
+	graph = fts_parse_sql_no_dict_lock(
+		NULL,
+		NULL,
+		mem_heap_printf(
+			heap,
+			"BEGIN\n"
+			""
+			"CREATE UNIQUE INDEX %s ON %s(%s);\n",
+			FTS_DOC_ID_INDEX_NAME, name, FTS_DOC_ID_COL_NAME));
+
+	error = fts_eval_sql(trx, graph);
+	que_graph_free(graph);
+
+func_exit:
+	if (error != DB_SUCCESS) {
+		/* We have special error handling here */
+
+		trx->error_state = DB_SUCCESS;
+
+		trx_rollback_to_savepoint(trx, NULL);
+
+		row_drop_table_for_mysql(table->name, trx, FALSE);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	mem_heap_free(heap);
+
+	return(error);
+}
+
+/*************************************************************//**
+Wrapper function of fts_create_index_tables_low(), create auxiliary
+tables for an FTS index
+@return: DB_SUCCESS or error code */
+static
+dict_table_t*
+fts_create_one_index_table(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const dict_index_t*
+			index,		/*!< in: the index instance */
+	fts_table_t*	fts_table,	/*!< in: fts_table structure */
+	mem_heap_t*	heap)		/*!< in: heap */
+{
+	dict_field_t*		field;
+	dict_table_t*		new_table = NULL;
+	char*			table_name = fts_get_table_name(fts_table);
+	ulint			error;
+	CHARSET_INFO*		charset;
+
+	ut_ad(index->type & DICT_FTS);
+
+	new_table = dict_mem_table_create(table_name, 0, 5, 1, 0);
+
+	field = dict_index_get_nth_field(index, 0);
+	charset = innobase_get_fts_charset(
+		(int)(field->col->prtype & DATA_MYSQL_TYPE_MASK),
+		(uint) dtype_get_charset_coll(field->col->prtype));
+
+	if (strcmp(charset->name, "latin1_swedish_ci") == 0) {
+		dict_mem_table_add_col(new_table, heap, "word", DATA_VARCHAR,
+				       field->col->prtype, FTS_MAX_WORD_LEN);
+	} else {
+		dict_mem_table_add_col(new_table, heap, "word", DATA_VARMYSQL,
+				       field->col->prtype, FTS_MAX_WORD_LEN);
+	}
+
+	dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT,
+			       DATA_NOT_NULL | DATA_UNSIGNED,
+			       sizeof(doc_id_t));
+
+	dict_mem_table_add_col(new_table, heap, "last_doc_id", DATA_INT,
+			       DATA_NOT_NULL | DATA_UNSIGNED,
+			       sizeof(doc_id_t));
+
+	dict_mem_table_add_col(new_table, heap, "doc_count", DATA_INT,
+			       DATA_NOT_NULL | DATA_UNSIGNED, 4);
+
+	dict_mem_table_add_col(new_table, heap, "ilist", DATA_BLOB,
+			       4130048,	0);
+
+	error = row_create_table_for_mysql(new_table, trx);
+
+	if (error != DB_SUCCESS) {
+		trx->error_state = static_cast<db_err>(error);
+		dict_mem_table_free(new_table);
+		new_table = NULL;
+		fprintf(stderr, "  InnoDB: Warning: Fail to create FTS "
+				"  index table %s \n", table_name);
+	}
+
+	mem_free(table_name);
+
+	return(new_table);
+}
+
+/*************************************************************//**
+Wrapper function of fts_create_index_tables_low(), create auxiliary
+tables for an FTS index
+@return: DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_create_index_tables_low(
+/*========================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const dict_index_t*
+			index,		/*!< in: the index instance */
+	const char*	table_name,	/*!< in: the table name */
+	table_id_t	table_id)	/*!< in: the table id */
+
+{
+	ulint		i;
+	char*		sql;
+	que_t*		graph;
+	fts_table_t	fts_table;
+	ulint		error = DB_SUCCESS;
+	mem_heap_t*	heap = mem_heap_create(1024);
+
+	fts_table.type = FTS_INDEX_TABLE;
+	fts_table.index_id = index->id;
+	fts_table.table_id = table_id;
+	fts_table.parent = table_name;
+	fts_table.table = NULL;
+
+	/* Create the FTS auxiliary tables that are specific
+	to an FTS index. */
+	sql = fts_prepare_sql(&fts_table, fts_create_index_tables_sql);
+	graph = fts_parse_sql_no_dict_lock(NULL, NULL, sql);
+	mem_free(sql);
+
+	error = fts_eval_sql(trx, graph);
+	que_graph_free(graph);
+
+	for (i = 0; fts_index_selector[i].value && error == DB_SUCCESS; ++i) {
+		dict_table_t*	new_table;
+
+		/* Create the FTS auxiliary tables that are specific
+		to an FTS index. We need to preserve the table_id %s
+		which fts_parse_sql_no_dict_lock() will fill in for us. */
+		fts_table.suffix = fts_get_suffix(i);
+
+		new_table = fts_create_one_index_table(
+			trx, index, &fts_table, heap);
+
+		if (!new_table) {
+			error = DB_FAIL;
+			break;
+		}
+
+		graph = fts_parse_sql_no_dict_lock(
+			&fts_table, NULL, fts_create_index_sql);
+
+		error = fts_eval_sql(trx, graph);
+		que_graph_free(graph);
+	}
+
+	if (error == DB_SUCCESS) {
+		error = fts_sql_commit(trx);
+	} else {
+		/* We have special error handling here */
+
+		trx->error_state = DB_SUCCESS;
+
+		trx_rollback_to_savepoint(trx, NULL);
+
+		row_drop_table_for_mysql(table_name, trx, FALSE);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	mem_heap_free(heap);
+
+	return(error);
+}
+
+/******************************************************************//**
+Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table. row_mysql_lock_data_dictionary must have
+been called before this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_create_index_tables(
+/*====================*/
+	trx_t*			trx,	/*!< in: transaction */
+	const dict_index_t*	index)	/*!< in: the index instance */
+{
+	dict_table_t*	table;
+
+	table = dict_table_get_low(index->table_name);
+	ut_a(table != NULL);
+
+	return(fts_create_index_tables_low(trx, index, table->name, table->id));
+}
+#if 0
+/******************************************************************//**
+Return string representation of state. */
+static
+const char*
+fts_get_state_str(
+/*==============*/
+				/* out: string representation of state */
+	fts_row_state	state)	/*!< in: state */
+{
+	switch (state) {
+	case FTS_INSERT:
+		return "INSERT";
+
+	case FTS_MODIFY:
+		return "MODIFY";
+
+	case FTS_DELETE:
+		return "DELETE";
+
+	case FTS_NOTHING:
+		return "NOTHING";
+
+	case FTS_INVALID:
+		return "INVALID";
+
+	default:
+		return "UNKNOWN";
+	}
+}
+#endif
+
+/******************************************************************//**
+Calculate the new state of a row given the existing state and a new event.
+@return new state of row */
+static
+fts_row_state
+fts_trx_row_get_new_state(
+/*======================*/
+	fts_row_state	old_state,		/*!< in: existing state of row */
+	fts_row_state	event)			/*!< in: new event */
+{
+	/* The rules for transforming states:
+
+	I = inserted
+	M = modified
+	D = deleted
+	N = nothing
+
+	M+D -> D:
+
+	If the row existed before the transaction started and it is modified
+	during the transaction, followed by a deletion of the row, only the
+	deletion will be signaled.
+
+	M+ -> M:
+
+	If the row existed before the transaction started and it is modified
+	more than once during the transaction, only the last modification
+	will be signaled.
+
+	IM*D -> N:
+
+	If a new row is added during the transaction (and possibly modified
+	after its initial insertion) but it is deleted before the end of the
+	transaction, nothing will be signaled.
+
+	IM* -> I:
+
+	If a new row is added during the transaction and modified after its
+	initial insertion, only the addition will be signaled.
+
+	M*DI -> M:
+
+	If the row existed before the transaction started and it is deleted,
+	then re-inserted, only a modification will be signaled. Note that
+	this case is only possible if the table is using the row's primary
+	key for FTS row ids, since those can be re-inserted by the user,
+	which is not true for InnoDB generated row ids.
+
+	It is easily seen that the above rules decompose such that we do not
+	need to store the row's entire history of events. Instead, we can
+	store just one state for the row and update that when new events
+	arrive. Then we can implement the above rules as a two-dimensional
+	look-up table, and get checking of invalid combinations "for free"
+	in the process. */
+
+	/* The lookup table for transforming states. old_state is the
+	Y-axis, event is the X-axis. */
+	static const fts_row_state table[4][4] = {
+			/*    I            M            D            N */
+		/* I */	{ FTS_INVALID, FTS_INSERT,  FTS_NOTHING, FTS_INVALID },
+		/* M */	{ FTS_INVALID, FTS_MODIFY,  FTS_DELETE,  FTS_INVALID },
+		/* D */	{ FTS_MODIFY,  FTS_INVALID, FTS_INVALID, FTS_INVALID },
+		/* N */	{ FTS_INVALID, FTS_INVALID, FTS_INVALID, FTS_INVALID }
+	};
+
+	fts_row_state result;
+
+	ut_a(old_state < FTS_INVALID);
+	ut_a(event < FTS_INVALID);
+
+	result = table[(int) old_state][(int) event];
+	ut_a(result != FTS_INVALID);
+
+	return(result);
+}
+
+/******************************************************************//**
+Create a savepoint instance.
+@return savepoint instance */
+static
+fts_savepoint_t*
+fts_savepoint_create(
+/*=================*/
+	ib_vector_t*	savepoints,		/*!< out: InnoDB transaction */
+	const char*	name,			/*!< in: savepoint name */
+	mem_heap_t*	heap)			/*!< in: heap */
+{
+	fts_savepoint_t*	savepoint;
+
+	savepoint = static_cast<fts_savepoint_t*>(
+		ib_vector_push(savepoints, NULL));
+
+	memset(savepoint, 0x0, sizeof(*savepoint));
+
+	if (name) {
+		savepoint->name = mem_heap_strdup(heap, name);
+	}
+
+	savepoint->tables = rbt_create(
+		sizeof(fts_trx_table_t*), fts_trx_table_cmp);
+
+	return(savepoint);
+}
+
+/******************************************************************//**
+Create an FTS trx.
+@return FTS trx  */
+static
+fts_trx_t*
+fts_trx_create(
+/*===========*/
+	trx_t*	trx)				/*!< in: InnoDB transaction */
+{
+	fts_trx_t*	ftt;
+	ib_alloc_t*	heap_alloc;
+	mem_heap_t*	heap = mem_heap_create(1024);
+
+	ftt = static_cast<fts_trx_t*>(mem_heap_alloc(heap, sizeof(fts_trx_t)));
+	ftt->trx = trx;
+	ftt->heap = heap;
+
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	ftt->savepoints = static_cast<ib_vector_t*>(ib_vector_create(
+		heap_alloc, sizeof(fts_savepoint_t), 4));
+
+	ftt->last_stmt = static_cast<ib_vector_t*>(ib_vector_create(
+		heap_alloc, sizeof(fts_savepoint_t), 4));
+
+	/* Default instance has no name and no heap. */
+	fts_savepoint_create(ftt->savepoints, NULL, NULL);
+	fts_savepoint_create(ftt->last_stmt, NULL, NULL);
+
+	return(ftt);
+}
+
+/******************************************************************//**
+Create an FTS trx table.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_create(
+/*=================*/
+	fts_trx_t*	fts_trx,		/*!< in: FTS trx */
+	dict_table_t*	table)			/*!< in: table */
+{
+	fts_trx_table_t*	ftt;
+
+	ftt = static_cast<fts_trx_table_t*>(
+		mem_heap_alloc(fts_trx->heap, sizeof(*ftt)));
+
+	memset(ftt, 0x0, sizeof(*ftt));
+
+	ftt->table = table;
+	ftt->fts_trx = fts_trx;
+
+	ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+	return(ftt);
+}
+
+/******************************************************************//**
+Clone an FTS trx table.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_clone(
+/*=================*/
+	const fts_trx_table_t*	ftt_src)	/*!< in: FTS trx */
+{
+	fts_trx_table_t*	ftt;
+
+	ftt = static_cast<fts_trx_table_t*>(
+		mem_heap_alloc(ftt_src->fts_trx->heap, sizeof(*ftt)));
+
+	memset(ftt, 0x0, sizeof(*ftt));
+
+	ftt->table = ftt_src->table;
+	ftt->fts_trx = ftt_src->fts_trx;
+
+	ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+	/* Copy the rb tree values to the new savepoint. */
+	rbt_merge_uniq(ftt_src->rows, ftt->rows);
+
+	/* These are only added on commit. At this stage we only have
+	the updated row state. */
+	ut_a(ftt_src->added_doc_ids == NULL);
+
+	return(ftt);
+}
+
+/******************************************************************//**
+Initialize the FTS trx instance.
+@return FTS trx instance */
+static
+fts_trx_table_t*
+fts_trx_init(
+/*=========*/
+	trx_t*			trx,		/*!< in: transaction */
+	dict_table_t*		table,		/*!< in: FTS table instance */
+	ib_vector_t*		savepoints)	/*!< in: Savepoints */
+{
+	fts_trx_table_t*	ftt;
+	ib_rbt_bound_t		parent;
+	ib_rbt_t*		tables;
+	fts_savepoint_t*	savepoint;
+
+	savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
+
+	tables = savepoint->tables;
+	rbt_search_cmp(tables, &parent, &table->id, fts_trx_table_id_cmp, NULL);
+
+	if (parent.result == 0) {
+		fts_trx_table_t**	fttp;
+
+		fttp = rbt_value(fts_trx_table_t*, parent.last);
+		ftt = *fttp;
+	} else {
+		ftt = fts_trx_table_create(trx->fts_trx, table);
+		rbt_add_node(tables, &parent, &ftt);
+	}
+
+	ut_a(ftt->table == table);
+
+	return(ftt);
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+static
+void
+fts_trx_table_add_op(
+/*=================*/
+	fts_trx_table_t*ftt,			/*!< in: FTS trx table */
+	doc_id_t	doc_id,			/*!< in: doc id */
+	fts_row_state	state,			/*!< in: state of the row */
+	ib_vector_t*	fts_indexes)		/*!< in: FTS indexes affected */
+{
+	ib_rbt_t*	rows;
+	ib_rbt_bound_t	parent;
+
+	rows = ftt->rows;
+	rbt_search(rows, &parent, &doc_id);
+
+	/* Row id found, update state, and if new state is FTS_NOTHING,
+	we delete the row from our tree. */
+	if (parent.result == 0) {
+		fts_trx_row_t*	row = rbt_value(fts_trx_row_t, parent.last);
+
+		row->state = fts_trx_row_get_new_state(row->state, state);
+
+		if (row->state == FTS_NOTHING) {
+			if (row->fts_indexes) {
+				ib_vector_free(row->fts_indexes);
+			}
+
+			ut_free(rbt_remove_node(rows, parent.last));
+			row = NULL;
+		} else if (row->fts_indexes != NULL) {
+			ib_vector_free(row->fts_indexes);
+			row->fts_indexes = fts_indexes;
+		}
+
+	} else { /* Row-id not found, create a new one. */
+		fts_trx_row_t	row;
+
+		row.doc_id = doc_id;
+		row.state = state;
+		row.fts_indexes = fts_indexes;
+
+		rbt_add_node(rows, &parent, &row);
+	}
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+UNIV_INTERN
+void
+fts_trx_add_op(
+/*===========*/
+	trx_t*		trx,			/*!< in: InnoDB transaction */
+	dict_table_t*	table,			/*!< in: table */
+	doc_id_t	doc_id,			/*!< in: new doc id */
+	fts_row_state	state,			/*!< in: state of the row */
+	ib_vector_t*	fts_indexes)		/*!< in: FTS indexes affected
+						(NULL=all) */
+{
+	fts_trx_table_t*	tran_ftt;
+	fts_trx_table_t*	stmt_ftt;
+
+	if (!trx->fts_trx) {
+		trx->fts_trx = fts_trx_create(trx);
+	}
+
+	tran_ftt = fts_trx_init(trx, table, trx->fts_trx->savepoints);
+	stmt_ftt = fts_trx_init(trx, table, trx->fts_trx->last_stmt);
+
+	fts_trx_table_add_op(tran_ftt, doc_id, state, fts_indexes);
+	fts_trx_table_add_op(stmt_ftt, doc_id, state, fts_indexes);
+}
+
+/******************************************************************//**
+Fetch callback that converts a textual document id to a binary value and
+stores it in the given place.
+@return always returns NULL */
+static
+ibool
+fts_fetch_store_doc_id(
+/*===================*/
+	void*		row,			/*!< in: sel_node_t* */
+	void*		user_arg)		/*!< in: doc_id_t* to store
+						doc_id in */
+{
+	int		n_parsed;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	doc_id_t*	doc_id = static_cast<doc_id_t*>(user_arg);
+	dfield_t*	dfield = que_node_get_val(node->select_list);
+	dtype_t*	type = dfield_get_type(dfield);
+	ulint		len = dfield_get_len(dfield);
+
+	char		buf[32];
+
+	ut_a(dtype_get_mtype(type) == DATA_VARCHAR);
+	ut_a(len > 0 && len < sizeof(buf));
+
+	memcpy(buf, dfield_get_data(dfield), len);
+	buf[len] = '\0';
+
+	n_parsed = sscanf(buf, FTS_DOC_ID_FORMAT, doc_id);
+	ut_a(n_parsed == 1);
+
+	return(FALSE);
+}
+
+#ifdef FTS_CACHE_SIZE_DEBUG
+/******************************************************************//**
+Get the max cache size in bytes. If there is an error reading the
+value we simply print an error message here and return the default
+value to the caller.
+@return max cache size in bytes */
+static
+ulint
+fts_get_max_cache_size(
+/*===================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table)		/*!< in: table instance */
+{
+	ulint		error;
+	fts_string_t	value;
+	ulint		cache_size_in_mb;
+
+	/* Set to the default value. */
+	cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value. */
+	value.f_n_char = 0;
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = ut_malloc(value.f_len + 1);
+
+	error = fts_config_get_value(
+		trx, fts_table, FTS_MAX_CACHE_SIZE_IN_MB, &value);
+
+	if (error == DB_SUCCESS) {
+
+		value.f_str[value.f_len] = 0;
+		cache_size_in_mb = strtoul((char*) value.f_str, NULL, 10);
+
+		if (cache_size_in_mb > FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Warning: FTS max cache size "
+				" (%lu) out of range. Minimum value is "
+				"%luMB and the maximum values is %luMB, "
+				"setting cache size to upper limit\n",
+				cache_size_in_mb,
+				FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB,
+				FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB);
+
+			cache_size_in_mb = FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB;
+
+		} else if  (cache_size_in_mb
+			    < FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Warning: FTS max cache size "
+				" (%lu) out of range. Minimum value is "
+				"%luMB and the maximum values is %luMB, "
+				"setting cache size to lower limit\n",
+				cache_size_in_mb,
+				FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB,
+				FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB);
+
+			cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+		}
+	} else {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "InnoDB: Error: (%lu) reading max cache "
+			"config value from config table\n", error);
+	}
+
+	ut_free(value.f_str);
+
+	return(cache_size_in_mb * 1024 * 1024);
+}
+#endif
+
+/*********************************************************************//**
+Get the total number of documents in the FTS.
+@return estimated number of rows in the table */
+UNIV_INTERN
+ulint
+fts_get_total_document_count(
+/*=========================*/
+	dict_table_t*   table)		/*!< in: table instance */
+{
+	ut_ad(table->stat_initialized);
+
+	return((ulint) table->stat_n_rows);
+}
+
+/*********************************************************************//**
+Get the total number of words in the FTS for a particular FTS index.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+ulint
+fts_get_total_word_count(
+/*=====================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: for this index */
+	ulint*		total)			/* out: total words */
+{
+	ulint		error;
+	fts_string_t	value;
+
+	*total = 0;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value. */
+	value.f_n_char = 0;
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+	error = fts_config_get_index_value(
+		trx, index, FTS_TOTAL_WORD_COUNT, &value);
+
+	if (error == DB_SUCCESS) {
+
+		value.f_str[value.f_len] = 0;
+		*total = strtoul((char*) value.f_str, NULL, 10);
+	} else {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error: (%lu) reading total words "
+			"value from config table\n", error);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
+/*********************************************************************//**
+Update the next and last Doc ID in the CONFIG table to be the input
+"doc_id" value (+ 1). We would do so after each FTS index build or
+table truncate */
+UNIV_INTERN
+void
+fts_update_next_doc_id(
+/*===================*/
+	const dict_table_t*	table,		/*!< in: table */
+	const char*		table_name,	/*!< in: table name */
+	doc_id_t		doc_id)		/*!< in: DOC ID to set */
+{
+	table->fts->cache->synced_doc_id = doc_id;
+	table->fts->cache->next_doc_id = doc_id + 1;
+
+	table->fts->cache->first_doc_id = table->fts->cache->next_doc_id;
+
+	fts_update_sync_doc_id(
+		table, table_name, table->fts->cache->synced_doc_id, NULL);
+
+}
+
+/*********************************************************************//**
+Get the next available document id.
+@return DB_SUCCESS if OK */
+UNIV_INTERN
+ulint
+fts_get_next_doc_id(
+/*================*/
+	const dict_table_t*	table,		/*!< in: table */
+	doc_id_t*		doc_id)		/*!< out: new document id */
+{
+	fts_cache_t*	cache = table->fts->cache;
+
+	/* If the Doc ID system has not yet been initialized, we
+	will consult the CONFIG table and user table to re-establish
+	the initial value of the Doc ID */
+
+	if (cache->first_doc_id != 0 || !fts_init_doc_id(table)) {
+		if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+			*doc_id = FTS_NULL_DOC_ID;
+			return(DB_SUCCESS);
+		}
+
+		/* Otherwise, simply increment the value in cache */
+		mutex_enter(&cache->doc_id_lock);
+		++cache->next_doc_id;
+		mutex_exit(&cache->doc_id_lock);
+	}
+
+	*doc_id = cache->next_doc_id;
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+This function fetch the Doc ID from CONFIG table, and compare with
+the Doc ID supplied. And store the larger one to the CONFIG table.
+@return DB_SUCCESS if OK */
+UNIV_INTERN
+ulint
+fts_cmp_set_sync_doc_id(
+/*====================*/
+	const dict_table_t*	table,		/*!< in: table */
+	doc_id_t		doc_id_cmp,	/*!< in: Doc ID to compare */
+	ibool			read_only,	/*!< in: TRUE if read the
+						synced_doc_id only */
+	doc_id_t*		doc_id)		/*!< out: larger document id
+						after comparing "doc_id_cmp"
+						to the one stored in CONFIG
+						table */
+{
+	trx_t*		trx;
+	pars_info_t*	info;
+	ulint		error;
+	fts_table_t	fts_table;
+	que_t*		graph = NULL;
+	fts_cache_t*	cache = table->fts->cache;
+retry:
+	ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+	fts_table.suffix = "CONFIG";
+	fts_table.table_id = table->id;
+	fts_table.type = FTS_COMMON_TABLE;
+	fts_table.table = table;
+
+	fts_table.parent = table->name;
+
+	trx = trx_allocate_for_background();
+
+	trx->op_info = "update the next FTS document id";
+
+	info = pars_info_create();
+
+	pars_info_bind_function(
+		info, "my_func", fts_fetch_store_doc_id, doc_id);
+
+	graph = fts_parse_sql(
+		&fts_table, info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS SELECT value FROM %s"
+		" WHERE key = 'synced_doc_id' FOR UPDATE;\n"
+		"BEGIN\n"
+		""
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	*doc_id = 0;
+
+	error = fts_eval_sql(trx, graph);
+
+	fts_que_graph_free_check_lock(&fts_table, NULL, graph);
+
+	// FIXME: We need to retry deadlock errors
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	ut_a(*doc_id > 0);
+
+	if (read_only) {
+		goto func_exit;
+	}
+
+	if (doc_id_cmp == 0 && *doc_id) {
+		cache->synced_doc_id = *doc_id - 1;
+	} else {
+		cache->synced_doc_id = ut_max(doc_id_cmp, *doc_id);
+	}
+
+	mutex_enter(&cache->doc_id_lock);
+	/* For each sync operation, we will add next_doc_id by 1,
+	so to mark a sync operation */
+	if (cache->next_doc_id < cache->synced_doc_id + 1) {
+		cache->next_doc_id = cache->synced_doc_id + 1;
+	}
+	mutex_exit(&cache->doc_id_lock);
+
+	if (doc_id_cmp > *doc_id) {
+		error = fts_update_sync_doc_id(
+			table, table->name, cache->synced_doc_id, trx);
+	}
+
+	*doc_id = cache->next_doc_id;
+
+func_exit:
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(trx);
+	} else {
+		*doc_id = 0;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error: (%lu) "
+			"while getting next doc id.\n", error);
+
+		fts_sql_rollback(trx);
+
+		if (error == DB_DEADLOCK) {
+			os_thread_sleep(FTS_DEADLOCK_RETRY_WAIT);
+			goto retry;
+		}
+	}
+
+	trx_free_for_background(trx);
+
+	return(error);
+}
+
+/*********************************************************************//**
+Update the last document id. This function could create a new
+transaction to update the last document id.
+@return DB_SUCCESS if OK */
+UNIV_INTERN
+ulint
+fts_update_sync_doc_id(
+/*===================*/
+	const dict_table_t*	table,		/*!< in: table */
+	const char*		table_name,	/*!< in: table name */
+	doc_id_t		doc_id,		/*!< in: last document id */
+	trx_t*			trx)		/*!< in: update trx */
+{
+	byte		id[FTS_MAX_ID_LEN];
+	pars_info_t*	info;
+	fts_table_t	fts_table;
+	ulint		id_len;
+	que_t*		graph = NULL;
+	ulint		error;
+	ibool		local_trx = FALSE;
+	fts_cache_t*	cache = table->fts->cache;;
+
+	fts_table.suffix = "CONFIG";
+	fts_table.table_id = table->id;
+	fts_table.type = FTS_COMMON_TABLE;
+	fts_table.table = table;
+	if (table_name) {
+		fts_table.parent = table_name;
+	} else {
+		fts_table.parent = table->name;
+	}
+
+	if (!trx) {
+		trx = trx_allocate_for_background();
+
+		trx->op_info = "setting last FTS document id";
+		local_trx = TRUE;
+	}
+
+	info = pars_info_create();
+
+	// FIXME: Get rid of snprintf
+	id_len = snprintf(
+		(char*) id, sizeof(id), FTS_DOC_ID_FORMAT, doc_id + 1);
+
+	pars_info_bind_varchar_literal(info, "doc_id", id, id_len);
+
+	graph = fts_parse_sql(
+		&fts_table, info,
+		"BEGIN "
+		"UPDATE %s SET value = :doc_id"
+		" WHERE key = 'synced_doc_id';");
+
+	error = fts_eval_sql(trx, graph);
+
+	fts_que_graph_free_check_lock(&fts_table, NULL, graph);
+
+	if (local_trx) {
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+			cache->synced_doc_id = doc_id;
+		} else {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Error: (%lu) "
+				"while updating last doc id.\n", error);
+
+			fts_sql_rollback(trx);
+		}
+		trx_free_for_background(trx);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Create a new fts_doc_ids_t.
+@return new fts_doc_ids_t */
+UNIV_INTERN
+fts_doc_ids_t*
+fts_doc_ids_create(void)
+/*====================*/
+{
+	fts_doc_ids_t*	fts_doc_ids;
+	mem_heap_t*	heap = mem_heap_create(512);
+
+	fts_doc_ids = static_cast<fts_doc_ids_t*>(
+		mem_heap_alloc(heap, sizeof(*fts_doc_ids)));
+
+	fts_doc_ids->self_heap = ib_heap_allocator_create(heap);
+
+	fts_doc_ids->doc_ids = static_cast<ib_vector_t*>(ib_vector_create(
+		fts_doc_ids->self_heap, sizeof(fts_update_t), 32));
+
+	return(fts_doc_ids);
+}
+
+/*********************************************************************//**
+Free a fts_doc_ids_t. */
+
+void
+fts_doc_ids_free(
+/*=============*/
+	fts_doc_ids_t*	fts_doc_ids)
+{
+	mem_heap_t*	heap = static_cast<mem_heap_t*>(
+		fts_doc_ids->self_heap->arg);
+
+	memset(fts_doc_ids, 0, sizeof(*fts_doc_ids));
+
+	mem_heap_free(heap);
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the insertion of a new row.
+@return DB_SUCCESS or error code */
+static
+ulint
+fts_add(
+/*====*/
+	fts_trx_table_t*ftt,			/*!< in: FTS trx table */
+	fts_trx_row_t*	row)			/*!< in: row */
+{
+	dict_table_t*	table = ftt->table;
+	ulint		error = DB_SUCCESS;
+	doc_id_t	doc_id = row->doc_id;
+
+	ut_a(row->state == FTS_INSERT || row->state == FTS_MODIFY);
+
+	fts_add_doc_by_id(ftt, doc_id, row->fts_indexes);
+
+	if (error == DB_SUCCESS) {
+		mutex_enter(&table->fts->cache->deleted_lock);
+		++table->fts->cache->added;
+		mutex_exit(&table->fts->cache->deleted_lock);
+
+		if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+		    && doc_id >= table->fts->cache->next_doc_id) {
+			table->fts->cache->next_doc_id = doc_id + 1;
+		}
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the deletion of a row.
+@return DB_SUCCESS or error code */
+static
+ulint
+fts_delete(
+/*=======*/
+	fts_trx_table_t*ftt,			/*!< in: FTS trx table */
+	fts_trx_row_t*	row)			/*!< in: row */
+{
+	que_t*		graph;
+	fts_table_t	fts_table;
+	ulint		error = DB_SUCCESS;
+	doc_id_t	write_doc_id;
+	dict_table_t*	table = ftt->table;
+	doc_id_t	doc_id = row->doc_id;
+	trx_t*		trx = ftt->fts_trx->trx;
+	pars_info_t*	info = pars_info_create();
+	fts_cache_t*	cache = table->fts->cache;
+
+	/* we do not index Documents whose Doc ID value is 0 */
+	if (doc_id == FTS_NULL_DOC_ID) {
+		ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID));
+		return(error);
+	}
+
+	ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY);
+
+	FTS_INIT_FTS_TABLE(&fts_table, "DELETED", FTS_COMMON_TABLE, table);
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &write_doc_id, doc_id);
+	fts_bind_doc_id(info, "doc_id", &write_doc_id);
+
+	/* It is possible we update a record that has not yet been sync-ed
+	into cache from last crash (delete Doc will not initialize the
+	sync). Avoid any added counter accounting until the FTS cache
+	is re-established and sync-ed */
+	if (table->fts->fts_status & ADDED_TABLE_SYNCED
+	    && doc_id > cache->synced_doc_id) {
+		mutex_enter(&table->fts->cache->deleted_lock);
+
+		/* The Doc ID could belong to those left in
+		ADDED table from last crash. So need to check
+		if it is less than first_doc_id when we initialize
+		the Doc ID system after reboot */
+		if (doc_id >= table->fts->cache->first_doc_id
+		    && table->fts->cache->added > 0) {
+			--table->fts->cache->added;
+		}
+
+		mutex_exit(&table->fts->cache->deleted_lock);
+
+		/* Only if the row was really deleted. */
+		ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY);
+	}
+
+	/* Note the deleted document for OPTIMIZE to purge. */
+	if (error == DB_SUCCESS) {
+
+		trx->op_info = "adding doc id to FTS DELETED";
+
+		info->graph_owns_us = TRUE;
+
+		fts_table.suffix = "DELETED";
+
+		graph = fts_parse_sql(
+			&fts_table,
+			info,
+			"BEGIN INSERT INTO %s VALUES (:doc_id);");
+
+		error = fts_eval_sql(trx, graph);
+
+		fts_que_graph_free(graph);
+	} else {
+		pars_info_free(info);
+	}
+
+	/* Increment the total deleted count, this is used to calculate the
+	number of documents indexed. */
+	if (error == DB_SUCCESS) {
+		mutex_enter(&table->fts->cache->deleted_lock);
+
+		++table->fts->cache->deleted;
+
+		mutex_exit(&table->fts->cache->deleted_lock);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the modification of a row.
+@return DB_SUCCESS or error code */
+static
+ulint
+fts_modify(
+/*=======*/
+	fts_trx_table_t*	ftt,		/*!< in: FTS trx table */
+	fts_trx_row_t*		row)		/*!< in: row */
+{
+	ulint			error;
+
+	ut_a(row->state == FTS_MODIFY);
+
+	error = fts_delete(ftt, row);
+
+	if (error == DB_SUCCESS) {
+		error = fts_add(ftt, row);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Create a new document id.
+@return DB_SUCCESS if all went well else error */
+UNIV_INTERN
+ulint
+fts_create_doc_id(
+/*==============*/
+	dict_table_t*	table,		/*!< in: row is of this table. */
+	dtuple_t*	row,		/* in/out: add doc id value to this
+					row. This is the current row that is
+					being inserted. */
+	mem_heap_t*	heap)		/*!< in: heap */
+{
+	doc_id_t	doc_id;
+	ulint		error = DB_SUCCESS;
+
+	ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+	if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+		if (table->fts->cache->first_doc_id == FTS_NULL_DOC_ID) {
+			error = fts_get_next_doc_id(table, &doc_id);
+		}
+		return(error);
+	}
+
+	error = fts_get_next_doc_id(table, &doc_id);
+
+	if (error == DB_SUCCESS) {
+		dfield_t*	dfield;
+		doc_id_t*	write_doc_id;
+
+		ut_a(doc_id > 0);
+
+		dfield = dtuple_get_nth_field(row, table->fts->doc_col);
+		write_doc_id = static_cast<doc_id_t*>(
+			mem_heap_alloc(heap, sizeof(*write_doc_id)));
+
+		ut_a(doc_id != FTS_NULL_DOC_ID);
+		ut_a(sizeof(doc_id) == dfield->type.len);
+		fts_write_doc_id((byte*) write_doc_id, doc_id);
+
+		dfield_set_data(dfield, write_doc_id, sizeof(*write_doc_id));
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+static
+ulint
+fts_commit_table(
+/*=============*/
+	fts_trx_table_t*	ftt)		/*!< in: FTS table to commit*/
+{
+	const ib_rbt_node_t*	node;
+	ib_rbt_t*		rows;
+	ulint			error = DB_SUCCESS;
+	fts_cache_t*		cache = ftt->table->fts->cache;
+	trx_t*			trx = trx_allocate_for_background();
+
+	rows = ftt->rows;
+
+	ftt->fts_trx->trx = trx;
+
+	if (cache->get_docs == NULL) {
+		rw_lock_x_lock(&cache->init_lock);
+		if (cache->get_docs == NULL) {
+			cache->get_docs = fts_get_docs_create(cache);
+		}
+		rw_lock_x_unlock(&cache->init_lock);
+	}
+
+	for (node = rbt_first(rows);
+	     node != NULL && error == DB_SUCCESS;
+	     node = rbt_next(rows, node)) {
+
+		fts_trx_row_t*	row = rbt_value(fts_trx_row_t, node);
+
+		switch (row->state) {
+		case FTS_INSERT:
+			error = fts_add(ftt, row);
+			break;
+
+		case FTS_MODIFY:
+			error = fts_modify(ftt, row);
+			break;
+
+		case FTS_DELETE:
+			error = fts_delete(ftt, row);
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	fts_sql_commit(trx);
+
+	trx_free_for_background(trx);
+
+	return(error);
+}
+
+/*********************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_commit(
+/*=======*/
+	trx_t*	trx)				/*!< in: transaction */
+{
+	const ib_rbt_node_t*	node;
+	ulint			error;
+	ib_rbt_t*		tables;
+	fts_savepoint_t*	savepoint;
+
+	savepoint = static_cast<fts_savepoint_t*>(
+		ib_vector_last(trx->fts_trx->savepoints));
+	tables = savepoint->tables;
+
+	for (node = rbt_first(tables), error = DB_SUCCESS;
+	     node != NULL && error == DB_SUCCESS;
+	     node = rbt_next(tables, node)) {
+
+		fts_trx_table_t**	ftt;
+
+		ftt = rbt_value(fts_trx_table_t*, node);
+
+		error = fts_commit_table(*ftt);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Create a new empty document.
+@return new document */
+UNIV_INTERN
+fts_doc_t*
+fts_doc_init(
+/*=========*/
+	fts_doc_t*	doc)			/*!< in: doc to initialize */
+{
+	mem_heap_t*	heap = mem_heap_create(32);
+
+	memset(doc, 0, sizeof(*doc));
+
+	doc->self_heap = ib_heap_allocator_create(heap);
+
+	return(doc);
+}
+
+/*********************************************************************//**
+Free document. */
+UNIV_INTERN
+void
+fts_doc_free(
+/*=========*/
+	fts_doc_t*	doc)			/*!< in: document */
+{
+	mem_heap_t*	heap = static_cast<mem_heap_t*>(doc->self_heap->arg);
+
+	if (doc->tokens) {
+		rbt_free(doc->tokens);
+	}
+
+#ifdef UNIV_DEBUG
+	memset(doc, 0, sizeof(*doc));
+#endif /* UNIV_DEBUG */
+
+	mem_heap_free(heap);
+}
+
+/*********************************************************************//**
+Callback function for fetch that stores a row id to the location pointed.
+The column's type must be DATA_FIXBINARY, DATA_BINARY_TYPE, length = 8.
+@return always returns NULL */
+UNIV_INTERN
+void*
+fts_fetch_row_id(
+/*=============*/
+	void*	row,				/*!< in: sel_node_t* */
+	void*	user_arg)			/*!< in: data pointer */
+{
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+
+	dfield_t*	dfield = que_node_get_val(node->select_list);
+	dtype_t*	type = dfield_get_type(dfield);
+	ulint		len = dfield_get_len(dfield);
+
+	ut_a(dtype_get_mtype(type) == DATA_FIXBINARY);
+	ut_a(dtype_get_prtype(type) & DATA_BINARY_TYPE);
+	ut_a(len == 8);
+
+	memcpy(user_arg, dfield_get_data(dfield), 8);
+
+	return(NULL);
+}
+
+/*********************************************************************//**
+Callback function for fetch that stores the text of an FTS document,
+converting each column to UTF-16.
+@return: always returns FALSE */
+UNIV_INTERN
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+	void*		row,			/*!< in: sel_node_t* */
+	void*		user_arg)		/*!< in: fts_doc_t* */
+{
+	que_node_t*	exp;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	fts_doc_t*	result_doc = static_cast<fts_doc_t*>(user_arg);
+	dfield_t*	dfield;
+	ulint		len;
+	ulint		doc_len;
+	fts_doc_t	doc;
+	CHARSET_INFO*	doc_charset = NULL;
+	ulint		field_no = 0;
+
+	len = 0;
+
+	fts_doc_init(&doc);
+	doc.found = TRUE;
+
+	exp = node->select_list;
+	doc_len = 0;
+
+	doc_charset  = result_doc->charset;
+
+	/* Copy each indexed column content into doc->text.f_str */
+	while (exp) {
+		dfield = que_node_get_val(exp);
+		len = dfield_get_len(dfield);
+
+		/* NULL column */
+		if (len == UNIV_SQL_NULL) {
+			exp = que_node_get_next(exp);
+			continue;
+		}
+
+		if (!doc_charset) {
+			ulint   prtype = dfield->type.prtype;
+			doc_charset = innobase_get_fts_charset(
+					(int)(prtype & DATA_MYSQL_TYPE_MASK),
+					(uint) dtype_get_charset_coll(prtype));
+		}
+
+		doc.charset = doc_charset;
+
+		if (dfield_is_ext(dfield)) {
+			/* We ignore columns that are stored externally, this
+			could result in too many words to search */
+			exp = que_node_get_next(exp);
+			continue;
+		} else {
+			doc.text.f_n_char = 0;
+
+			doc.text.f_str = static_cast<byte*>(
+				dfield_get_data(dfield));
+
+			doc.text.f_len = len;
+		}
+
+		if (field_no == 0) {
+			fts_tokenize_document(&doc, result_doc);
+		} else {
+			fts_tokenize_document_next(&doc, doc_len, result_doc);
+		}
+
+		exp = que_node_get_next(exp);
+
+		doc_len += (exp) ? len + 1 : len;
+
+		field_no++;
+	}
+
+	ut_ad(doc_charset);
+
+	if (!result_doc->charset) {
+		result_doc->charset = doc_charset;
+	}
+
+	fts_doc_free(&doc);
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+fetch and tokenize the document. */
+static
+void
+fts_fetch_doc_from_rec(
+/*===================*/
+	fts_get_doc_t*  get_doc,	/*!< in: FTS index's get_doc struct */
+	dict_index_t*	clust_index,	/*!< in: cluster index */
+	btr_pcur_t*	pcur,		/*!< in: cursor whose position
+					has been stored */
+	ulint*		offsets,	/*!< in: offsets */
+	fts_doc_t*	doc)		/*!< out: fts doc to hold parsed
+					documents */
+{
+	dict_index_t*		index;
+	dict_table_t*		table;
+	const rec_t*		clust_rec;
+	ulint			num_field;
+	const dict_field_t*	ifield;
+	const dict_col_t*	col;
+	ulint			clust_pos;
+	ulint			i;
+	ulint			doc_len = 0;
+	ulint			processed_doc = 0;
+
+	if (!get_doc) {
+		return;
+	}
+
+	index = get_doc->index_cache->index;
+	table = get_doc->index_cache->index->table;
+
+	clust_rec = btr_pcur_get_rec(pcur);
+
+	num_field = dict_index_get_n_fields(index);
+
+	for (i = 0; i < num_field; i++) {
+		ifield = dict_index_get_nth_field(index, i);
+		col = dict_field_get_col(ifield);
+		clust_pos = dict_col_get_clust_pos(col, clust_index);
+
+		if (!get_doc->index_cache->charset) {
+			ulint   prtype = ifield->col->prtype;
+
+			get_doc->index_cache->charset =
+				innobase_get_fts_charset(
+					(int) (prtype & DATA_MYSQL_TYPE_MASK),
+					(uint) dtype_get_charset_coll(prtype));
+		}
+
+		if (rec_offs_nth_extern(offsets, clust_pos)) {
+			doc->text.f_str =
+				btr_rec_copy_externally_stored_field(
+					clust_rec, offsets,
+					dict_table_zip_size(table),
+					clust_pos, &doc->text.f_len,
+					static_cast<mem_heap_t*>(
+						doc->self_heap->arg));
+		} else {
+			doc->text.f_str = (byte*) rec_get_nth_field(
+				clust_rec, offsets, clust_pos,
+				&doc->text.f_len);
+		}
+
+		doc->found = TRUE;
+		doc->charset = get_doc->index_cache->charset;
+
+		/* Null Field */
+		if (doc->text.f_len == UNIV_SQL_NULL) {
+			continue;
+		}
+
+		if (processed_doc == 0) {
+			fts_tokenize_document(doc, NULL);
+		} else {
+			fts_tokenize_document_next(doc, doc_len, NULL);
+		}
+
+		processed_doc++;
+		doc_len += doc->text.f_len + 1;
+	}
+}
+
+/*********************************************************************//**
+This function fetches the document inserted during the committing
+transaction, and tokenize the inserted text data and insert into
+FTS auxiliary table and its cache.
+@return TRUE if successful */
+static
+ulint
+fts_add_doc_by_id(
+/*==============*/
+	fts_trx_table_t*ftt,		/*!< in: FTS trx table */
+	doc_id_t	doc_id,		/*!< in: doc id */
+	ib_vector_t*	fts_indexes __attribute__((unused)))
+					/*!< in: affected fts indexes */
+{
+	mtr_t		mtr;
+	mem_heap_t*	heap;
+	btr_pcur_t	pcur;
+	dict_table_t*	table;
+	dtuple_t*	tuple;
+	dfield_t*       dfield;
+	fts_get_doc_t*	get_doc;
+	doc_id_t        temp_doc_id;
+	dict_index_t*   clust_index;
+	dict_index_t*	fts_id_index;
+	ibool		is_id_cluster;
+	fts_cache_t*   	cache = ftt->table->fts->cache;
+
+	ut_ad(cache->get_docs);
+
+	/* If Doc ID has been supplied by the user, then the table
+	might not yet be sync-ed */
+
+	if (!(ftt->table->fts->fts_status & ADDED_TABLE_SYNCED)) {
+		fts_init_index(ftt->table, FALSE);
+	}
+
+	/* Get the first FTS index's get_doc */
+	get_doc = static_cast<fts_get_doc_t*>(
+		ib_vector_get(cache->get_docs, 0));
+	ut_ad(get_doc);
+
+	table = get_doc->index_cache->index->table;
+
+	heap = mem_heap_create(512);
+
+	clust_index = dict_table_get_first_index(table);
+	fts_id_index = dict_table_get_index_on_name(
+				table, FTS_DOC_ID_INDEX_NAME);
+
+	/* Check whether the index on FTS_DOC_ID is cluster index */
+	is_id_cluster = (clust_index == fts_id_index);
+
+	mtr_start(&mtr);
+	btr_pcur_init(&pcur);
+
+	/* Search based on Doc ID. Here, we'll need to consider the case
+	when there is no primary index on Doc ID */
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+	dfield->type.mtype = DATA_INT;
+	dfield->type.prtype = DATA_NOT_NULL | DATA_UNSIGNED | DATA_BINARY_TYPE;
+
+	mach_write_to_8((byte*) &temp_doc_id, doc_id);
+	dfield_set_data(dfield, &temp_doc_id, sizeof(temp_doc_id));
+
+	btr_pcur_open_with_no_init(
+		fts_id_index, tuple, PAGE_CUR_LE, BTR_SEARCH_LEAF,
+		&pcur, 0, &mtr);
+
+	/* If we have a match, add the data to doc structure */
+	if (btr_pcur_get_low_match(&pcur) == 1) {
+		const rec_t*	rec;
+		btr_pcur_t*	doc_pcur;
+		const rec_t*	clust_rec;
+		btr_pcur_t	clust_pcur;
+		ulint*		offsets = NULL;
+		ulint		num_idx = ib_vector_size(cache->get_docs);
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		/* Doc could be deleted */
+		if (page_rec_is_infimum(rec)
+		    || rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
+
+			goto func_exit;
+		}
+
+		if (is_id_cluster) {
+			clust_rec = rec;
+			doc_pcur = &pcur;
+		} else {
+			dtuple_t*	clust_ref;
+			ulint		n_fields;
+
+			btr_pcur_init(&clust_pcur);
+			n_fields = dict_index_get_n_unique(clust_index);
+
+			clust_ref = dtuple_create(heap, n_fields);
+			dict_index_copy_types(clust_ref, clust_index, n_fields);
+
+			row_build_row_ref_in_tuple(
+				clust_ref, rec, fts_id_index, NULL, NULL);
+
+			btr_pcur_open_with_no_init(
+				clust_index, clust_ref, PAGE_CUR_LE,
+				BTR_SEARCH_LEAF, &clust_pcur, 0, &mtr);
+
+			doc_pcur = &clust_pcur;
+			clust_rec = btr_pcur_get_rec(&clust_pcur);
+
+		}
+
+		offsets = rec_get_offsets(clust_rec, clust_index,
+					  NULL, ULINT_UNDEFINED, &heap);
+
+		 for (ulint i = 0; i < num_idx; ++i) {
+			fts_doc_t       doc;
+			dict_table_t*   table;
+			fts_get_doc_t*  get_doc;
+
+			get_doc = static_cast<fts_get_doc_t*>(
+				ib_vector_get(cache->get_docs, i));
+
+			table = get_doc->index_cache->index->table;
+
+			fts_doc_init(&doc);
+
+			fts_fetch_doc_from_rec(
+				get_doc, clust_index, doc_pcur, offsets, &doc);
+
+			if (doc.found) {
+				ibool	success __attribute__((unused));
+
+				btr_pcur_store_position(doc_pcur, &mtr);
+				mtr_commit(&mtr);
+
+				rw_lock_x_lock(&table->fts->cache->lock);
+
+				fts_cache_add_doc(
+					table->fts->cache,
+					get_doc->index_cache,
+					doc_id, doc.tokens);
+
+				rw_lock_x_unlock(&table->fts->cache->lock);
+
+				if (cache->total_size > fts_max_cache_size) {
+					fts_sync(cache->sync);
+				}
+
+				mtr_start(&mtr);
+
+				if (i < num_idx - 1) {
+
+					success = btr_pcur_restore_position(
+						BTR_SEARCH_LEAF, doc_pcur,
+						&mtr);
+
+					ut_ad(success);
+				}
+			}
+
+			fts_doc_free(&doc);
+		}
+
+		if (!is_id_cluster) {
+			btr_pcur_close(doc_pcur);
+		}
+	}
+func_exit:
+	mtr_commit(&mtr);
+
+	btr_pcur_close(&pcur);
+
+	mem_heap_free(heap);
+	return(TRUE);
+}
+
+
+/*********************************************************************//**
+Callback function to read a single ulint column.
+return always returns TRUE */
+static
+ibool
+fts_read_ulint(
+/*===========*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to ulint */
+{
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	ulint*		value = static_cast<ulint*>(user_arg);
+	que_node_t*	exp = sel_node->select_list;
+	dfield_t*	dfield = que_node_get_val(exp);
+	void*		data = dfield_get_data(dfield);
+
+	*value = static_cast<ulint>(mach_read_from_4(
+		static_cast<const byte*>(data)));
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+UNIV_INTERN
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+	dict_table_t*	table)		/*!< in: user table */
+{
+	dict_index_t*	index;
+	dict_field_t*	dfield __attribute__((unused)) = NULL;
+	doc_id_t	doc_id = 0;
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+
+	index = dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME);
+
+	if (!index) {
+		return(0);
+	}
+
+	dfield = dict_index_get_nth_field(index, 0);
+
+	ut_ad(innobase_strcasecmp(FTS_DOC_ID_COL_NAME, dfield->name) == 0);
+
+	mtr_start(&mtr);
+
+	/* fetch the largest indexes value */
+	btr_pcur_open_at_index_side(
+		FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+	if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
+		const rec_t*    rec = NULL;
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		ulint*		offsets = offsets_;
+		mem_heap_t*	heap = NULL;
+		ulint		len;
+		const void*	data;
+
+		rec_offs_init(offsets_);
+
+		do {
+			rec = btr_pcur_get_rec(&pcur);
+
+			if (page_rec_is_user_rec(rec)) {
+				break;
+			}
+		} while (btr_pcur_move_to_prev(&pcur, &mtr));
+
+		if (!rec) {
+			goto func_exit;
+		}
+
+		offsets = rec_get_offsets(
+			rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+		data = rec_get_nth_field(rec, offsets, 0, &len);
+
+		doc_id = static_cast<doc_id_t>(fts_read_doc_id(
+			static_cast<const byte*>(data)));
+	}
+
+func_exit:
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+	return(doc_id);
+}
+
+/*********************************************************************//**
+Fetch document with the given document id.
+@return DB_SUCCESS if OK else error */
+UNIV_INTERN
+ulint
+fts_doc_fetch_by_doc_id(
+/*====================*/
+	fts_get_doc_t*	get_doc,	/*!< in: state */
+	doc_id_t	doc_id,		/*!< in: id of document to
+					fetch */
+	dict_index_t*	index_to_use,	/*!< in: caller supplied FTS index */
+	ulint		option,		/*!< in: search option, if it is
+					greater than doc_id or equal */
+	fts_sql_callback
+			callback,	/*!< in: callback to read */
+	void*		arg)		/*!< in: callback arg */
+{
+	pars_info_t*	info;
+	ulint		error;
+	const char*	select_str;
+	doc_id_t	write_doc_id;
+	dict_index_t*	index;
+	trx_t*		trx = trx_allocate_for_background();
+	que_t*          graph;
+
+	trx->op_info = "fetching indexed FTS document";
+
+	/* The FTS index can be supplied by caller directly with
+	"index_to_use", otherwise, get it from "get_doc" */
+	index = (index_to_use) ? index_to_use : get_doc->index_cache->index;
+
+	if (get_doc && get_doc->get_document_graph) {
+		info = get_doc->get_document_graph->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &write_doc_id, doc_id);
+	fts_bind_doc_id(info, "doc_id", &write_doc_id);
+	pars_info_bind_function(info, "my_func", callback, arg);
+
+	select_str = fts_get_select_columns_str(index, info, info->heap);
+
+	if (!get_doc || !get_doc->get_document_graph) {
+		if (option == FTS_FETCH_DOC_BY_ID_EQUAL) {
+			graph = fts_parse_sql(
+				NULL,
+				info,
+				mem_heap_printf(info->heap,
+					"DECLARE FUNCTION my_func;\n"
+					"DECLARE CURSOR c IS"
+					" SELECT %s FROM %s"
+					" WHERE %s = :doc_id;\n"
+					"BEGIN\n"
+					""
+					"OPEN c;\n"
+					"WHILE 1 = 1 LOOP\n"
+					"  FETCH c INTO my_func();\n"
+					"  IF c %% NOTFOUND THEN\n"
+					"    EXIT;\n"
+					"  END IF;\n"
+					"END LOOP;\n"
+					"CLOSE c;",
+					select_str, index->table_name,
+					FTS_DOC_ID_COL_NAME));
+		} else {
+			ut_ad(option == FTS_FETCH_DOC_BY_ID_LARGE);
+
+			graph = fts_parse_sql(
+				NULL,
+				info,
+				mem_heap_printf(info->heap,
+					"DECLARE FUNCTION my_func;\n"
+					"DECLARE CURSOR c IS"
+					" SELECT %s, %s FROM %s"
+					" WHERE %s > :doc_id"
+					" ORDER BY %s;\n"
+					"BEGIN\n"
+					""
+					"OPEN c;\n"
+					"WHILE 1 = 1 LOOP\n"
+					"  FETCH c INTO my_func();\n"
+					"  IF c %% NOTFOUND THEN\n"
+					"    EXIT;\n"
+					"  END IF;\n"
+					"END LOOP;\n"
+					"CLOSE c;",
+					FTS_DOC_ID_COL_NAME,
+					select_str, index->table_name,
+					FTS_DOC_ID_COL_NAME,
+					FTS_DOC_ID_COL_NAME));
+		}
+		if (get_doc) {
+			get_doc->get_document_graph = graph;
+		}
+	} else {
+		graph = get_doc->get_document_graph;
+	}
+
+	error = fts_eval_sql(trx, graph);
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(trx);
+	} else {
+		fts_sql_rollback(trx);
+	}
+
+	trx_free_for_background(trx);
+
+	if (!get_doc) {
+		fts_que_graph_free(graph);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+UNIV_INTERN
+ulint
+fts_write_node(
+/*===========*/
+	trx_t*		trx,			/*!< in: transaction */
+	que_t**		graph,			/*!< in: query graph */
+	fts_table_t*	fts_table,		/*!< in: aux table */
+	fts_string_t*	word,			/*!< in: word in UTF-8 */
+	fts_node_t*	node)			/*!< in: node columns */
+{
+	pars_info_t*	info;
+	ulint		error;
+	ib_uint32_t	doc_count;
+	ib_time_t	start_time;
+	doc_id_t	last_doc_id;
+	doc_id_t	first_doc_id;
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	pars_info_bind_varchar_literal(info, "token", word->f_str, word->f_len);
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &first_doc_id, node->first_doc_id);
+	fts_bind_doc_id(info, "first_doc_id", &first_doc_id);
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &last_doc_id, node->last_doc_id);
+	fts_bind_doc_id(info, "last_doc_id", &last_doc_id);
+
+	ut_a(node->last_doc_id >= node->first_doc_id);
+
+	/* Convert to "storage" byte order. */
+	mach_write_to_4((byte*) &doc_count, node->doc_count);
+	pars_info_bind_int4_literal(
+		info, "doc_count", (const ib_uint32_t*) &doc_count);
+
+	/* Set copy_name to FALSE since it's a static. */
+	pars_info_bind_literal(
+		info, "ilist", node->ilist, node->ilist_size,
+		DATA_BLOB, DATA_BINARY_TYPE);
+
+	if (!*graph) {
+		*graph = fts_parse_sql(
+			fts_table,
+			info,
+			"BEGIN\n"
+			"INSERT INTO %s VALUES "
+			"(:token, :first_doc_id,"
+			" :last_doc_id, :doc_count, :ilist);");
+	}
+
+	start_time = ut_time();
+	error = fts_eval_sql(trx, *graph);
+	elapsed_time += ut_time() - start_time;
+	++n_nodes;
+
+	return(error);
+}
+
+/*********************************************************************//**
+Add rows to the DELETED_CACHE table.
+@return DB_SUCCESS if all went well else error code*/
+static
+ulint
+fts_sync_add_deleted_cache(
+/*=======================*/
+	fts_sync_t*	sync,			/*!< in: sync state */
+	ib_vector_t*	doc_ids)		/*!< in: doc ids to add */
+{
+	ulint		i;
+	pars_info_t*	info;
+	que_t*		graph;
+	fts_table_t	fts_table;
+	doc_id_t	dummy = 0;
+	ulint		error = DB_SUCCESS;
+	ulint		n_elems = ib_vector_size(doc_ids);
+
+	ut_a(ib_vector_size(doc_ids) > 0);
+
+	ib_vector_sort(doc_ids, fts_update_doc_id_cmp);
+
+	info = pars_info_create();
+
+	fts_bind_doc_id(info, "doc_id", &dummy);
+
+	FTS_INIT_FTS_TABLE(
+		&fts_table, "DELETED_CACHE", FTS_COMMON_TABLE, sync->table);
+
+	graph = fts_parse_sql(
+		&fts_table,
+		info,
+		"BEGIN INSERT INTO %s VALUES (:doc_id);");
+
+	for (i = 0; i < n_elems && error == DB_SUCCESS; ++i) {
+		fts_update_t*	update;
+		doc_id_t	write_doc_id;
+
+		update = static_cast<fts_update_t*>(ib_vector_get(doc_ids, i));
+
+		/* Convert to "storage" byte order. */
+		fts_write_doc_id((byte*) &write_doc_id, update->doc_id);
+		fts_bind_doc_id(info, "doc_id", &write_doc_id);
+
+		error = fts_eval_sql(sync->trx, graph);
+	}
+
+	fts_que_graph_free(graph);
+
+	return(error);
+}
+
+/*********************************************************************//**
+Write the words and ilist to disk.*/
+static
+ulint
+fts_sync_write_words(
+/*=================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_index_cache_t*
+			index_cache)		/*!< in: index cache */
+{
+	fts_table_t	fts_table;
+	ulint		n_nodes = 0;
+	ulint		n_words = 0;
+	const ib_rbt_node_t* rbt_node;
+	ulint		n_new_words = 0;
+	ulint		error = DB_SUCCESS;
+	ibool		print_error = FALSE;
+	dict_table_t*	table = index_cache->index->table;
+
+	FTS_INIT_INDEX_TABLE(
+		&fts_table, NULL, FTS_INDEX_TABLE, index_cache->index);
+
+	n_words = rbt_size(index_cache->words);
+
+	/* We iterate over the entire tree, even if there is an error,
+	since we want to free the memory used during caching. */
+	for (rbt_node = rbt_first(index_cache->words);
+	     rbt_node;
+	     rbt_node = rbt_first(index_cache->words)) {
+
+		ulint			i;
+		ulint			selected;
+		fts_tokenizer_word_t*	word;
+
+		word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+		selected = fts_select_index(
+			index_cache->charset, word->text.f_str,
+			word->text.f_len);
+
+		fts_table.suffix = fts_get_suffix(selected);
+
+		/* Check if the word exists in the FTS index and if not
+		then we need to increment the total word count stats. */
+		if (error == DB_SUCCESS) {
+			ibool	found = FALSE;
+
+			error = fts_is_word_in_index(
+				trx,
+				&index_cache->sel_graph[selected],
+				&fts_table,
+				&word->text, &found);
+
+			if (error == DB_SUCCESS && !found) {
+
+				++n_new_words;
+			}
+		}
+
+		n_nodes += ib_vector_size(word->nodes);
+
+		/* We iterate over all the nodes even if there was an error,
+		this is to free the memory of the fts_node_t elements. */
+		for (i = 0; i < ib_vector_size(word->nodes); ++i) {
+
+			fts_node_t* fts_node = static_cast<fts_node_t*>(
+				ib_vector_get(word->nodes, i));
+
+			if (error == DB_SUCCESS) {
+
+				error = fts_write_node(
+					trx,
+					&index_cache->ins_graph[selected],
+					&fts_table, &word->text, fts_node);
+			}
+
+			ut_free(fts_node->ilist);
+			fts_node->ilist = NULL;
+		}
+
+		if (error != DB_SUCCESS && !print_error) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Error (%lu) writing "
+				"word node to FTS auxiliary index "
+				"table.\n", error);
+
+			print_error = TRUE;
+		}
+
+		/* NOTE: We are responsible for free'ing the node */
+		ut_free(rbt_remove_node(index_cache->words, rbt_node));
+	}
+
+	if (error == DB_SUCCESS && n_new_words > 0) {
+		fts_table_t	fts_table;
+
+		FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+		/* Increment the total number of words in the FTS index */
+		fts_config_increment_index_value(
+			trx, index_cache->index, FTS_TOTAL_WORD_COUNT,
+			n_new_words);
+	}
+
+	printf("Avg number of nodes: %lf\n",
+	       (double) n_nodes / (double) (n_words > 1 ? n_words : 1));
+
+	return(error);
+}
+
+#ifdef FTS_DOC_STATS_DEBUG
+/*********************************************************************//**
+Write a single documents statistics to disk.
+@return DB_SUCCESS if all went well else error code */
+static
+ulint
+fts_sync_write_doc_stat(
+/*====================*/
+	trx_t*			trx,		/*!< in: transaction */
+	dict_index_t*		index,		/*!< in: index */
+	que_t**			graph,		/* out: query graph */
+	const fts_doc_stats_t*	doc_stat)	/*!< in: doc stats to write */
+{
+	pars_info_t*	info;
+	doc_id_t	doc_id;
+	ulint		error = DB_SUCCESS;
+	ib_uint32_t	word_count;
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	/* Convert to "storage" byte order. */
+	mach_write_to_4((byte*) &word_count, doc_stat->word_count);
+	pars_info_bind_int4_literal(
+		info, "count", (const ib_uint32_t*) &word_count);
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &doc_id, doc_stat->doc_id);
+	fts_bind_doc_id(info, "doc_id", &doc_id);
+
+	if (!*graph) {
+		fts_table_t	fts_table;
+
+		FTS_INIT_INDEX_TABLE(
+			&fts_table, "DOC_ID", FTS_INDEX_TABLE, index);
+
+		*graph = fts_parse_sql(
+			&fts_table,
+			info,
+			"BEGIN INSERT INTO %s VALUES (:doc_id, :count);");
+	}
+
+	for (;;) {
+		error = fts_eval_sql(trx, *graph);
+
+		if (error == DB_SUCCESS) {
+
+			break;				/* Exit the loop. */
+		} else {
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, "  InnoDB: Warning: lock wait "
+					"timeout writing to FTS doc_id. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, "  InnoDB: Error: %lu "
+					"while writing to FTS doc_id.\n",
+					error);
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Write document statistics to disk.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_sync_write_doc_stats(
+/*=====================*/
+	trx_t*			trx,		/*!< in: transaction */
+	const fts_index_cache_t*index_cache)	/*!< in: index cache */
+{
+	ulint		error = DB_SUCCESS;
+	que_t*		graph = NULL;
+	fts_doc_stats_t*  doc_stat;
+
+	if (ib_vector_is_empty(index_cache->doc_stats)) {
+		return(DB_SUCCESS);
+	}
+
+	doc_stat = static_cast<ts_doc_stats_t*>(
+		ib_vector_pop(index_cache->doc_stats));
+
+	while (doc_stat) {
+		error = fts_sync_write_doc_stat(
+			trx, index_cache->index, &graph, doc_stat);
+
+		if (error != DB_SUCCESS) {
+			break;
+		}
+
+		if (ib_vector_is_empty(index_cache->doc_stats)) {
+			break;
+		}
+
+		doc_stat = static_cast<ts_doc_stats_t*>(
+			ib_vector_pop(index_cache->doc_stats));
+	}
+
+	if (graph != NULL) {
+		fts_que_graph_free_check_lock(NULL, index_cache, graph);
+	}
+
+	return(error);
+}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+/*********************************************************************//**
+Callback to check the existince of a word.
+@return always return NULL */
+static
+ibool
+fts_lookup_word(
+/*============*/
+	void*	row,				/*!< in:  sel_node_t* */
+	void*	user_arg)			/*!< in:  fts_doc_t* */
+{
+
+	que_node_t*	exp;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	ibool*		found = static_cast<ibool*>(user_arg);
+
+	exp = node->select_list;
+
+	while (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		ulint		len = dfield_get_len(dfield);
+
+		if (len != UNIV_SQL_NULL && len != 0) {
+			*found = TRUE;
+		}
+
+		exp = que_node_get_next(exp);
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Check whether a particular word (term) exists in the FTS index. */
+static
+ulint
+fts_is_word_in_index(
+/*=================*/
+						/* out: DB_SUCCESS if all went
+						well else error code */
+	trx_t*		trx,			/*!< in: FTS query state */
+	que_t**		graph,			/* out: Query graph */
+	fts_table_t*	fts_table,		/*!< in: table instance */
+	const fts_string_t*
+			word,			/*!< in: the word to check */
+	ibool*		found)			/* out: TRUE if exists */
+{
+	pars_info_t*	info;
+	ulint		error;
+
+	trx->op_info = "looking up word in FTS index";
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	pars_info_bind_function(info, "my_func", fts_lookup_word, found);
+	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+	if (*graph == NULL) {
+		*graph = fts_parse_sql(
+			fts_table,
+			info,
+			"DECLARE FUNCTION my_func;\n"
+			"DECLARE CURSOR c IS"
+			" SELECT doc_count\n"
+			" FROM %s\n"
+			" WHERE word = :word "
+			" ORDER BY first_doc_id;\n"
+			"BEGIN\n"
+			"\n"
+			"OPEN c;\n"
+			"WHILE 1 = 1 LOOP\n"
+			"  FETCH c INTO my_func();\n"
+			"  IF c % NOTFOUND THEN\n"
+			"    EXIT;\n"
+			"  END IF;\n"
+			"END LOOP;\n"
+			"CLOSE c;");
+	}
+
+	for (;;) {
+		error = fts_eval_sql(trx, *graph);
+
+		if (error == DB_SUCCESS) {
+
+			break;				/* Exit the loop. */
+		} else {
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, "  InnoDB: Warning: lock wait "
+					"timeout reading FTS index. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, "  InnoDB: Error: %lu "
+					"while reading FTS index.\n", error);
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Begin Sync, create transaction, acquire locks, etc. */
+static
+void
+fts_sync_begin(
+/*===========*/
+	fts_sync_t*	sync)			/*!< in: sync state */
+{
+	fts_cache_t*	cache = sync->table->fts->cache;
+
+	n_nodes = 0;
+	elapsed_time = 0;
+
+	sync->start_time = ut_time();
+
+	sync->trx = trx_allocate_for_background();
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, "  SYNC deleted count: %ld size: %lu bytes\n",
+		ib_vector_size(cache->deleted_doc_ids), cache->total_size);
+}
+
+/*********************************************************************//**
+Run SYNC on the table, i.e., write out data from the index specific
+cache to the FTS aux INDEX table and FTS aux doc id stats table. */
+static
+ulint
+fts_sync_index(
+/*===========*/
+						/* out: DB_SUCCESS if all OK */
+	fts_sync_t*		sync,		/*!< in: sync state */
+	fts_index_cache_t*	index_cache)	/*!< in: index cache */
+{
+	trx_t*		trx = sync->trx;
+	ulint		error = DB_SUCCESS;
+
+	trx->op_info = "doing SYNC index";
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, "  SYNC words: %ld\n", rbt_size(index_cache->words));
+
+	ut_ad(rbt_validate(index_cache->words));
+
+	error = fts_sync_write_words(trx, index_cache);
+
+#ifdef FTS_DOC_STATS_DEBUG
+	/* FTS_RESOLVE: the word counter info in auxiliary table "DOC_ID"
+	is not used currently for ranking. We disable fts_sync_write_doc_stats()
+	for now */
+	/* Write the per doc statistics that will be used for ranking. */
+	if (error == DB_SUCCESS) {
+
+		error = fts_sync_write_doc_stats(trx, index_cache);
+	}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+	return(error);
+}
+
+/*********************************************************************//**
+Commit the SYNC, change state of processed doc ids etc.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_sync_commit(
+/*============*/
+	fts_sync_t*	sync)			/*!< in: sync state */
+{
+	ulint		error;
+	trx_t*		trx = sync->trx;
+	fts_cache_t*	cache = sync->table->fts->cache;
+	doc_id_t	last_doc_id;
+
+	trx->op_info = "doing SYNC commit";
+
+	/* After each Sync, update the CONFIG table about the max doc id
+	we just sync-ed to index table */
+	error = fts_cmp_set_sync_doc_id(sync->table, sync->max_doc_id, FALSE,
+					&last_doc_id);
+
+	/* Get the list of deleted documents that are either in the
+	cache or were headed there but were deleted before the add
+	thread got to them. */
+
+	if (error == DB_SUCCESS && ib_vector_size(cache->deleted_doc_ids) > 0) {
+
+		error = fts_sync_add_deleted_cache(
+			sync, cache->deleted_doc_ids);
+	}
+
+	/* We need to do this within the deleted lock since fts_delete() can
+	attempt to add a deleted doc id to the cache deleted id array. Set
+	the shutdown flag to FALSE, signifying that we don't want to release
+	all resources. */
+	fts_cache_clear(cache, FALSE);
+	fts_cache_init(cache);
+	rw_lock_x_unlock(&cache->lock);
+
+	if (error == DB_SUCCESS) {
+
+		fts_sql_commit(trx);
+
+	} else if (error != DB_SUCCESS) {
+
+		fts_sql_rollback(trx);
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error: (%lu) during SYNC.\n", error);
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, "  InnoDB: SYNC time : %lusecs: elapsed %lf ins/sec\n",
+		(ulong) (ut_time() - sync->start_time),
+		(double) n_nodes/ (double) elapsed_time);
+
+	trx_free_for_background(trx);
+
+	return(error);
+}
+
+/*********************************************************************//**
+Rollback a sync operation */
+static
+void
+fts_sync_rollback(
+/*==============*/
+	fts_sync_t*	sync)			/*!< in: sync state */
+{
+	trx_t*		trx = sync->trx;
+	fts_cache_t*	cache = sync->table->fts->cache;
+
+	rw_lock_x_unlock(&cache->lock);
+
+	fts_sql_rollback(trx);
+	trx_free_for_background(trx);
+}
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_sync(
+/*=====*/
+	fts_sync_t*	sync)		/*!< in: sync state */
+{
+	ulint		i;
+	ulint		error = DB_SUCCESS;
+	fts_cache_t*	cache = sync->table->fts->cache;
+
+	rw_lock_x_lock(&cache->lock);
+
+	fts_sync_begin(sync);
+
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		error = fts_sync_index(sync, index_cache);
+
+		if (error != DB_SUCCESS && !sync->interrupted) {
+
+			break;
+		}
+	}
+
+	if (error == DB_SUCCESS && !sync->interrupted) {
+		error = fts_sync_commit(sync);
+	}  else {
+		fts_sync_rollback(sync);
+	}
+
+	/* We need to check whether an optimize is required, for that
+	we make copies of the two variables that control the trigger. These
+	variables can change behind our back and we don't want to hold the
+	lock for longer than is needed. */
+	mutex_enter(&cache->deleted_lock);
+
+	cache->added = 0;
+	cache->deleted = 0;
+
+	mutex_exit(&cache->deleted_lock);
+
+	return(error);
+}
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+fts_sync_table(
+/*===========*/
+	dict_table_t*	table)		/*!< in: table */
+{
+	ulint	error = DB_SUCCESS;
+
+	ut_ad(table->fts);
+
+	if (table->fts->cache) {
+		fts_sync(table->fts->cache->sync);
+	}
+
+	return(error);
+}
+
+/********************************************************************
+Process next token from document starting at the given position, i.e., add
+the token's start position to the token's list of positions. */
+static
+ulint
+fts_process_token(
+/*==============*/
+					/* out: number of characters
+					handled in this call */
+	fts_doc_t*	doc,		/* in/out: document to
+					tokenize */
+	fts_doc_t*	result,		/* out: if provided, save
+					result here */
+	ulint		start_pos,	/*!< in: start position in text */
+	ulint		add_pos)	/*!< in: add this position to all
+					tokens from this tokenization */
+{
+	ulint		ret;
+	fts_string_t	str;
+	ulint		offset = 0;
+	fts_doc_t*	result_doc;
+	byte		buf[FTS_MAX_WORD_LEN + 1];
+
+	str.f_str = buf;
+
+	/* Determine where to save the result. */
+	result_doc = (result) ? result : doc;
+
+	/* The length of a string in characters is set here only. */
+
+	ret = innobase_mysql_fts_get_token(
+		doc->charset, doc->text.f_str + start_pos,
+		doc->text.f_str + doc->text.f_len, &str, &offset);
+
+	/* Ignore string whose character number is less than
+	"fts_min_token_size" or more than "fts_max_token_size" */
+
+	if (str.f_n_char >= fts_min_token_size
+	    && str.f_n_char <= fts_max_token_size) {
+
+		mem_heap_t*	heap;
+		fts_string_t	t_str;
+		fts_token_t*	token;
+		ib_rbt_bound_t	parent;
+		ulint		newlen;
+
+		heap = static_cast<mem_heap_t*>(result_doc->self_heap->arg);
+
+		t_str.f_n_char = str.f_n_char;
+
+		t_str.f_len = str.f_len * doc->charset->casedn_multiply + 1;
+
+		t_str.f_str = static_cast<byte*>(
+			mem_heap_alloc(heap, t_str.f_len));
+
+		newlen = innobase_fts_casedn_str(
+			doc->charset, (char*) str.f_str, str.f_len,
+			(char*) t_str.f_str, t_str.f_len);
+
+		t_str.f_len = newlen;
+
+		/* Add the word to the document statistics. If the word
+		hasn't been seen before we create a new entry for it. */
+		if (rbt_search(result_doc->tokens, &parent, &t_str) != 0) {
+			fts_token_t	new_token;
+
+			new_token.text.f_len = newlen;
+			new_token.text.f_str = t_str.f_str;
+			new_token.text.f_n_char = t_str.f_n_char;
+
+			new_token.positions = ib_vector_create(
+				result_doc->self_heap, sizeof(ulint), 32);
+
+			ut_a(new_token.text.f_n_char >= fts_min_token_size);
+			ut_a(new_token.text.f_n_char <= fts_max_token_size);
+
+			parent.last = rbt_add_node(
+				result_doc->tokens, &parent, &new_token);
+
+			ut_ad(rbt_validate(result_doc->tokens));
+		}
+
+#ifdef	FTS_CHARSET_DEBUG
+		offset += start_pos + add_pos;
+#endif /* FTS_CHARSET_DEBUG */
+
+		offset += start_pos + ret - str.f_len + add_pos;
+
+		token = rbt_value(fts_token_t, parent.last);
+		ib_vector_push(token->positions, &offset);
+	}
+
+	return(ret);
+}
+
+/******************************************************************//**
+Tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document(
+/*==================*/
+	fts_doc_t*	doc,		/* in/out: document to
+					tokenize */
+	fts_doc_t*	result)		/* out: if provided, save
+					the result token here */
+{
+	ulint		inc;
+
+	ut_a(!doc->tokens);
+	ut_a(doc->charset);
+
+	doc->tokens = rbt_create_arg_cmp(
+		sizeof(fts_token_t), innobase_fts_text_cmp, doc->charset);
+
+	for (ulint i = 0; i < doc->text.f_len; i += inc) {
+		inc = fts_process_token(doc, result, i, 0);
+		ut_a(inc > 0);
+	}
+}
+
+/******************************************************************//**
+Continue to tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document_next(
+/*=======================*/
+	fts_doc_t*	doc,		/*!< in/out: document to
+					tokenize */
+	ulint		add_pos,	/*!< in: add this position to all
+					tokens from this tokenization */
+	fts_doc_t*	result)		/*!< out: if provided, save
+					the result token here */
+{
+	ulint		inc;
+
+	ut_a(doc->tokens);
+
+	for (ulint i = 0; i < doc->text.f_len; i += inc) {
+		inc = fts_process_token(doc, result, i, add_pos);
+		ut_a(inc > 0);
+	}
+}
+
+/********************************************************************
+Create the vector of fts_get_doc_t instances. */
+UNIV_INTERN
+ib_vector_t*
+fts_get_docs_create(
+/*================*/
+						/* out: vector of
+						fts_get_doc_t instances */
+	fts_cache_t*	cache)			/*!< in: fts cache */
+{
+	ulint		i;
+	ib_vector_t*	get_docs;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX));
+#endif
+	/* We need one instance of fts_get_doc_t per index. */
+	get_docs = ib_vector_create(
+		cache->self_heap, sizeof(fts_get_doc_t), 4);
+
+	/* Create the get_doc instance, we need one of these
+	per FTS index. */
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+
+		dict_index_t**	index;
+		fts_get_doc_t*	get_doc;
+
+		index = static_cast<dict_index_t**>(
+			ib_vector_get(cache->indexes, i));
+
+		get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_push(get_docs, NULL));
+
+		memset(get_doc, 0x0, sizeof(*get_doc));
+
+		get_doc->index_cache = fts_get_index_cache(cache, *index);
+
+		/* Must find the index cache. */
+		ut_a(get_doc->index_cache != NULL);
+	}
+
+	return(get_docs);
+}
+
+/********************************************************************
+Release any resources held by the fts_get_doc_t instances. */
+static
+void
+fts_get_docs_clear(
+/*===============*/
+	ib_vector_t*	get_docs)		/*!< in: Doc retrieval vector */
+{
+	ulint		i;
+
+	/* Release the get doc graphs if any. */
+	for (i = 0; i < ib_vector_size(get_docs); ++i) {
+
+		fts_get_doc_t*	get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_get(get_docs, i));
+
+		if (get_doc->get_document_graph != NULL) {
+
+			ut_a(get_doc->index_cache);
+
+			fts_que_graph_free(get_doc->get_document_graph);
+			get_doc->get_document_graph = NULL;
+		}
+	}
+}
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the ADDED and the CONFIG table
+@return initial Doc ID */
+UNIV_INTERN
+doc_id_t
+fts_init_doc_id(
+/*============*/
+	const dict_table_t*	table)		/*!< in: table */
+{
+	doc_id_t	max_doc_id = 0;
+
+	rw_lock_x_lock(&table->fts->cache->lock);
+
+	if (table->fts->cache->first_doc_id != FTS_NULL_DOC_ID) {
+		rw_lock_x_unlock(&table->fts->cache->lock);
+		return(0);
+	}
+
+	/* Then compare this value with the ID value stored in the CONFIG
+	table. The larger one will be our new initial Doc ID */
+	fts_cmp_set_sync_doc_id(table, 0, FALSE, &max_doc_id);
+
+	/* If DICT_TF2_FTS_ADD_DOC_ID is set, we are in the process of
+	creating index (and add doc id column. No need to recovery
+	documents */
+	if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		fts_init_index((dict_table_t*) table, TRUE);
+	}
+
+	table->fts->fts_status |= ADDED_TABLE_SYNCED;
+
+	table->fts->cache->first_doc_id = max_doc_id;
+
+	rw_lock_x_unlock(&table->fts->cache->lock);
+
+	ut_ad(max_doc_id > 0);
+
+	return(max_doc_id);
+}
+
+#ifdef FTS_MULT_INDEX
+/*********************************************************************//**
+Check if the index is in the affected set.
+@return TRUE if index is updated */
+static
+ibool
+fts_is_index_updated(
+/*=================*/
+	const ib_vector_t*	fts_indexes,	/*!< in: affected FTS indexes */
+	const fts_get_doc_t*	get_doc)	/*!< in: info for reading
+						document */
+{
+	ulint		i;
+	dict_index_t*	index = get_doc->index_cache->index;
+
+	for (i = 0; i < ib_vector_size(fts_indexes); ++i) {
+		const dict_index_t*	updated_fts_index;
+
+		updated_fts_index = static_cast<const dict_index_t*>(
+			ib_vector_getp_const(fts_indexes, i));
+
+		ut_a(updated_fts_index != NULL);
+
+		if (updated_fts_index == index) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+#endif
+
+/*********************************************************************//**
+Fetch COUNT(*) from specified table.
+@return the number of rows in the table */
+UNIV_INTERN
+ulint
+fts_get_rows_count(
+/*===============*/
+	fts_table_t*	fts_table)	/*!< in: fts table to read */
+{
+	trx_t*		trx;
+	pars_info_t*	info;
+	que_t*		graph;
+	ulint		error;
+	ulint		count = 0;
+
+	trx = trx_allocate_for_background();
+
+	trx->op_info = "fetching FT table rows count";
+
+	info = pars_info_create();
+
+	pars_info_bind_function(info, "my_func", fts_read_ulint, &count);
+
+	graph = fts_parse_sql(
+		fts_table,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT COUNT(*) "
+		" FROM %s;\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	for (;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+
+			break;				/* Exit the loop. */
+		} else {
+			fts_sql_rollback(trx);
+
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, "  InnoDB: Warning: lock wait "
+					"timeout reading FTS table. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, "  InnoDB: Error: %lu "
+					"while reading FTS table.\n",
+					error);
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	fts_que_graph_free(graph);
+
+	trx_free_for_background(trx);
+
+	return(count);
+}
+
+#ifdef FTS_CACHE_SIZE_DEBUG
+/*********************************************************************//**
+Read the max cache size parameter from the config table. */
+static
+void
+fts_update_max_cache_size(
+/*======================*/
+	fts_sync_t*	sync)			/*!< in: sync state */
+{
+	trx_t*		trx;
+	fts_table_t	fts_table;
+
+	trx = trx_allocate_for_background();
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, sync->table);
+
+	/* The size returned is in bytes. */
+	sync->max_cache_size = fts_get_max_cache_size(trx, &fts_table);
+
+	fts_sql_commit(trx);
+
+	trx_free_for_background(trx);
+}
+#endif
+
+/*********************************************************************//**
+Free the modified rows of a table. */
+UNIV_INLINE
+void
+fts_trx_table_rows_free(
+/*====================*/
+	ib_rbt_t*	rows)			/*!< in: rbt of rows to free */
+{
+	const ib_rbt_node_t*	node;
+
+	for (node = rbt_first(rows); node; node = rbt_first(rows)) {
+		fts_trx_row_t*	row;
+
+		row = rbt_value(fts_trx_row_t, node);
+
+		if (row->fts_indexes != NULL) {
+			/* This vector shouldn't be using the
+			heap allocator.  */
+			ut_a(row->fts_indexes->allocator->arg == NULL);
+
+			ib_vector_free(row->fts_indexes);
+			row->fts_indexes = NULL;
+		}
+
+		ut_free(rbt_remove_node(rows, node));
+	}
+
+	ut_a(rbt_empty(rows));
+	rbt_free(rows);
+}
+
+/*********************************************************************//**
+Free an FTS savepoint instance. */
+UNIV_INLINE
+void
+fts_savepoint_free(
+/*===============*/
+	fts_savepoint_t*	savepoint)	/*!< in: savepoint instance */
+{
+	const ib_rbt_node_t*	node;
+	ib_rbt_t*		tables = savepoint->tables;
+
+	/* Nothing to free! */
+	if (tables == NULL) {
+		return;
+	}
+
+	for (node = rbt_first(tables); node; node = rbt_first(tables)) {
+		fts_trx_table_t*	ftt;
+		fts_trx_table_t**	fttp;
+
+		fttp = rbt_value(fts_trx_table_t*, node);
+		ftt = *fttp;
+
+		/* This can be NULL if a savepoint was released. */
+		if (ftt->rows != NULL) {
+			fts_trx_table_rows_free(ftt->rows);
+			ftt->rows = NULL;
+		}
+
+		/* This can be NULL if a savepoint was released. */
+		if (ftt->added_doc_ids != NULL) {
+			fts_doc_ids_free(ftt->added_doc_ids);
+			ftt->added_doc_ids = NULL;
+		}
+
+		/* The default savepoint name must be NULL. */
+		if (ftt->docs_added_graph) {
+			fts_que_graph_free(ftt->docs_added_graph);
+		}
+
+		/* NOTE: We are responsible for free'ing the node */
+		ut_free(rbt_remove_node(tables, node));
+	}
+
+	ut_a(rbt_empty(tables));
+	rbt_free(tables);
+	savepoint->tables = NULL;
+}
+
+/*********************************************************************//**
+Free an FTS trx. */
+UNIV_INTERN
+void
+fts_trx_free(
+/*=========*/
+	fts_trx_t*	fts_trx)		/* in, own: FTS trx */
+{
+	ulint		i;
+
+	for (i = 0; i < ib_vector_size(fts_trx->savepoints); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(fts_trx->savepoints, i));
+
+		/* The default savepoint name must be NULL. */
+		if (i == 0) {
+			ut_a(savepoint->name == NULL);
+		}
+
+		fts_savepoint_free(savepoint);
+	}
+
+	for (i = 0; i < ib_vector_size(fts_trx->last_stmt); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(fts_trx->last_stmt, i));
+
+		/* The default savepoint name must be NULL. */
+		if (i == 0) {
+			ut_a(savepoint->name == NULL);
+		}
+
+		fts_savepoint_free(savepoint);
+	}
+
+	if (fts_trx->heap) {
+		mem_heap_free(fts_trx->heap);
+	}
+}
+
+/*********************************************************************//**
+Extract the doc id from the FTS hidden column.
+@return doc id that was extracted from rec */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+	dict_table_t*	table,			/*!< in: table */
+	dtuple_t*	row)			/*!< in: row whose FTS doc id we
+						want to extract.*/
+{
+	dfield_t*	field;
+	doc_id_t	doc_id = 0;
+
+	ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+	field = dtuple_get_nth_field(row, table->fts->doc_col);
+
+	ut_a(dfield_get_len(field) == sizeof(doc_id));
+	ut_a(dfield_get_type(field)->mtype == DATA_INT);
+
+	doc_id = fts_read_doc_id(
+		static_cast<const byte*>(dfield_get_data(field)));
+
+	return(doc_id);
+}
+
+/*********************************************************************//**
+Extract the doc id from the FTS hidden column.
+@return doc id that was extracted from rec */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_rec(
+/*====================*/
+	dict_table_t*	table,			/*!< in: table */
+	const rec_t*	rec,			/*!< in: rec */
+	mem_heap_t*	heap)			/*!< in: heap */
+{
+	ulint		len;
+	const byte*	data;
+	ulint		col_no;
+	ulint*		offsets;
+	doc_id_t	doc_id = 0;
+	dict_index_t*	clust_index;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+
+	ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+	offsets	= offsets_;
+	clust_index = dict_table_get_first_index(table);
+
+	offsets_[0] = UT_ARR_SIZE(offsets_);
+
+	offsets = rec_get_offsets(
+		rec, clust_index, offsets, ULINT_UNDEFINED, &heap);
+
+	col_no = dict_col_get_clust_pos(
+		&table->cols[table->fts->doc_col], clust_index);
+
+	/* We have no choice but to cast rec here :-( */
+	data = rec_get_nth_field((rec_t*) rec, offsets, col_no, &len);
+
+	ut_a(len == 8);
+	ut_a(len == sizeof(doc_id));
+	doc_id = (doc_id_t) mach_read_from_8(data);
+
+	return(doc_id);
+}
+
+/*********************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache else NULL */
+UNIV_INTERN
+const fts_index_cache_t*
+fts_find_index_cache(
+/*=================*/
+	const fts_cache_t*	cache,		/*!< in: cache to search */
+	const dict_index_t*	index)		/*!< in: index to search for */
+{
+	/* We cast away the const because our internal function, takes
+	non-const cache arg and returns a non-const pointer. */
+	return(fts_get_index_cache((fts_cache_t*) cache, index));
+}
+
+/*********************************************************************//**
+Search cache for word.
+@return the word node vector if found else NULL */
+UNIV_INTERN
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+	const fts_index_cache_t*index_cache,	/*!< in: cache to search */
+	const fts_string_t*	text)		/*!< in: word to search for */
+{
+	ib_rbt_bound_t		parent;
+	const ib_vector_t*	nodes = NULL;
+#ifdef UNIV_SYNC_DEBUG
+	dict_table_t*		table = index_cache->index->table;
+	fts_cache_t*		cache = table->fts->cache;
+
+	ut_ad(rw_lock_own((rw_lock_t*) &cache->lock, RW_LOCK_EX));
+#endif
+
+	/* Lookup the word in the rb tree */
+	if (rbt_search(index_cache->words, &parent, text) == 0) {
+		const fts_tokenizer_word_t*	word;
+
+		word = rbt_value(fts_tokenizer_word_t, parent.last);
+
+		nodes = word->nodes;
+	}
+
+	return(nodes);
+}
+
+/*********************************************************************//**
+Check cache for deleted doc id.
+@return TRUE if deleted */
+UNIV_INTERN
+ibool
+fts_cache_is_deleted_doc_id(
+/*========================*/
+	const fts_cache_t*	cache,		/*!< in: cache ito search */
+	doc_id_t		doc_id)		/*!< in: doc id to search for */
+{
+	ulint			i;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&cache->deleted_lock));
+#endif
+
+	for (i = 0; i < ib_vector_size(cache->deleted_doc_ids); ++i) {
+		const fts_update_t*	update;
+
+		update = static_cast<const fts_update_t*>(
+			ib_vector_get_const(cache->deleted_doc_ids, i));
+
+		if (doc_id == update->doc_id) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Append deleted doc ids to vector. */
+UNIV_INTERN
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+	const fts_cache_t*	cache,		/*!< in: cache to use */
+	ib_vector_t*		vector)		/*!< in: append to this vector */
+{
+	ulint			i;
+
+	mutex_enter((mutex_t*) &cache->deleted_lock);
+
+	for (i = 0; i < ib_vector_size(cache->deleted_doc_ids); ++i) {
+		fts_update_t*	update;
+
+		update = static_cast<fts_update_t*>(
+			ib_vector_get(cache->deleted_doc_ids, i));
+
+		ib_vector_push(vector, &update->doc_id);
+	}
+
+	mutex_exit((mutex_t*) &cache->deleted_lock);
+}
+
+/*********************************************************************//**
+Wait for the background thread to start. We poll to detect change
+of state, which is acceptable, since the wait should happen only
+once during startup.
+@return true if the thread started else FALSE (i.e timed out) */
+UNIV_INTERN
+ibool
+fts_wait_for_background_thread_to_start(
+/*====================================*/
+	dict_table_t*		table,		/*!< in: table to which the thread
+						is attached */
+	ulint			max_wait)	/*!< in: time in microseconds, if
+						set to 0 then it disables
+						timeout checking */
+{
+	ulint			count = 0;
+	ibool			done = FALSE;
+
+	ut_a(max_wait == 0 || max_wait >= FTS_MAX_BACKGROUND_THREAD_WAIT);
+
+	for (;;) {
+		fts_t*		fts = table->fts;
+
+		mutex_enter(&fts->bg_threads_mutex);
+
+		if (fts->fts_status & BG_THREAD_READY) {
+
+			done = TRUE;
+		}
+
+		mutex_exit(&fts->bg_threads_mutex);
+
+		if (!done) {
+			os_thread_sleep(FTS_MAX_BACKGROUND_THREAD_WAIT);
+
+			if (max_wait > 0) {
+
+				max_wait -= FTS_MAX_BACKGROUND_THREAD_WAIT;
+
+				/* We ignore the residual value. */
+				if (max_wait < FTS_MAX_BACKGROUND_THREAD_WAIT) {
+					break;
+				}
+			}
+
+			++count;
+		} else {
+			break;
+		}
+
+		if (count >= FTS_BACKGROUND_THREAD_WAIT_COUNT) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: Error the background thread "
+				"for the FTS table %s refuses to start\n",
+				table->name);
+
+			count = 0;
+		}
+	}
+
+	return(done);
+}
+
+/*********************************************************************//**
+Add the FTS document id hidden column. */
+UNIV_INTERN
+void
+fts_add_doc_id_column(
+/*==================*/
+	dict_table_t*	table)		/*!< in/out: Table with FTS index */
+{
+	dict_mem_table_add_col(
+		table,
+		table->heap,
+		FTS_DOC_ID_COL_NAME,
+		DATA_INT,
+		dtype_form_prtype(
+			DATA_NOT_NULL | DATA_UNSIGNED
+			| DATA_BINARY_TYPE | DATA_FTS_DOC_ID, 0),
+		sizeof(doc_id_t));
+	DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_HAS_DOC_ID);
+}
+
+/*********************************************************************//**
+Update the query graph with a new document id.
+@return Doc ID used */
+UNIV_INTERN
+doc_id_t
+fts_update_doc_id(
+/*==============*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	ufield,		/*!< out: update node */
+	doc_id_t*	next_doc_id)	/*!< in/out: buffer for writing */
+{
+	doc_id_t	doc_id;
+	ulint		error = DB_SUCCESS;
+
+	if (*next_doc_id) {
+		doc_id = *next_doc_id;
+	} else {
+		/* Get the new document id that will be added. */
+		error = fts_get_next_doc_id(table, &doc_id);
+	}
+
+	if (error == DB_SUCCESS) {
+		dict_index_t*	clust_index;
+
+		ufield->exp = NULL;
+
+		ufield->new_val.len = sizeof(doc_id);
+
+		clust_index = dict_table_get_first_index(table);
+
+		ufield->field_no = dict_col_get_clust_pos(
+			&table->cols[table->fts->doc_col], clust_index);
+
+		/* It is possible we update record that has
+		not yet be sync-ed from last crash. */
+
+		/* Convert to storage byte order. */
+		ut_a(doc_id != FTS_NULL_DOC_ID);
+		fts_write_doc_id((byte*) next_doc_id, doc_id);
+
+		ufield->new_val.data = next_doc_id;
+	}
+
+	return(doc_id);
+}
+
+/*********************************************************************//**
+Check if the table has an FTS index. This is the non-inline version
+of dict_table_has_fts_index().
+@return TRUE if table has an FTS index */
+UNIV_INTERN
+ibool
+fts_dict_table_has_fts_index(
+/*=========================*/
+	dict_table_t*	table)		/*!< in: table */
+{
+	return(dict_table_has_fts_index(table));
+}
+
+/*********************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+UNIV_INTERN
+fts_t*
+fts_create(
+/*=======*/
+	dict_table_t*	table)		/*!< in/out: table with FTS indexes */
+{
+	fts_t*		fts;
+	ib_alloc_t*	heap_alloc;
+	mem_heap_t*	heap;
+
+	ut_a(!table->fts);
+
+	heap = mem_heap_create(512);
+
+	fts = static_cast<fts_t*>(mem_heap_alloc(heap, sizeof(*fts)));
+
+	memset(fts, 0x0, sizeof(*fts));
+
+	fts->fts_heap = heap;
+
+	fts->doc_col = ULINT_UNDEFINED;
+
+	mutex_create(
+		fts_bg_threads_mutex_key, &fts->bg_threads_mutex,
+		SYNC_FTS_BG_THREADS);
+
+	heap_alloc = ib_heap_allocator_create(heap);
+	fts->indexes = ib_vector_create(heap_alloc, sizeof(dict_index_t*), 4);
+	dict_table_get_all_fts_indexes(table, fts->indexes);
+
+	return(fts);
+}
+
+/*********************************************************************//**
+Free the FTS resources. */
+UNIV_INTERN
+void
+fts_free(
+/*=====*/
+	dict_table_t*	table)	/*!< in/out: table with FTS indexes */
+{
+	fts_t*		fts = table->fts;
+
+	mutex_free(&fts->bg_threads_mutex);
+
+	ut_ad(!fts->add_wq);
+
+	if (fts->cache) {
+		fts_cache_clear(fts->cache, TRUE);
+		fts_cache_destroy(fts->cache);
+		fts->cache = NULL;
+	}
+
+	mem_heap_free(fts->fts_heap);
+
+	table->fts = NULL;
+}
+
+/*********************************************************************//**
+Signal FTS threads to initiate shutdown. */
+UNIV_INTERN
+void
+fts_start_shutdown(
+/*===============*/
+	dict_table_t*	table,		/*!< in: table with FTS indexes */
+	fts_t*		fts)		/*!< in: fts instance that needs
+					to be informed about shutdown */
+{
+	mutex_enter(&fts->bg_threads_mutex);
+
+	fts->fts_status |= BG_THREAD_STOP;
+
+	mutex_exit(&fts->bg_threads_mutex);
+
+}
+
+/*********************************************************************//**
+Wait for FTS threads to shutdown. */
+UNIV_INTERN
+void
+fts_shutdown(
+/*=========*/
+	dict_table_t*	table,		/*!< in: table with FTS indexes */
+	fts_t*		fts)		/*!< in: fts instance to shutdown */
+{
+	mutex_enter(&fts->bg_threads_mutex);
+
+	ut_a(fts->fts_status & BG_THREAD_STOP);
+
+	dict_table_wait_for_bg_threads_to_exit(table, 20000);
+
+	mutex_exit(&fts->bg_threads_mutex);
+}
+
+/*********************************************************************//**
+Take a FTS savepoint. */
+UNIV_INLINE
+void
+fts_savepoint_copy(
+/*===============*/
+	const fts_savepoint_t*	src,	/*!< in: source savepoint */
+	fts_savepoint_t*	dst)	/*!< out: destination savepoint */
+{
+	const ib_rbt_node_t*	node;
+	const ib_rbt_t*		tables;
+
+	tables = src->tables;
+
+	for (node = rbt_first(tables); node; node = rbt_next(tables, node)) {
+
+		fts_trx_table_t*	ftt_dst;
+		const fts_trx_table_t**	ftt_src;
+
+		ftt_src = rbt_value(const fts_trx_table_t*, node);
+
+		ftt_dst = fts_trx_table_clone(*ftt_src);
+
+		rbt_insert(dst->tables, &ftt_dst->table->id, &ftt_dst);
+	}
+}
+
+/*********************************************************************//**
+Take a FTS savepoint.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+void
+fts_savepoint_take(
+/*===============*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	name)		/*!< in: savepoint name */
+{
+	mem_heap_t*		heap;
+	fts_trx_t*		fts_trx;
+	fts_savepoint_t*	savepoint;
+	fts_savepoint_t*	last_savepoint;
+
+	ut_a(name != NULL);
+
+	fts_trx = trx->fts_trx;
+	heap = fts_trx->heap;
+
+	/* The implied savepoint must exist. */
+	ut_a(ib_vector_size(fts_trx->savepoints) > 0);
+
+	last_savepoint = static_cast<fts_savepoint_t*>(
+		ib_vector_last(fts_trx->savepoints));
+	savepoint = fts_savepoint_create(fts_trx->savepoints, name, heap);
+
+	if (last_savepoint->tables != NULL) {
+		fts_savepoint_copy(last_savepoint, savepoint);
+	}
+}
+
+/*********************************************************************//**
+Lookup a savepoint instance by name.
+@return ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+fts_savepoint_lookup(
+/*==================*/
+	ib_vector_t*	savepoints,	/*!< in: savepoints */
+	const char*	name)		/*!< in: savepoint name */
+{
+	ulint			i;
+
+	ut_a(ib_vector_size(savepoints) > 0);
+
+	for (i = 1; i < ib_vector_size(savepoints); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(savepoints, i));
+
+		if (strcmp(name, savepoint->name) == 0) {
+			return(i);
+		}
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/*********************************************************************//**
+Release the savepoint data identified by  name. All savepoints created
+after the named savepoint are also released.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+void
+fts_savepoint_release(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	name)		/*!< in: savepoint name */
+{
+	ulint			i;
+	fts_savepoint_t*	prev;
+	ib_vector_t*		savepoints;
+	ulint			top_of_stack = 0;
+
+	ut_a(name != NULL);
+
+	savepoints = trx->fts_trx->savepoints;
+
+	ut_a(ib_vector_size(savepoints) > 0);
+
+	prev = static_cast<fts_savepoint_t*>(
+		ib_vector_get(savepoints, top_of_stack));
+
+	/* Skip the implied savepoint (first element). */
+	for (i = 1; i < ib_vector_size(savepoints); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(savepoints, i));
+
+		/* Even though we release the resources that are part
+		of the savepoint, we don't (always) actually delete the
+		entry.  We simply set the savepoint name to NULL. Therefore
+		we have to skip deleted/released entries. */
+		if (savepoint->name != NULL
+		    && strcmp(name, savepoint->name) == 0) {
+
+			fts_savepoint_t*	last;
+			fts_savepoint_t		temp;
+
+			last = static_cast<fts_savepoint_t*>(
+				ib_vector_last(savepoints));
+
+			/* Swap the entries. */
+			memcpy(&temp, last, sizeof(temp));
+			memcpy(last, prev, sizeof(*last));
+			memcpy(prev, &temp, sizeof(prev));
+			break;
+
+		/* Track the previous savepoint instance that will
+		be at the top of the stack after the release. */
+		} else if (savepoint->name != NULL) {
+			/* We need to delete all entries
+			greater than this element. */
+			top_of_stack = i;
+
+			prev = savepoint;
+		}
+	}
+
+	/* Only if we found and element to release. */
+	if (i < ib_vector_size(savepoints)) {
+
+		ut_a(top_of_stack < ib_vector_size(savepoints));
+
+		/* Skip the implied savepoint. */
+		for (i = ib_vector_size(savepoints) - 1;
+		     i > top_of_stack;
+		     --i) {
+
+			fts_savepoint_t*	savepoint;
+
+			savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_get(savepoints, i));
+
+			/* Skip savepoints that were released earlier. */
+			if (savepoint->name != NULL) {
+				savepoint->name = NULL;
+				fts_savepoint_free(savepoint);
+			}
+
+			ib_vector_pop(savepoints);
+		}
+
+		/* Make sure we don't delete the implied savepoint. */
+		ut_a(ib_vector_size(savepoints) > 0);
+
+		/* This must hold. */
+		ut_a(ib_vector_size(savepoints) == (top_of_stack + 1));
+	}
+}
+
+/**********************************************************************//**
+Refresh last statement savepoint.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+	trx_t*			trx)	/*!< in: transaction */
+{
+
+	fts_trx_t*              fts_trx;
+	fts_savepoint_t*        savepoint;
+
+	fts_trx = trx->fts_trx;
+
+	savepoint = static_cast<fts_savepoint_t*>(
+		ib_vector_pop(fts_trx->last_stmt));
+	fts_savepoint_free(savepoint);
+
+	ut_ad(ib_vector_is_empty(fts_trx->last_stmt));
+	savepoint = fts_savepoint_create(fts_trx->last_stmt, NULL, NULL);
+}
+
+/********************************************************************
+Undo the Doc ID add/delete operations in last stmt */
+static
+void
+fts_undo_last_stmt(
+/*===============*/
+	fts_trx_table_t*	s_ftt,	/*!< in: Transaction FTS table */
+	fts_trx_table_t*	l_ftt)	/*!< in: last stmt FTS table */
+{
+	ib_rbt_t*		s_rows;
+	ib_rbt_t*		l_rows;
+	const ib_rbt_node_t*	node;
+
+	l_rows = l_ftt->rows;
+	s_rows = s_ftt->rows;
+
+	for (node = rbt_first(l_rows);
+	     node;
+	     node = rbt_next(l_rows, node)) {
+		fts_trx_row_t*	l_row = rbt_value(fts_trx_row_t, node);
+		ib_rbt_bound_t	parent;
+
+		rbt_search(s_rows, &parent, &(l_row->doc_id));
+
+		if (parent.result == 0) {
+			fts_trx_row_t*	s_row = rbt_value(
+				fts_trx_row_t, parent.last);
+
+			switch (l_row->state) {
+			case FTS_INSERT:
+				ut_free(rbt_remove_node(s_rows, parent.last));
+				break;
+
+			case FTS_DELETE:
+				if (s_row->state == FTS_NOTHING) {
+					s_row->state = FTS_INSERT;
+				} else if (s_row->state == FTS_DELETE) {
+					ut_free(rbt_remove_node(
+						s_rows, parent.last));
+				}
+				break;
+
+			/* FIXME: Check if FTS_MODIFY need to be addressed */
+			case FTS_MODIFY:
+			case FTS_NOTHING:
+				break;
+			default:
+				ut_error;
+			}
+		}
+	}
+}
+
+/**********************************************************************//**
+Rollback to savepoint indentified by name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+	trx_t*		trx)		/*!< in: transaction */
+{
+	ib_vector_t*		savepoints;
+	fts_savepoint_t*	savepoint;
+	fts_savepoint_t*	last_stmt;
+	fts_trx_t*		fts_trx;
+	ib_rbt_bound_t		parent;
+	const ib_rbt_node_t*    node;
+	ib_rbt_t*		l_tables;
+	ib_rbt_t*		s_tables;
+
+	fts_trx = trx->fts_trx;
+	savepoints = fts_trx->savepoints;
+
+	savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
+	last_stmt = static_cast<fts_savepoint_t*>(
+		ib_vector_last(fts_trx->last_stmt));
+
+	l_tables = last_stmt->tables;
+	s_tables = savepoint->tables;
+
+	for (node = rbt_first(l_tables);
+	     node;
+	     node = rbt_next(l_tables, node)) {
+
+		fts_trx_table_t**	l_ftt;
+
+		l_ftt = rbt_value(fts_trx_table_t*, node);
+
+		rbt_search_cmp(
+			s_tables, &parent, &(*l_ftt)->table->id,
+			fts_trx_table_id_cmp, NULL);
+
+		if (parent.result == 0) {
+			fts_trx_table_t**	s_ftt;
+
+			s_ftt = rbt_value(fts_trx_table_t*, parent.last);
+
+			fts_undo_last_stmt(*s_ftt, *l_ftt);
+		}
+	}
+}
+
+/**********************************************************************//**
+Rollback to savepoint indentified by name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+void
+fts_savepoint_rollback(
+/*===================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	name)		/*!< in: savepoint name */
+{
+	ulint		i;
+	ib_vector_t*	savepoints;
+
+	ut_a(name != NULL);
+
+	savepoints = trx->fts_trx->savepoints;
+
+	/* We pop all savepoints from the the top of the stack up to
+	and including the instance that was found. */
+	i = fts_savepoint_lookup(savepoints, name);
+
+	if (i != ULINT_UNDEFINED) {
+		fts_savepoint_t*	savepoint;
+
+		ut_a(i > 0);
+
+		while (ib_vector_size(savepoints) > i) {
+			fts_savepoint_t*	savepoint;
+
+			savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_pop(savepoints));
+
+			if (savepoint->name != NULL) {
+				/* Since name was allocated on the heap, the
+				memory will be released when the transaction
+				completes. */
+				savepoint->name = NULL;
+
+				fts_savepoint_free(savepoint);
+			}
+		}
+
+		/* Pop all a elements from the top of the stack that may
+		have been released. We have to be careful that we don't
+		delete the implied savepoint. */
+
+		for (savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_last(savepoints));
+		     ib_vector_size(savepoints) > 1
+		     && savepoint->name == NULL;
+		     savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_last(savepoints))) {
+
+			ib_vector_pop(savepoints);
+		}
+
+		/* Make sure we don't delete the implied savepoint. */
+		ut_a(ib_vector_size(savepoints) > 0);
+	}
+}
+
+/**********************************************************************//**
+Check if a table is an FTS auxiliary table name.
+@return TRUE if the name matches an auxiliary table name pattern */
+static
+ibool
+fts_is_aux_table_name(
+/*==================*/
+	fts_sys_table_t*table,		/*!< out: table info */
+	const char*	name,		/*!< in: table name */
+	ulint		len)		/*!< in: length of table name */
+{
+	const char*	ptr;
+	char*		end;
+	char		my_name[MAX_FULL_NAME_LEN + 1];
+
+	ut_ad(len <= MAX_FULL_NAME_LEN);
+	ut_memcpy(my_name, name, len);
+	my_name[len] = 0;
+	end = my_name + len;
+
+	ptr =  static_cast<const char*>(memchr(my_name, '/', len));
+
+	if (ptr != NULL) {
+		/* We will start the match after the '/' */
+		++ptr;
+		len = end - ptr;
+	}
+
+	/* All auxiliary tables are prefixed with "FTS_" and the name
+	length will be at the very least greater than 20 bytes. */
+	if (ptr != NULL && len > 20 && strncmp(ptr, "FTS_", 4) == 0) {
+		ulint		i;
+
+
+		/* Skip the prefix. */
+		ptr += 4;
+		len -= 4;
+
+		/* Try and read the table id. */
+		if (!fts_read_object_id(&table->parent_id, ptr)) {
+			return(FALSE);
+		}
+
+		/* Skip the table id. */
+		ptr = static_cast<const char*>(memchr(ptr, '_', len));
+
+		if (ptr == NULL) {
+			return(FALSE);
+		}
+
+		/* Skip the underscore. */
+		++ptr;
+		ut_a(end > ptr);
+		len = end - ptr;
+
+		/* First search the common table suffix array. */
+		for (i = 0; fts_common_tables[i] != NULL; ++i) {
+
+			if (strncmp(ptr, fts_common_tables[i], len) == 0) {
+				return(TRUE);
+			}
+		}
+
+		/* Try and read the index id. */
+		if (!fts_read_object_id(&table->index_id, ptr)) {
+			return(FALSE);
+		}
+
+		/* Skip the table id. */
+		ptr = static_cast<const char*>(memchr(ptr, '_', len));
+
+		if (ptr == NULL) {
+			return(FALSE);
+		}
+
+		/* Skip the underscore. */
+		++ptr;
+		ut_a(end > ptr);
+		len = end - ptr;
+
+		/* Search the FT index specific array. */
+		for (i = 0; fts_index_selector[i].value; ++i) {
+
+			if (strncmp(ptr, fts_get_suffix(i), len) == 0) {
+				return(TRUE);
+			}
+		}
+
+		/* Other FT index specific table(s). */
+		if (strncmp(ptr, "DOC_ID", len) == 0) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Callback function to read a single table ID column.
+@return Always return TRUE */
+static
+ibool
+fts_read_tables(
+/*============*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to ib_vector_t */
+{
+	int		i;
+	fts_sys_table_t*table;
+	mem_heap_t*	heap;
+	ibool		done = FALSE;
+	ib_vector_t*	tables = static_cast<ib_vector_t*>(user_arg);
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	que_node_t*	exp = sel_node->select_list;
+
+	/* Must be a heap allocated vector. */
+	ut_a(tables->allocator->arg != NULL);
+
+	/* We will use this heap for allocating strings. */
+	heap = static_cast<mem_heap_t*>(tables->allocator->arg);
+	table = static_cast<fts_sys_table_t*>(ib_vector_push(tables, NULL));
+
+	memset(table, 0x0, sizeof(*table));
+
+	/* Iterate over the columns and read the values. */
+	for (i = 0; exp && !done; exp = que_node_get_next(exp), ++i) {
+
+		dfield_t*	dfield = que_node_get_val(exp);
+		void*		data = dfield_get_data(dfield);
+		ulint		len = dfield_get_len(dfield);
+
+		ut_a(len != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT */
+		switch (i) {
+		case 0: /* NAME */
+
+			if (!fts_is_aux_table_name(
+				table, static_cast<const char*>(data), len)) {
+				ib_vector_pop(tables);
+				done = TRUE;
+				break;
+			}
+
+			table->name = static_cast<char*>(
+				mem_heap_dup(heap, data, len + 1));
+			table->name[len] = '\0';
+			printf("Found [%.*s]\n", (int) len, table->name);
+			break;
+
+		case 1: /* ID */
+			ut_a(len == 8);
+			table->id = mach_read_from_8(
+				static_cast<const byte*>(data));
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Check and drop all orphaned FTS auxiliary tables, those that don't have
+a parent table or FTS index defined on them.
+@return DB_SUCCESS or error code */
+static
+ulint
+fts_check_and_drop_orphaned_tables(
+/*===============================*/
+	trx_t*		trx,			/*!< in: transaction */
+	ib_vector_t*	tables)			/*!< in: tables to check */
+{
+	ulint		i;
+	ulint		error = DB_SUCCESS;
+
+	for (i = 0; i < ib_vector_size(tables); ++i) {
+		dict_table_t*		table;
+		fts_sys_table_t*	sys_table;
+		ibool			drop = FALSE;
+
+		sys_table = static_cast<fts_sys_table_t*>(
+			ib_vector_get(tables, i));
+
+		table = dict_table_open_on_id(sys_table->parent_id, FALSE);
+
+		if (table == NULL || table->fts == NULL) {
+
+			drop = TRUE;
+
+		} else if (sys_table->index_id != 0) {
+			ulint		j;
+			index_id_t	id;
+			fts_t*	fts;
+
+			drop = TRUE;
+			fts = table->fts;
+			id = sys_table->index_id;
+
+			/* Search for the FT index in the table's list. */
+			for (j = 0; j < ib_vector_size(fts->indexes); ++j) {
+				const dict_index_t*	index;
+
+				index = static_cast<const dict_index_t*>(
+					ib_vector_getp_const(fts->indexes, j));
+
+				if (index->id == id) {
+
+					drop = FALSE;
+					break;
+				}
+			}
+		}
+
+		if (table) {
+			dict_table_close(table, FALSE);
+		}
+
+		if (drop) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Warning: Parent table of "
+				"FT auxiliary table %s not found.\n",
+				sys_table->name);
+
+			/* We ignore drop errors. */
+			fts_drop_table(trx, sys_table->name);
+		}
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Drop all orphaned FTS auxiliary tables, those that don't have a parent
+table or FTS index defined on them. */
+UNIV_INTERN
+void
+fts_drop_orphaned_tables(void)
+/*==========================*/
+{
+	trx_t*		trx;
+	pars_info_t*	info;
+	mem_heap_t*	heap;
+	que_t*		graph;
+	ib_vector_t*	tables;
+	ib_alloc_t*	heap_alloc;
+	ulint		error = DB_SUCCESS;
+
+	heap = mem_heap_create(1024);
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	/* We store the table ids of all the FTS indexes that were found. */
+	tables = ib_vector_create(heap_alloc, sizeof(fts_sys_table_t), 128);
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "dropping orphaned FTS tables";
+	row_mysql_lock_data_dictionary(trx);
+
+	info = pars_info_create();
+
+	pars_info_bind_function(info, "my_func", fts_read_tables, tables);
+
+	graph = fts_parse_sql_no_dict_lock(
+		NULL,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT NAME, ID "
+		" FROM SYS_TABLES;\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	for (;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+			error = fts_check_and_drop_orphaned_tables(trx, tables);
+		}
+
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+			break;				/* Exit the loop. */
+		} else {
+			ib_vector_reset(tables);
+
+			fts_sql_rollback(trx);
+
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, "  InnoDB: Warning: lock wait "
+					"timeout reading SYS_TABLES. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, "  InnoDB: Error: %lu "
+					"while reading SYS_TABLES.\n",
+					error);
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	que_graph_free(graph);
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx_free_for_background(trx);
+
+	if (heap != NULL) {
+		mem_heap_free(heap);
+	}
+}
+
+/**********************************************************************//**
+Check whether user supplied stopword table is of the right format.
+Caller is responsible to hold dictionary locks.
+@return TRUE if the table qualifies */
+UNIV_INTERN
+ibool
+fts_valid_stopword_table(
+/*=====================*/
+	 const char*	stopword_table_name)	/*!< in: Stopword table
+						name */
+{
+	dict_table_t*	table;
+
+	if (!stopword_table_name) {
+		return(FALSE);
+	}
+
+	table = dict_table_get_low(stopword_table_name);
+
+	if (!table) {
+		fprintf(stderr,
+			"InnoDB: user stopword table %s does not exist.\n",
+			stopword_table_name);
+
+		return(FALSE);
+	} else {
+		dict_col_t*     col;
+		const char*     col_name;
+
+		col_name = dict_table_get_col_name(table, 0);
+
+		if (ut_strcmp(col_name, "value")) {
+			fprintf(stderr,
+				"InnoDB: invalid column name for stopword "
+				"table %s. Its first column must be named as "
+				"'value'.\n", stopword_table_name);
+
+			return(FALSE);
+		}
+
+		col = dict_table_get_nth_col(table, 0);
+
+		if (col->mtype != DATA_VARCHAR) {
+			fprintf(stderr,
+				"InnoDB: invalid column type for stopword "
+				"table %s. Its first column must be of "
+				"varchar type\n", stopword_table_name);
+
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+This function loads the stopword into the FTS cache. It also
+records/fetches stopword configuration to/from FTS configure
+table, depending on whether we are creating or reloading the
+FTS.
+@return TRUE if load operation is successful */
+UNIV_INTERN
+ibool
+fts_load_stopword(
+/*==============*/
+	const dict_table_t*
+			table,			/*!< in: Table with FTS */
+	trx_t*		trx,			/*!< in: Transactions */
+	const char*	global_stopword_table,	/*!< in: Global stopword table
+						name */
+	const char*	session_stopword_table,	/*!< in: Session stopword table
+						name */
+	ibool		stopword_is_on,		/*!< in: Whether stopword
+						option is turned on/off */
+	ibool		reload)			/*!< in: Whether it is
+						for reloading FTS table */
+{
+	fts_table_t	fts_table;
+	fts_string_t	str;
+	ulint		error = DB_SUCCESS;
+	ulint		use_stopword;
+	fts_cache_t*	cache;
+	const char*	stopword_to_use = NULL;
+	ibool		new_trx = FALSE;
+	byte		str_buffer[MAX_FULL_NAME_LEN + 1];
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, table);
+
+	cache = table->fts->cache;
+
+	if (!reload && !(cache->stopword_info.status
+			 & STOPWORD_NOT_INIT)) {
+		return(TRUE);
+	}
+
+	if (!trx) {
+		trx = trx_allocate_for_background();
+		trx->op_info = "upload FTS stopword";
+		new_trx = TRUE;
+	}
+
+	/* First check whether stopword filtering is turned off */
+	if (reload) {
+		error = fts_config_get_ulint(
+			trx, &fts_table, FTS_USE_STOPWORD, &use_stopword);
+	} else {
+		use_stopword = (ulint) stopword_is_on;
+
+		error = fts_config_set_ulint(
+			trx, &fts_table, FTS_USE_STOPWORD, use_stopword);
+	}
+
+	if (error != DB_SUCCESS) {
+		goto cleanup;
+	}
+
+	/* If stopword is turned off, no need to continue to load the
+	stopword into cache */
+	if (!use_stopword) {
+		cache->stopword_info.status = STOPWORD_OFF;
+		goto cleanup;
+	}
+
+	if (reload) {
+		/* Fetch the stopword table name from FTS config
+		table */
+		str.f_n_char = 0;
+		str.f_str = str_buffer;
+		str.f_len = sizeof(str_buffer) - 1;
+
+		error = fts_config_get_value(
+			trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+
+		if (error != DB_SUCCESS) {
+			goto cleanup;
+		}
+
+		if (strlen((char*) str.f_str) > 0) {
+			stopword_to_use = (const char*) str.f_str;
+		}
+	} else {
+		stopword_to_use = (session_stopword_table)
+			? session_stopword_table : global_stopword_table;
+	}
+
+	if (stopword_to_use
+	    && fts_load_user_stopword(table->fts, stopword_to_use,
+				      &cache->stopword_info)) {
+		/* Save the stopword table name to the configure
+		table */
+		if (!reload) {
+			str.f_n_char = 0;
+			str.f_str = (byte*) stopword_to_use;
+			str.f_len = ut_strlen(stopword_to_use);
+
+			error = fts_config_set_value(
+				trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+		}
+	} else {
+		/* Load system default stopword list */
+		fts_load_default_stopword(&cache->stopword_info);
+	}
+
+cleanup:
+	if (new_trx) {
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+		} else {
+			fts_sql_rollback(trx);
+		}
+
+		trx_free_for_background(trx);
+	}
+
+	return(error == DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Callback function when we initialize the FTS at the start up
+time. It recovers Doc IDs that have not sync-ed to the auxiliary
+table, and require to bring them back into FTS index.
+@return: always returns TRUE */
+static
+ibool
+fts_init_recover_doc(
+/*=================*/
+	void*	row,			/*!< in: sel_node_t* */
+	void*	user_arg)		/*!< in: fts cache */
+{
+
+	fts_doc_t       doc;
+	ulint		doc_len = 0;
+	ulint		field_no = 0;
+	ibool		has_fts = TRUE;
+	fts_get_doc_t*  get_doc = NULL;
+	doc_id_t	doc_id = FTS_NULL_DOC_ID;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	que_node_t*	exp = node->select_list;
+	fts_cache_t*    cache = static_cast<fts_cache_t*>(user_arg);
+
+	if (ib_vector_is_empty(cache->get_docs)) {
+		has_fts = FALSE;
+	} else {
+		get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_get(cache->get_docs, 0));
+
+		fts_doc_init(&doc);
+		doc.found = TRUE;
+	}
+
+	/* Copy each indexed column content into doc->text.f_str */
+	while (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		ulint		len = dfield_get_len(dfield);
+
+		if (field_no == 0) {
+			dtype_t*        type = dfield_get_type(dfield);
+			void*           data = dfield_get_data(dfield);
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+
+			doc_id = static_cast<doc_id_t>(mach_read_from_8(
+				static_cast<const byte*>(data)));
+
+			/* Just need to fetch the Doc ID */
+			if (!has_fts) {
+				goto func_exit;
+			}
+
+			field_no++;
+			exp = que_node_get_next(exp);
+			continue;
+		}
+
+		ut_a(has_fts);
+
+		if (len == UNIV_SQL_NULL) {
+			exp = que_node_get_next(exp);
+			continue;
+		}
+
+		ut_ad(get_doc);
+
+		if (!get_doc->index_cache->charset) {
+			ulint   prtype = dfield->type.prtype;
+
+			get_doc->index_cache->charset =
+				innobase_get_fts_charset(
+				(int)(prtype & DATA_MYSQL_TYPE_MASK),
+				(uint) dtype_get_charset_coll(prtype));
+		}
+
+		doc.charset = get_doc->index_cache->charset;
+
+		if (dfield_is_ext(dfield)) {
+			dict_table_t*	table = cache->sync->table;
+			ulint		zip_size = dict_table_zip_size(table);
+
+			doc.text.f_str = btr_copy_externally_stored_field(
+				&doc.text.f_len,
+				static_cast<byte*>(dfield_get_data(dfield)),
+				zip_size, len,
+				static_cast<mem_heap_t*>(doc.self_heap->arg));
+		} else {
+			doc.text.f_str = static_cast<byte*>(
+				dfield_get_data(dfield));
+
+			doc.text.f_len = len;
+		}
+
+		if (field_no == 1) {
+			fts_tokenize_document(&doc, NULL);
+		} else {
+			fts_tokenize_document_next(&doc, doc_len, NULL);
+		}
+
+		exp = que_node_get_next(exp);
+
+		doc_len += (exp) ? len + 1 : len;
+
+		field_no++;
+	}
+
+	fts_cache_add_doc(cache, get_doc->index_cache, doc_id, doc.tokens);
+
+	fts_doc_free(&doc);
+
+	cache->added++;
+
+func_exit:
+	if (doc_id >= cache->next_doc_id) {
+		cache->next_doc_id = doc_id + 1;
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+This function brings FTS index in sync when FTS index is first
+used. There are documents that have not yet sync-ed to auxiliary
+tables from last server abnormally shutdown, we will need to bring
+such document into FTS cache before any further operations
+@return TRUE if all OK */
+UNIV_INTERN
+ibool
+fts_init_index(
+/*===========*/
+	dict_table_t*	table,		/*!< in: Table with FTS */
+	ibool		has_cache_lock)	/*!< in: Whether we already have
+					cache lock */
+{
+	dict_index_t*   index;
+	doc_id_t        start_doc;
+	fts_get_doc_t*  get_doc = NULL;
+	ibool		has_fts = TRUE;
+	fts_cache_t*    cache = table->fts->cache;
+
+	/* First check cache->get_docs is initialized */
+	if (!has_cache_lock) {
+		rw_lock_x_lock(&cache->lock);
+	}
+
+	rw_lock_x_lock(&cache->init_lock);
+	if (cache->get_docs == NULL) {
+		cache->get_docs = fts_get_docs_create(cache);
+	}
+	rw_lock_x_unlock(&cache->init_lock);
+
+	if (table->fts->fts_status & ADDED_TABLE_SYNCED) {
+		goto func_exit;
+	}
+
+	start_doc = cache->synced_doc_id;
+
+	if (!start_doc) {
+		fts_cmp_set_sync_doc_id(table, 0, TRUE, &start_doc);
+		cache->synced_doc_id = start_doc;
+	}
+
+	/* No FTS index, this is the case when previous FTS index
+	dropped, and we re-initialize the Doc ID system for subsequent
+	insertion */
+	if (ib_vector_is_empty(cache->get_docs)) {
+		index = dict_table_get_first_index(table);
+		has_fts = FALSE;
+	} else {
+		/* We only have one FTS index per table */
+		get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_get(cache->get_docs, 0));
+
+		index = get_doc->index_cache->index;
+	}
+
+	fts_doc_fetch_by_doc_id(NULL, start_doc, index,
+				FTS_FETCH_DOC_BY_ID_LARGE,
+				fts_init_recover_doc, cache);
+
+	if (has_fts) {
+		if (table->fts->cache->stopword_info.status
+		    & STOPWORD_NOT_INIT) {
+			fts_load_stopword(table, NULL, NULL, NULL, TRUE, TRUE);
+		}
+
+		/* Register the table with the optimize thread. */
+		fts_optimize_add_table(table);
+	}
+
+	table->fts->fts_status |= ADDED_TABLE_SYNCED;
+
+	fts_get_docs_clear(cache->get_docs);
+
+func_exit:
+	if (!has_cache_lock) {
+		rw_lock_x_unlock(&cache->lock);
+	}
+
+	return(TRUE);
+}
diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
new file mode 100644
index 00000000000..92e040d2715
--- /dev/null
+++ b/storage/innobase/fts/fts0opt.cc
@@ -0,0 +1,3109 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0opt.cc
+Full Text Search optimize thread
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+
+***********************************************************************/
+
+#include "fts0fts.h"
+#include "row0sel.h"
+#include "que0types.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+#include "ut0wqueue.h"
+#include "srv0start.h"
+#include "zlib.h"
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+/* The FTS optimize thread's work queue. */
+static ib_wqueue_t* fts_optimize_wq;
+
+/* The number of document ids to delete in one statement. */
+static const ulint FTS_MAX_DELETE_DOC_IDS = 1000;
+
+/* Time to wait for a message. */
+static const ulint FTS_QUEUE_WAIT_IN_USECS = 5000000;
+
+/* Default optimize interval in secs. */
+static const ulint FTS_OPTIMIZE_INTERVAL_IN_SECS = 300;
+
+#if 0
+/* Check each table in round robin to see whether they'd
+need to be "optimized" */
+static	ulint	fts_optimize_sync_iterator = 0;
+#endif
+
+/** State of a table within the optimization sub system. */
+enum fts_state_enum {
+	FTS_STATE_LOADED,
+	FTS_STATE_RUNNING,
+	FTS_STATE_SUSPENDED,
+	FTS_STATE_DONE,
+	FTS_STATE_EMPTY
+};
+
+/** FTS optimize thread message types. */
+enum fts_msg_type_enum {
+	FTS_MSG_START,			/*!< Start optimizing thread */
+
+	FTS_MSG_PAUSE,			/*!< Pause optimizing thread */
+
+	FTS_MSG_STOP,			/*!< Stop optimizing and exit thread */
+
+	FTS_MSG_ADD_TABLE,		/*!< Add table to the optimize thread's
+					work queue */
+
+	FTS_MSG_OPTIMIZE_TABLE,		/*!< Optimize a table */
+
+	FTS_MSG_DEL_TABLE,		/*!< Remove a table from the optimize
+					threads work queue */
+};
+
+typedef enum fts_state_enum fts_state_t;
+typedef	struct fts_zip_struct fts_zip_t;
+typedef struct fts_msg_struct fts_msg_t;
+typedef struct fts_slot_struct fts_slot_t;
+typedef struct fts_encode_struct fts_encode_t;
+typedef enum fts_msg_type_enum fts_msg_type_t;
+typedef struct fts_msg_del_struct fts_msg_del_t;
+typedef struct fts_msg_stop_struct fts_msg_stop_t;
+typedef struct fts_optimize_struct fts_optimize_t;
+typedef struct fts_msg_optimize_struct fts_msg_optimize_t;
+typedef struct fts_optimize_graph_struct fts_optimize_graph_t;
+
+/** Compressed list of words that have been read from FTS INDEX
+that needs to be optimized. */
+struct fts_zip_struct {
+	ulint		status;		/*!< Status of (un)/zip operation */
+
+	ulint		n_words;	/*!< Number of words compressed */
+
+	ulint		block_sz;	/*!< Size of a block in bytes */
+
+	ib_vector_t*	blocks;		/*!< Vector of compressed blocks */
+
+	ib_alloc_t*	heap_alloc;	/*!< Heap to use for allocations */
+
+	ulint		pos;		/*!< Offset into blocks */
+
+	ulint		last_big_block;	/*!< Offset of last block in the
+					blocks array that is of size
+					block_sz. Blocks beyond this offset
+					are of size FTS_MAX_WORD_LEN */
+
+	z_streamp	zp;		/*!< ZLib state */
+
+					/*!< The value of the last word read
+					from the FTS INDEX table. This is
+					used to discard duplicates */
+
+	fts_string_t	word;		/*!< UTF-8 string */
+
+	ulint		max_words;	/*!< maximum number of words to read
+					in one pase */
+};
+
+/** Prepared statemets used during optimize */
+struct fts_optimize_graph_struct {
+					/*!< Delete a word from FTS INDEX */
+	que_t*		delete_nodes_graph;
+					/*!< Insert a word into FTS INDEX */
+	que_t*		write_nodes_graph;
+					/*!< COMMIT a transaction */
+	que_t*		commit_graph;
+					/*!< Read the nodes from FTS_INDEX */
+	que_t*		read_nodes_graph;
+};
+
+/** Used by fts_optimize() to store state. */
+struct fts_optimize_struct {
+	trx_t*		trx;		/*!< The transaction used for all SQL */
+
+	ib_alloc_t*	self_heap;	/*!< Heap to use for allocations */
+
+	char*		name_prefix;	/*!< FTS table name prefix */
+
+	fts_table_t	fts_index_table;/*!< Common table definition */
+
+					/*!< Common table definition */
+	fts_table_t	fts_common_table;
+
+	dict_table_t*	table;		/*!< Table that has to be queried */
+
+	dict_index_t*	index;		/*!< The FTS index to be optimized */
+
+	fts_doc_ids_t*	to_delete;	/*!< doc ids to delete, we check against
+					this vector and purge the matching
+					entries during the optimizing
+					process. The vector entries are
+					sorted on doc id */
+
+	ulint		del_pos;	/*!< Offset within to_delete vector,
+					this is used to keep track of where
+					we are up to in the vector */
+
+	ibool		done;		/*!< TRUE when optimize finishes */
+
+	ib_vector_t*	words;		/*!< Word + Nodes read from FTS_INDEX,
+					it contains instances of fts_word_t */
+
+	fts_zip_t*	zip;		/*!< Words read from the FTS_INDEX */
+
+	fts_optimize_graph_t		/*!< Prepared statements used during */
+			graph;		/*optimize */
+
+	ulint		n_completed;	/*!< Number of FTS indexes that have
+					been optimized */
+	ibool		del_list_regenerated;
+					/*!< BEING_DELETED list regenarated */
+};
+
+/** Used by the optimize, to keep state during compacting nodes. */
+struct fts_encode_struct {
+	doc_id_t	src_last_doc_id;/*!< Last doc id read from src node */
+	byte*		src_ilist_ptr;	/*!< Current ptr within src ilist */
+};
+
+/** We use this information to determine when to start the optimize
+cycle for a table. */
+struct fts_slot_struct {
+	dict_table_t*	table;		/*!< Table to optimize */
+
+	fts_state_t	state;		/*!< State of this slot */
+
+	ulint		added;		/*!< Number of doc ids added since the
+					last time this table was optimized */
+
+	ulint		deleted;	/*!< Number of doc ids deleted since the
+					last time this table was optimized */
+
+	ib_time_t	last_run;	/*!< Time last run completed */
+
+	ib_time_t	completed;	/*!< Optimize finish time */
+
+	ib_time_t	interval_time;	/*!< Minimum time to wait before
+					optimizing the table again. */
+};
+
+/** A table remove message for the FTS optimize thread. */
+struct fts_msg_del_struct {
+	dict_table_t*	table;		/*!< The table to remove */
+
+	os_event_t	event;		/*!< Event to synchronize acknowledgement
+					of receipt and processing of the
+					this message by the consumer */
+};
+
+/** Stop the optimize thread. */
+struct fts_msg_optimize_struct {
+	dict_table_t*	table;		/*!< Table to optimize */
+};
+
+/** The FTS optimize message work queue message type. */
+struct fts_msg_struct {
+	fts_msg_type_t	type;		/*!< Message type */
+
+	void*		ptr;		/*!< The message contents */
+
+	mem_heap_t*	heap;		/*!< The heap used to allocate this
+					message, the message consumer will
+					free the heap. */
+};
+
+/** The number of words to read and optimize in a single pass. */
+UNIV_INTERN ulong	fts_num_word_optimize;
+
+// FIXME
+UNIV_INTERN char	fts_enable_diag_print;
+
+/** ZLib compressed block size.*/
+static ulint FTS_ZIP_BLOCK_SIZE	= 1024;
+
+/** The amount of time optimizing in a single pass, in milliseconds. */
+static ib_time_t fts_optimize_time_limit = 0;
+
+/** SQL Statement for changing state of rows to be deleted from FTS Index. */
+static	const char* fts_init_delete_sql =
+	"BEGIN\n"
+	"\n"
+	"INSERT INTO %s_BEING_DELETED\n"
+		"SELECT doc_id FROM %s_DELETED;\n"
+	"\n"
+	"INSERT INTO %s_BEING_DELETED_CACHE\n"
+		"SELECT doc_id FROM %s_DELETED_CACHE;\n";
+
+static const char* fts_delete_doc_ids_sql =
+	"BEGIN\n"
+	"\n"
+	"DELETE FROM %s_DELETED WHERE doc_id = :doc_id1;\n"
+	"DELETE FROM %s_DELETED_CACHE WHERE doc_id = :doc_id2;\n";
+
+static const char* fts_end_delete_sql =
+	"BEGIN\n"
+	"\n"
+	"DELETE FROM %s_BEING_DELETED;\n"
+	"DELETE FROM %s_BEING_DELETED_CACHE;\n";
+
+/**********************************************************************//**
+Initialize fts_zip_t. */
+static
+void
+fts_zip_initialize(
+/*===============*/
+	fts_zip_t*	zip)		/*!< out: zip instance to initialize */
+{
+	zip->pos = 0;
+	zip->n_words = 0;
+
+	zip->status = Z_OK;
+
+	zip->last_big_block = 0;
+
+	zip->word.f_len = 0;
+	memset(zip->word.f_str, 0, FTS_MAX_WORD_LEN);
+
+	ib_vector_reset(zip->blocks);
+
+	memset(zip->zp, 0, sizeof(*zip->zp));
+}
+
+/**********************************************************************//**
+Create an instance of fts_zip_t.
+@return a new instance of fts_zip_t */
+static
+fts_zip_t*
+fts_zip_create(
+/*===========*/
+	mem_heap_t*	heap,		/*!< in: heap */
+	ulint		block_sz,	/*!< in: size of a zip block.*/
+	ulint		max_words)	/*!< in: max words to read */
+{
+	fts_zip_t*	zip;
+
+	zip = static_cast<fts_zip_t*>(mem_heap_zalloc(heap, sizeof(*zip)));
+
+	zip->word.f_str = static_cast<byte*>(
+		mem_heap_zalloc(heap, FTS_MAX_WORD_LEN + 1));
+
+	zip->block_sz = block_sz;
+
+	zip->heap_alloc = ib_heap_allocator_create(heap);
+
+	zip->blocks = ib_vector_create(zip->heap_alloc, sizeof(void*), 128);
+
+	zip->max_words = max_words;
+
+	zip->zp = static_cast<z_stream*>(
+		mem_heap_zalloc(heap, sizeof(*zip->zp)));
+
+	return(zip);
+}
+
+/**********************************************************************//**
+Initialize an instance of fts_zip_t. */
+static
+void
+fts_zip_init(
+/*=========*/
+
+	fts_zip_t*	zip)		/*!< in: zip instance to init */
+{
+	memset(zip->zp, 0, sizeof(*zip->zp));
+
+	zip->word.f_len = 0;
+	*zip->word.f_str = '\0';
+}
+
+/**********************************************************************//**
+Create a fts_optimizer_word_t instance.
+@return new instance */
+UNIV_INTERN
+fts_word_t*
+fts_word_init(
+/*==========*/
+	fts_word_t*	word,		/*!< in: word to initialize */
+	byte*		utf8,		/*!< in: UTF-8 string */
+	ulint		len)		/*!< in: length of string in bytes */
+{
+	mem_heap_t*	heap = mem_heap_create(sizeof(fts_node_t));
+
+	memset(word, 0, sizeof(*word));
+
+	word->text.f_len = len;
+	word->text.f_str = static_cast<byte*>(mem_heap_alloc(heap, len + 1));
+
+	/* Need to copy the NUL character too. */
+	memcpy(word->text.f_str, utf8, word->text.f_len);
+	word->text.f_str[word->text.f_len] = 0;
+
+	word->heap_alloc = ib_heap_allocator_create(heap);
+
+	word->nodes = ib_vector_create(
+		word->heap_alloc, sizeof(fts_node_t), 64);
+
+	return(word);
+}
+
+/**********************************************************************//**
+Read the FTS INDEX row.
+@return fts_node_t instance */
+static
+fts_node_t*
+fts_optimize_read_node(
+/*===================*/
+	fts_word_t*	word,		/*!< in: */
+	que_node_t*	exp)		/*!< in: */
+{
+	int		i;
+	fts_node_t*	node = static_cast<fts_node_t*>(
+		ib_vector_push(word->nodes, NULL));
+
+	/* Start from 1 since the first node has been read by the caller */
+	for (i = 1; exp; exp = que_node_get_next(exp), ++i) {
+
+		dfield_t*	dfield = que_node_get_val(exp);
+		byte*		data = static_cast<byte*>(
+			dfield_get_data(dfield));
+		ulint		len = dfield_get_len(dfield);
+
+		ut_a(len != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT */
+		switch (i) {
+		case 1: /* DOC_COUNT */
+			node->doc_count = mach_read_from_4(data);
+			break;
+
+		case 2: /* FIRST_DOC_ID */
+			node->first_doc_id = fts_read_doc_id(data);
+			break;
+
+		case 3: /* LAST_DOC_ID */
+			node->last_doc_id = fts_read_doc_id(data);
+			break;
+
+		case 4: /* ILIST */
+			node->ilist_size_alloc = node->ilist_size = len;
+			node->ilist = static_cast<byte*>(ut_malloc(len));
+			memcpy(node->ilist, data, len);
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	/* Make sure all columns were read. */
+	ut_a(i == 5);
+
+	return(node);
+}
+
+/**********************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record.
+@return always returns non-NULL */
+UNIV_INTERN
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to ib_vector_t */
+{
+	fts_word_t*	word;
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
+	ib_vector_t*	words = static_cast<ib_vector_t*>(fetch->read_arg);
+	que_node_t*	exp = sel_node->select_list;
+	dfield_t*	dfield = que_node_get_val(exp);
+	void*		data = dfield_get_data(dfield);
+	ulint		dfield_len = dfield_get_len(dfield);
+
+	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
+
+	if (ib_vector_size(words) == 0) {
+
+		word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+		fts_word_init(word, (byte*) data, dfield_len);
+	}
+
+	word = static_cast<fts_word_t*>(ib_vector_last(words));
+
+	if (dfield_len != word->text.f_len
+	    || memcmp(word->text.f_str, data, dfield_len)) {
+
+		word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+		fts_word_init(word, (byte*) data, dfield_len);
+	}
+
+	fts_optimize_read_node(word, que_node_get_next(exp));
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Read the rows from the FTS inde.
+@return vector of rows fetched */
+UNIV_INTERN
+ulint
+fts_index_fetch_nodes(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: prepared statement */
+	fts_table_t*	fts_table,	/*!< in: table of the FTS INDEX */
+	const fts_string_t*
+			word,		/*!< in: the word to fetch */
+	fts_fetch_t*	fetch)		/*!< in: fetch callback.*/
+{
+	pars_info_t*	info;
+	ulint		error;
+
+	trx->op_info = "fetching FTS index nodes";
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	pars_info_bind_function(info, "my_func", fetch->read_record, fetch);
+	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+	if (!*graph) {
+		ulint	selected;
+
+		ut_a(fts_table->type == FTS_INDEX_TABLE);
+
+		selected = fts_select_index(fts_table->charset,
+					    word->f_str, word->f_len);
+
+		fts_table->suffix = fts_get_suffix(selected);
+
+		*graph = fts_parse_sql(
+			fts_table,
+			info,
+			"DECLARE FUNCTION my_func;\n"
+			"DECLARE CURSOR c IS"
+			" SELECT word, doc_count, first_doc_id, last_doc_id, "
+				"ilist\n"
+			" FROM %s\n"
+			" WHERE word LIKE :word\n"
+			" ORDER BY first_doc_id;\n"
+			"BEGIN\n"
+			"\n"
+			"OPEN c;\n"
+			"WHILE 1 = 1 LOOP\n"
+			"  FETCH c INTO my_func();\n"
+			"  IF c % NOTFOUND THEN\n"
+			"    EXIT;\n"
+			"  END IF;\n"
+			"END LOOP;\n"
+			"CLOSE c;");
+	}
+
+	for(;;) {
+		error = fts_eval_sql(trx, *graph);
+
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+
+			break;				/* Exit the loop. */
+		} else {
+			fts_sql_rollback(trx);
+
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, " InnoDB: Warning: lock wait "
+					"timeout reading FTS index. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, " InnoDB: Error: %lu "
+					"while reading FTS index.\n", error);
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Read a word */
+static
+byte*
+fts_zip_read_word(
+/*==============*/
+	fts_zip_t*	zip,		/*!< in: Zip state + data */
+	fts_string_t*	word)		/*!< out: uncompressed word */
+{
+#ifdef UNIV_DEBUG
+	ulint		i;
+#endif
+	byte		len = 0;
+	void*		null = NULL;
+	byte*		ptr = word->f_str;
+	int		flush = Z_NO_FLUSH;
+
+	/* Either there was an error or we are at the Z_STREAM_END. */
+	if (zip->status != Z_OK) {
+		return(NULL);
+	}
+
+	zip->zp->next_out = &len;
+	zip->zp->avail_out = sizeof(len);
+
+	while (zip->status == Z_OK && zip->zp->avail_out > 0) {
+
+		/* Finished decompressing block. */
+		if (zip->zp->avail_in == 0) {
+
+			/* Free the block thats been decompressed. */
+			if (zip->pos > 0) {
+				ulint	prev = zip->pos - 1;
+
+				ut_a(zip->pos < ib_vector_size(zip->blocks));
+
+				ut_free(ib_vector_getp(zip->blocks, prev));
+				ib_vector_set(zip->blocks, prev, &null);
+			}
+
+			/* Any more blocks to decompress. */
+			if (zip->pos < ib_vector_size(zip->blocks)) {
+
+				zip->zp->next_in = static_cast<byte*>(
+					ib_vector_getp(
+						zip->blocks, zip->pos));
+
+				if (zip->pos > zip->last_big_block) {
+					zip->zp->avail_in =
+						FTS_MAX_WORD_LEN;
+				} else {
+					zip->zp->avail_in = zip->block_sz;
+				}
+
+				++zip->pos;
+			} else {
+				flush = Z_FINISH;
+			}
+		}
+
+		switch (zip->status = inflate(zip->zp, flush)) {
+		case Z_OK:
+			if (zip->zp->avail_out == 0 && len > 0) {
+
+				ut_a(len <= FTS_MAX_WORD_LEN);
+				ptr[len] = 0;
+
+				zip->zp->next_out = ptr;
+				zip->zp->avail_out = len;
+
+				word->f_len = len;
+				len = 0;
+			}
+			break;
+
+		case Z_BUF_ERROR:	/* No progress possible. */
+		case Z_STREAM_END:
+			inflateEnd(zip->zp);
+			break;
+
+		case Z_STREAM_ERROR:
+		default:
+			ut_error;
+		}
+	}
+
+#ifdef UNIV_DEBUG
+	/* All blocks must be freed at end of inflate. */
+	if (zip->status != Z_OK) {
+		for (i = 0; i < ib_vector_size(zip->blocks); ++i) {
+			if (ib_vector_getp(zip->blocks, i)) {
+				ut_free(ib_vector_getp(zip->blocks, i));
+				ib_vector_set(zip->blocks, i, &null);
+			}
+		}
+	}
+
+	if (ptr != NULL) {
+		ut_ad(word->f_len == strlen((char*) ptr));
+	}
+#endif /* UNIV_DEBUG */
+
+	return(zip->status == Z_OK || zip->status == Z_STREAM_END ? ptr : NULL);
+}
+
+/**********************************************************************//**
+Callback function to fetch and compress the word in an FTS
+INDEX record.
+@return FALSE on EOF */
+static
+ibool
+fts_fetch_index_words(
+/*==================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to ib_vector_t */
+{
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	fts_zip_t*	zip = static_cast<fts_zip_t*>(user_arg);
+	que_node_t*	exp = sel_node->select_list;
+	dfield_t*	dfield = que_node_get_val(exp);
+	byte		len = (byte) dfield_get_len(dfield);
+	void*		data = dfield_get_data(dfield);
+
+	/* Skip the duplicate words. */
+	if (zip->word.f_len == len && !memcmp(zip->word.f_str, data, len)) {
+
+		return(TRUE);
+	}
+
+	ut_a(len <= FTS_MAX_WORD_LEN);
+
+	memcpy(zip->word.f_str, data, len);
+	zip->word.f_len = len;
+
+	ut_a(zip->zp->avail_in == 0);
+	ut_a(zip->zp->next_in == NULL);
+
+	/* The string is prefixed by len. */
+	zip->zp->next_in = &len;
+	zip->zp->avail_in = sizeof(len);
+
+	/* Compress the word, create output blocks as necessary. */
+	while (zip->zp->avail_in > 0) {
+
+		/* No space left in output buffer, create a new one. */
+		if (zip->zp->avail_out == 0) {
+			byte*		block;
+
+			block = static_cast<byte*>(ut_malloc(zip->block_sz));
+			ib_vector_push(zip->blocks, &block);
+
+			zip->zp->next_out = block;
+			zip->zp->avail_out = zip->block_sz;
+		}
+
+		switch (zip->status = deflate(zip->zp, Z_NO_FLUSH)) {
+		case Z_OK:
+			if (zip->zp->avail_in == 0) {
+				zip->zp->next_in = static_cast<byte*>(data);
+				zip->zp->avail_in = len;
+				ut_a(len <= FTS_MAX_WORD_LEN);
+				len = 0;
+			}
+			break;
+
+		case Z_STREAM_END:
+		case Z_BUF_ERROR:
+		case Z_STREAM_ERROR:
+		default:
+			ut_error;
+			break;
+		}
+	}
+
+	/* All data should have been compressed. */
+	ut_a(zip->zp->avail_in == 0);
+	zip->zp->next_in = NULL;
+
+	++zip->n_words;
+
+	return(zip->n_words >= zip->max_words ? FALSE : TRUE);
+}
+
+/**********************************************************************//**
+Finish Zip deflate. */
+static
+void
+fts_zip_deflate_end(
+/*================*/
+	fts_zip_t*	zip)		/*!< in: instance that should be closed*/
+{
+	ut_a(zip->zp->avail_in == 0);
+	ut_a(zip->zp->next_in == NULL);
+
+	zip->status = deflate(zip->zp, Z_FINISH);
+
+	ut_a(ib_vector_size(zip->blocks) > 0);
+	zip->last_big_block = ib_vector_size(zip->blocks) - 1;
+
+	/* Allocate smaller block(s), since this is trailing data. */
+	while (zip->status == Z_OK) {
+		byte*		block;
+
+		ut_a(zip->zp->avail_out == 0);
+
+		block = static_cast<byte*>(ut_malloc(FTS_MAX_WORD_LEN + 1));
+		ib_vector_push(zip->blocks, &block);
+
+		zip->zp->next_out = block;
+		zip->zp->avail_out = FTS_MAX_WORD_LEN;
+
+		zip->status = deflate(zip->zp, Z_FINISH);
+	}
+
+	ut_a(zip->status == Z_STREAM_END);
+
+	zip->status = deflateEnd(zip->zp);
+	ut_a(zip->status == Z_OK);
+
+	/* Reset the ZLib data structure. */
+	memset(zip->zp, 0, sizeof(*zip->zp));
+}
+
+/**********************************************************************//**
+Read the words from the FTS INDEX.
+@return DB_SUCCESS if all OK, DB_TABLE_NOT_FOUND if no more indexes
+        to search else error code */
+static
+ulint
+fts_index_fetch_words(
+/*==================*/
+	fts_optimize_t*		optim,	/*!< in: optimize scratch pad */
+	const fts_string_t*	word,	/*!< in: get words greater than this
+					 word */
+	ulint			n_words)/*!< in: max words to read */
+{
+	pars_info_t*	info;
+	que_t*		graph;
+	ulint		selected;
+	fts_zip_t*	zip = NULL;
+	ulint		error = DB_SUCCESS;
+	mem_heap_t*	heap = static_cast<mem_heap_t*>(optim->self_heap->arg);
+	ibool		inited = FALSE;
+
+	optim->trx->op_info = "fetching FTS index words";
+
+	if (optim->zip == NULL) {
+		optim->zip = fts_zip_create(heap, FTS_ZIP_BLOCK_SIZE, n_words);
+	} else {
+		fts_zip_initialize(optim->zip);
+	}
+
+	for (selected = fts_select_index(
+		optim->fts_index_table.charset, word->f_str, word->f_len);
+	     fts_index_selector[selected].value;
+	     selected++) {
+
+		optim->fts_index_table.suffix = fts_get_suffix(selected);
+
+		/* We've search all indexes. */
+		if (optim->fts_index_table.suffix == NULL) {
+			return(DB_TABLE_NOT_FOUND);
+		}
+
+		info = pars_info_create();
+
+		pars_info_bind_function(
+			info, "my_func", fts_fetch_index_words, optim->zip);
+
+		pars_info_bind_varchar_literal(
+			info, "word", word->f_str, word->f_len);
+
+		graph = fts_parse_sql(
+			&optim->fts_index_table,
+			info,
+			"DECLARE FUNCTION my_func;\n"
+			"DECLARE CURSOR c IS"
+			" SELECT word\n"
+			" FROM %s\n"
+			" WHERE word > :word\n"
+			" ORDER BY word;\n"
+			"BEGIN\n"
+			"\n"
+			"OPEN c;\n"
+			"WHILE 1 = 1 LOOP\n"
+			"  FETCH c INTO my_func();\n"
+			"  IF c % NOTFOUND THEN\n"
+			"    EXIT;\n"
+			"  END IF;\n"
+			"END LOOP;\n"
+			"CLOSE c;");
+
+		zip = optim->zip;
+
+		for(;;) {
+
+			if (!inited && ((error = deflateInit(zip->zp, 9))
+					!= Z_OK)) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					" InnoDB: Error: ZLib deflateInit() "
+					"failed: %lu\n", error);
+
+				error = DB_ERROR;
+				break;
+			} else {
+				inited = TRUE;
+				error = fts_eval_sql(optim->trx, graph);
+			}
+
+			if (error == DB_SUCCESS) {
+				//FIXME fts_sql_commit(optim->trx);
+				break;
+			} else {
+				//FIXME fts_sql_rollback(optim->trx);
+
+				ut_print_timestamp(stderr);
+
+				if (error == DB_LOCK_WAIT_TIMEOUT) {
+					fprintf(stderr, " InnoDB: "
+						"Warning: lock wait "
+						"timeout reading document. "
+						"Retrying!\n");
+
+					/* We need to reset the ZLib state. */
+					inited = FALSE;
+					deflateEnd(zip->zp);
+					fts_zip_init(zip);
+
+					optim->trx->error_state = DB_SUCCESS;
+				} else {
+					fprintf(stderr, " InnoDB: Error: %lu "
+						"while reading document.\n",
+						error);
+
+					break;	/* Exit the loop. */
+				}
+			}
+		}
+
+		fts_que_graph_free(graph);
+
+		/* Check if max word to fetch is exceeded */
+		if (optim->zip->n_words >= n_words) {
+			break;
+		}
+	}
+
+	if (error == DB_SUCCESS && zip->status == Z_OK && zip->n_words > 0) {
+
+		/* All data should have been read. */
+		ut_a(zip->zp->avail_in == 0);
+
+		fts_zip_deflate_end(zip);
+	} else {
+		deflateEnd(zip->zp);
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Callback function to fetch the doc id from the record.
+@return always returns TRUE */
+static
+ibool
+fts_fetch_doc_ids(
+/*==============*/
+	void*	row,		/*!< in: sel_node_t* */
+	void*	user_arg)	/*!< in: pointer to ib_vector_t */
+{
+	que_node_t*	exp;
+	int		i = 0;
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	fts_doc_ids_t*	fts_doc_ids = static_cast<fts_doc_ids_t*>(user_arg);
+	fts_update_t*	update = static_cast<fts_update_t*>(
+		ib_vector_push(fts_doc_ids->doc_ids, NULL));
+
+	for (exp = sel_node->select_list;
+	     exp;
+	     exp = que_node_get_next(exp), ++i) {
+
+		dfield_t*	dfield = que_node_get_val(exp);
+		void*		data = dfield_get_data(dfield);
+		ulint		len = dfield_get_len(dfield);
+
+		ut_a(len != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT. */
+		switch (i) {
+		case 0: /* DOC_ID */
+			update->fts_indexes = NULL;
+			update->doc_id = fts_read_doc_id(
+				static_cast<byte*>(data));
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Read the rows from a FTS common auxiliary table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_table_fetch_doc_ids(
+/*====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: table */
+	fts_doc_ids_t*	doc_ids)	/*!< in: For collecting doc ids */
+{
+	ulint		error;
+	que_t*		graph;
+	pars_info_t*	info = pars_info_create();
+	ibool		alloc_bk_trx = FALSE;
+
+	ut_a(fts_table->suffix != NULL);
+	ut_a(fts_table->type == FTS_COMMON_TABLE);
+
+	if (!trx) {
+		trx = trx_allocate_for_background();
+		alloc_bk_trx = TRUE;
+	}
+
+	trx->op_info = "fetching FTS doc ids";
+
+	pars_info_bind_function(info, "my_func", fts_fetch_doc_ids, doc_ids);
+
+	graph = fts_parse_sql(
+		fts_table,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT doc_id FROM %s;\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	error = fts_eval_sql(trx, graph);
+
+	mutex_enter(&dict_sys->mutex);
+	que_graph_free(graph);
+	mutex_exit(&dict_sys->mutex);
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(trx);
+
+		ib_vector_sort(doc_ids->doc_ids, fts_update_doc_id_cmp);
+	} else {
+		fts_sql_rollback(trx);
+	}
+
+	if (alloc_bk_trx) {
+		trx_free_for_background(trx);
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Do a binary search for a doc id in the array
+@return +ve index if found -ve index where it should be inserted
+        if not found */
+UNIV_INTERN
+int
+fts_bsearch(
+/*========*/
+	fts_update_t*	array,	/*!< in: array to sort */
+	int		lower,	/*!< in: the array lower bound */
+	int		upper,	/*!< in: the array upper bound */
+	doc_id_t	doc_id)	/*!< in: the doc id to search for */
+{
+	int	orig_size = upper;
+
+	if (upper == 0) {
+		/* Nothing to search */
+		return(-1);
+	} else {
+		while (lower < upper) {
+			int	i = (lower + upper) >> 1;
+
+			if (doc_id > array[i].doc_id) {
+				lower = i + 1;
+			} else if (doc_id < array[i].doc_id) {
+				upper = i - 1;
+			} else {
+				return(i); /* Found. */
+			}
+		}
+	}
+
+	if (lower == upper && lower < orig_size) {
+		if (doc_id == array[lower].doc_id) {
+			return(lower);
+		} else if (lower == 0) {
+			return(-1);
+		}
+	}
+
+	/* Not found. */
+	return( (lower == 0) ? -1 : -lower);
+}
+
+/**********************************************************************//**
+Search in the to delete array whether any of the doc ids within
+the [first, last] range are to be deleted
+@return +ve index if found -ve index where it should be inserted
+        if not found */
+static
+int
+fts_optimize_lookup(
+/*================*/
+	ib_vector_t*	doc_ids,	/*!< in: array to search */
+	ulint		lower,		/*!< in: lower limit of array */
+	doc_id_t	first_doc_id,	/*!< in: doc id to lookup */
+	doc_id_t	last_doc_id)	/*!< in: doc id to lookup */
+{
+	int		pos;
+	int		upper = ib_vector_size(doc_ids);
+	fts_update_t*	array = (fts_update_t*) doc_ids->data;
+
+	pos = fts_bsearch(array, lower, upper, first_doc_id);
+
+	ut_a(abs(pos) <= upper + 1);
+
+	if (pos < 0) {
+
+		int	i = abs(pos);
+
+		/* If i is 1, it could be first_doc_id is less than
+		either the first or second array item, do a
+		double check */
+		if (i == 1 && array[0].doc_id <= last_doc_id
+		    && first_doc_id < array[0].doc_id) {
+			pos = 0;
+		} else if (i < upper && array[i].doc_id <= last_doc_id) {
+
+			/* Check if the "next" doc id is within the
+			first & last doc id of the node. */
+			pos = i;
+		}
+	}
+
+	return(pos);
+}
+
+/**********************************************************************//**
+Encode the word pos list into the node
+@return DB_SUCCESS or error code*/
+static
+ulint
+fts_optimize_encode_node(
+/*=====================*/
+	fts_node_t*	node,		/*!< in: node to fill*/
+	doc_id_t	doc_id,		/*!< in: doc id to encode */
+	fts_encode_t*	enc)		/*!< in: encoding state.*/
+{
+	byte*		dst;
+	ulint		enc_len;
+	ulint		pos_enc_len;
+	doc_id_t	doc_id_delta;
+	ulint		error = DB_SUCCESS;
+	byte*		src = enc->src_ilist_ptr;
+
+	if (node->first_doc_id == 0) {
+		ut_a(node->last_doc_id == 0);
+
+		node->first_doc_id = doc_id;
+	}
+
+	/* Calculate the space required to store the ilist. */
+	doc_id_delta = doc_id - node->last_doc_id;
+	enc_len = fts_get_encoded_len(static_cast<ulint>(doc_id_delta));
+
+	/* Calculate the size of the encoded pos array. */
+	while (*src) {
+		fts_decode_vlc(&src);
+	}
+
+	/* Skip the 0x00 byte at the end of the word positions list. */
+	++src;
+
+	/* Number of encoded pos bytes to copy. */
+	pos_enc_len = src - enc->src_ilist_ptr;
+
+	/* Total number of bytes required for copy. */
+	enc_len += pos_enc_len;
+
+	/* Check we have enough space in the destination buffer for
+	copying the document word list. */
+	if (!node->ilist) {
+		ulint	new_size;
+
+		ut_a(node->ilist_size == 0);
+
+		new_size = enc_len > FTS_ILIST_MAX_SIZE
+			? enc_len : FTS_ILIST_MAX_SIZE;
+
+		node->ilist = static_cast<byte*>(ut_malloc(new_size));
+		node->ilist_size_alloc = new_size;
+
+	} else if ((node->ilist_size + enc_len) > node->ilist_size_alloc) {
+		ulint	new_size = node->ilist_size + enc_len;
+		byte*	ilist = static_cast<byte*>(ut_malloc(new_size));
+
+		memcpy(ilist, node->ilist, node->ilist_size);
+
+		ut_free(node->ilist);
+
+		node->ilist = ilist;
+		node->ilist_size_alloc = new_size;
+	}
+
+	src = enc->src_ilist_ptr;
+	dst = node->ilist + node->ilist_size;
+
+	/* Encode the doc id. Cast to ulint, the delta should be small and
+	therefore no loss of precision. */
+	dst += fts_encode_int((ulint) doc_id_delta, dst);
+
+	/* Copy the encoded pos array. */
+	memcpy(dst, src, pos_enc_len);
+
+	node->last_doc_id = doc_id;
+
+	/* Data copied upto here. */
+	node->ilist_size += enc_len;
+	enc->src_ilist_ptr += pos_enc_len;
+
+	ut_a(node->ilist_size <= node->ilist_size_alloc);
+
+	return(error);
+}
+
+/**********************************************************************//**
+Optimize the data contained in a node.
+@return DB_SUCCESS or error code*/
+static
+ulint
+fts_optimize_node(
+/*==============*/
+	ib_vector_t*	del_vec,	/*!< in: vector of doc ids to delete*/
+	int*		del_pos,	/*!< in: offset into above vector */
+	fts_node_t*	dst_node,	/*!< in: node to fill*/
+	fts_node_t*	src_node,	/*!< in: source node for data*/
+	fts_encode_t*	enc)		/*!< in: encoding state */
+{
+	ulint		copied;
+	ulint		error = DB_SUCCESS;
+	doc_id_t	doc_id = enc->src_last_doc_id;
+
+	if (!enc->src_ilist_ptr) {
+		enc->src_ilist_ptr = src_node->ilist;
+	}
+
+	copied = enc->src_ilist_ptr - src_node->ilist;
+
+	/* While there is data in the source node and space to copy
+	into in the destination node. */
+	while (copied < src_node->ilist_size
+	       && dst_node->ilist_size < FTS_ILIST_MAX_SIZE) {
+
+		doc_id_t	delta;
+		doc_id_t	del_doc_id = FTS_NULL_DOC_ID;
+
+		delta = fts_decode_vlc(&enc->src_ilist_ptr);
+
+test_again:
+		/* Check whether the doc id is in the delete list, if
+		so then we skip the entries but we need to track the
+		delta for decoding the entries following this document's
+		entries. */
+		if (*del_pos >= 0 && *del_pos < (int) ib_vector_size(del_vec)) {
+			fts_update_t*	update;
+
+			update = (fts_update_t*) ib_vector_get(
+				del_vec, *del_pos);
+
+			del_doc_id = update->doc_id;
+		}
+
+		if (enc->src_ilist_ptr == src_node->ilist && doc_id == 0) {
+			ut_a(delta == src_node->first_doc_id);
+		}
+
+		doc_id += delta;
+
+		if (del_doc_id > 0 && doc_id == del_doc_id) {
+
+			++*del_pos;
+
+			/* Skip the entries for this document. */
+			while (*enc->src_ilist_ptr) {
+				fts_decode_vlc(&enc->src_ilist_ptr);
+			}
+
+			/* Skip the end of word position marker. */
+			++enc->src_ilist_ptr;
+
+		} else {
+
+			/* DOC ID already becomes larger than
+			del_doc_id, check the next del_doc_id */
+			if (del_doc_id > 0 && doc_id > del_doc_id) {
+				del_doc_id = 0;
+				++*del_pos;
+				delta = 0;
+				goto test_again;
+			}
+
+			/* Decode and copy the word positions into
+			the dest node. */
+			fts_optimize_encode_node(dst_node, doc_id, enc);
+
+			++dst_node->doc_count;
+
+			ut_a(dst_node->last_doc_id == doc_id);
+		}
+
+		/* Bytes copied so for from source. */
+		copied = enc->src_ilist_ptr - src_node->ilist;
+	}
+
+	if (copied >= src_node->ilist_size) {
+		ut_a(doc_id == src_node->last_doc_id);
+	}
+
+	enc->src_last_doc_id = doc_id;
+
+	return(error);
+}
+
+/**********************************************************************//**
+Determine the starting pos within the deleted doc id vector for a word.
+@return DB_SUCCESS or error code */
+static
+int
+fts_optimize_deleted_pos(
+/*=====================*/
+	fts_optimize_t*	optim,		/*!< in: optimize state data */
+	fts_word_t*	word)		/*!< in: the word data to check */
+{
+	int		del_pos;
+	ib_vector_t*	del_vec = optim->to_delete->doc_ids;
+
+	/* Get the first and last dict ids for the word, we will use
+	these values to determine which doc ids need to be removed
+	when we coalesce the nodes. This way we can reduce the numer
+	of elements that need to be searched in the deleted doc ids
+	vector and secondly we can remove the doc ids during the
+	coalescing phase. */
+	if (ib_vector_size(del_vec) > 0) {
+		fts_node_t*	node;
+		doc_id_t	last_id;
+		doc_id_t	first_id;
+		ulint		size = ib_vector_size(word->nodes);
+
+		node = (fts_node_t*) ib_vector_get(word->nodes, 0);
+		first_id = node->first_doc_id;
+
+		node = (fts_node_t*) ib_vector_get(word->nodes, size - 1);
+		last_id = node->last_doc_id;
+
+		ut_a(first_id <= last_id);
+
+		del_pos = fts_optimize_lookup(
+			del_vec, optim->del_pos, first_id, last_id);
+	} else {
+
+		del_pos = -1; /* Note that there is nothing to delete. */
+	}
+
+	return(del_pos);
+}
+
+#define FTS_DEBUG_PRINT
+/**********************************************************************//**
+Compact the nodes for a word, we also remove any doc ids during the
+compaction pass.
+@return DB_SUCCESS or error code.*/
+static
+ib_vector_t*
+fts_optimize_word(
+/*==============*/
+	fts_optimize_t*	optim,		/*!< in: optimize state data */
+	fts_word_t*	word)		/*!< in: the word to optimize */
+{
+	fts_encode_t	enc;
+	ib_vector_t*	nodes;
+	ulint		i = 0;
+	int		del_pos;
+	fts_node_t*	dst_node = NULL;
+	ib_vector_t*	del_vec = optim->to_delete->doc_ids;
+	ulint		size = ib_vector_size(word->nodes);
+
+	del_pos = fts_optimize_deleted_pos(optim, word);
+	nodes = ib_vector_create(word->heap_alloc, sizeof(*dst_node), 128);
+
+	enc.src_last_doc_id = 0;
+	enc.src_ilist_ptr = NULL;
+
+	if (fts_enable_diag_print) {
+		word->text.f_str[word->text.f_len] = 0;
+		fprintf(stderr, "FTS_OPTIMIZE: optimize \"%s\"\n",
+			word->text.f_str);
+	}
+
+	while (i < size) {
+		ulint		copied;
+		fts_node_t*	src_node;
+
+		src_node = (fts_node_t*) ib_vector_get(word->nodes, i);
+
+		if (!dst_node) {
+
+			dst_node = static_cast<fts_node_t*>(
+				ib_vector_push(nodes, NULL));
+			memset(dst_node, 0, sizeof(*dst_node));
+		}
+
+		/* Copy from the src to the dst node. */
+		fts_optimize_node(del_vec, &del_pos, dst_node, src_node, &enc);
+
+		ut_a(enc.src_ilist_ptr != NULL);
+
+		/* Determine the numer of bytes copied to dst_node. */
+		copied = enc.src_ilist_ptr - src_node->ilist;
+
+		/* Can't copy more than whats in the vlc array. */
+		ut_a(copied <= src_node->ilist_size);
+
+		/* We are done with this node release the resources. */
+		if (copied == src_node->ilist_size) {
+
+			enc.src_last_doc_id = 0;
+			enc.src_ilist_ptr = NULL;
+
+			ut_free(src_node->ilist);
+
+			src_node->ilist = NULL;
+			src_node->ilist_size = src_node->ilist_size_alloc = 0;
+
+			src_node = NULL;
+
+			++i; /* Get next source node to OPTIMIZE. */
+		}
+
+		if (dst_node->ilist_size >= FTS_ILIST_MAX_SIZE || i >= size) {
+
+			dst_node = NULL;
+		}
+	}
+
+	/* All dst nodes created should have been added to the vector. */
+	ut_a(dst_node == NULL);
+
+	/* Return the OPTIMIZED nodes. */
+	return(nodes);
+}
+
+/**********************************************************************//**
+Update the FTS index table. This is a delete followed by an insert.
+@return DB_SUCCESS or error code */
+static
+ulint
+fts_optimize_write_word(
+/*====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: table of FTS index */
+	fts_string_t*	word,		/*!< in: word data to write */
+	ib_vector_t*	nodes)		/*!< in: the nodes to write */
+{
+	ulint		i;
+	pars_info_t*	info;
+	que_t*		graph;
+	ulint		selected;
+	ulint		error = DB_SUCCESS;
+	char*		table_name = fts_get_table_name(fts_table);
+
+	info = pars_info_create();
+
+	ut_ad(fts_table->charset);
+
+	if (fts_enable_diag_print) {
+		fprintf(stderr, "FTS_OPTIMIZE: processed \"%s\"\n",
+			word->f_str);
+	}
+
+	pars_info_bind_varchar_literal(
+		info, "word", word->f_str, word->f_len);
+
+	selected = fts_select_index(fts_table->charset,
+				    word->f_str, word->f_len);
+
+	fts_table->suffix = fts_get_suffix(selected);
+
+	graph = fts_parse_sql(
+		fts_table,
+		info,
+		"BEGIN DELETE FROM %s WHERE word = :word;");
+
+	error = fts_eval_sql(trx, graph);
+
+	if (error != DB_SUCCESS) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Error: (%lu) during optimize, "
+			"when deleting a word from the FTS index.\n", error);
+	}
+
+	fts_que_graph_free(graph);
+	graph = NULL;
+
+	mem_free(table_name);
+
+	/* Even if the operation needs to be rolled back and redone,
+	we iterate over the nodes in order to free the ilist. */
+	for (i = 0; i < ib_vector_size(nodes); ++i) {
+
+		fts_node_t* node = (fts_node_t*) ib_vector_get(nodes, i);
+
+		if (error == DB_SUCCESS) {
+			error = fts_write_node(
+				trx, &graph, fts_table, word, node);
+
+			if (error != DB_SUCCESS) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr, " InnoDB: Error: (%lu) "
+					"during optimize, while adding a "
+					"word to the FTS index.\n", error);
+			}
+		}
+
+		ut_free(node->ilist);
+		node->ilist = NULL;
+		node->ilist_size = node->ilist_size_alloc = 0;
+	}
+
+	if (graph != NULL) {
+		fts_que_graph_free(graph);
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Free fts_optimizer_word_t instanace.*/
+UNIV_INTERN
+void
+fts_word_free(
+/*==========*/
+	fts_word_t*	word)		/*!< in: instance to free.*/
+{
+	mem_heap_t*	heap = static_cast<mem_heap_t*>(word->heap_alloc->arg);
+
+#ifdef UNIV_DEBUG
+	memset(word, 0, sizeof(*word));
+#endif /* UNIV_DEBUG */
+
+	mem_heap_free(heap);
+}
+
+/**********************************************************************//**
+Optimize the word ilist and rewrite data to the FTS index.
+@return status one of RESTART, EXIT, ERROR */
+static
+ulint
+fts_optimize_compact(
+/*=================*/
+	fts_optimize_t*	optim,		/*!< in: optimize state data */
+	dict_index_t*	index,		/*!< in: current FTS being optimized */
+	ib_time_t	start_time)	/*!< in: optimize start time */
+{
+	ulint		i;
+	ulint		error = DB_SUCCESS;
+	ulint		size = ib_vector_size(optim->words);
+
+	for (i = 0; i < size && error == DB_SUCCESS && !optim->done; ++i) {
+		fts_word_t*	word;
+		ib_vector_t*	nodes;
+		trx_t*		trx = optim->trx;
+
+		word = (fts_word_t*) ib_vector_get(optim->words, i);
+
+		/* nodes is allocated from the word heap and will be destroyed
+		when the word is freed. We however have to be careful about
+		the ilist, that needs to be freed explicitly. */
+		nodes = fts_optimize_word(optim, word);
+
+		/* Update the data on disk. */
+		error = fts_optimize_write_word(
+			trx, &optim->fts_index_table, &word->text, nodes);
+
+		if (error == DB_SUCCESS) {
+			/* Write the last word optimized to the config table,
+			we use this value for restarting optimize. */
+			error = fts_config_set_index_value(
+				optim->trx, index,
+				FTS_LAST_OPTIMIZED_WORD, &word->text);
+		}
+
+		/* Free the word that was optimized. */
+		fts_word_free(word);
+
+		if (fts_optimize_time_limit > 0
+		    && (ut_time() - start_time) > fts_optimize_time_limit) {
+
+			optim->done = TRUE;
+		}
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Create an instance of fts_optimize_t. Also create a new
+background transaction.*/
+static
+fts_optimize_t*
+fts_optimize_create(
+/*================*/
+	dict_table_t*	table)		/*!< in: table with FTS indexes */
+{
+	fts_optimize_t*	optim;
+	mem_heap_t*	heap = mem_heap_create(128);
+
+	optim = (fts_optimize_t*) mem_heap_zalloc(heap, sizeof(*optim));
+
+	optim->self_heap = ib_heap_allocator_create(heap);
+
+	optim->to_delete = fts_doc_ids_create();
+
+	optim->words = ib_vector_create(
+		optim->self_heap, sizeof(fts_word_t), 256);
+
+	optim->table = table;
+
+	optim->trx = trx_allocate_for_background();
+
+	optim->fts_common_table.parent = table->name;
+	optim->fts_common_table.table_id = table->id;
+	optim->fts_common_table.type = FTS_COMMON_TABLE;
+
+	optim->fts_index_table.parent = table->name;
+	optim->fts_index_table.table_id = table->id;
+	optim->fts_index_table.type = FTS_INDEX_TABLE;
+
+	/* The common prefix for all this parent table's aux tables. */
+	optim->name_prefix = fts_get_table_name_prefix(
+		&optim->fts_common_table);
+
+	return(optim);
+}
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/**********************************************************************//**
+Get optimize start time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static
+ulint
+fts_optimize_get_index_start_time(
+/*==============================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	ib_time_t*	start_time)		/*!< out: time in secs */
+{
+	ulint		error;
+
+	error = fts_config_get_index_ulint(
+		trx, index, FTS_OPTIMIZE_START_TIME, (ulint*) start_time);
+
+	return(error);
+}
+
+/**********************************************************************//**
+Set the optimize start time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static
+ulint
+fts_optimize_set_index_start_time(
+/*==============================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	ib_time_t	start_time)		/*!< in: start time */
+{
+	ulint		error;
+
+	error = fts_config_set_index_ulint(
+		trx, index, FTS_OPTIMIZE_START_TIME, (ulint) start_time);
+
+	return(error);
+}
+
+/**********************************************************************//**
+Get optimize end time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static
+ulint
+fts_optimize_get_index_end_time(
+/*============================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	ib_time_t*	end_time)		/*!< out: time in secs */
+{
+	ulint		error;
+
+	error = fts_config_get_index_ulint(
+		trx, index, FTS_OPTIMIZE_END_TIME, (ulint*) end_time);
+
+	return(error);
+}
+
+/**********************************************************************//**
+Set the optimize end time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static
+ulint
+fts_optimize_set_index_end_time(
+/*============================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index */
+	ib_time_t	end_time)		/*!< in: end time */
+{
+	ulint		error;
+
+	error = fts_config_set_index_ulint(
+		trx, index, FTS_OPTIMIZE_END_TIME, (ulint) end_time);
+
+	return(error);
+}
+#endif
+
+/**********************************************************************//**
+Free the optimize prepared statements.*/
+static
+void
+fts_optimize_graph_free(
+/*====================*/
+	fts_optimize_graph_t*	graph)	/*!< in/out: The graph instances
+					to free */
+{
+	if (graph->commit_graph) {
+		que_graph_free(graph->commit_graph);
+		graph->commit_graph = NULL;
+	}
+
+	if (graph->write_nodes_graph) {
+		que_graph_free(graph->write_nodes_graph);
+		graph->write_nodes_graph = NULL;
+	}
+
+	if (graph->delete_nodes_graph) {
+		que_graph_free(graph->delete_nodes_graph);
+		graph->delete_nodes_graph = NULL;
+	}
+
+	if (graph->read_nodes_graph) {
+		que_graph_free(graph->read_nodes_graph);
+		graph->read_nodes_graph = NULL;
+	}
+}
+
+/**********************************************************************//**
+Free all optimize resources. */
+static
+void
+fts_optimize_free(
+/*==============*/
+	fts_optimize_t*	optim)		/*!< in: table with on FTS index */
+{
+	mem_heap_t*	heap = static_cast<mem_heap_t*>(optim->self_heap->arg);
+
+	trx_free_for_background(optim->trx);
+
+	fts_doc_ids_free(optim->to_delete);
+	fts_optimize_graph_free(&optim->graph);
+
+	mem_free(optim->name_prefix);
+
+	/* This will free the heap from which optim itself was allocated. */
+	mem_heap_free(heap);
+}
+
+/**********************************************************************//**
+Get the max time optimize should run in millisecs.
+@return max optimize time limit in millisecs. */
+static
+ib_time_t
+fts_optimize_get_time_limit(
+/*========================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table)		/*!< in: aux table */
+{
+	ib_time_t	time_limit = 0;
+
+	fts_config_get_ulint(
+		trx, fts_table,
+		FTS_OPTIMIZE_LIMIT_IN_SECS, (ulint*) &time_limit);
+
+	return(time_limit * 1000);
+}
+
+
+/**********************************************************************//**
+Run OPTIMIZE on the given table. Note: this can take a very long time
+(hours). */
+static
+void
+fts_optimize_words(
+/*===============*/
+	fts_optimize_t*	optim,	/*!< in: optimize instance */
+	dict_index_t*	index,	/*!< in: current FTS being optimized */
+	fts_string_t*	word)	/*!< in: the starting word to optimize */
+{
+	fts_fetch_t	fetch;
+	ib_time_t	start_time;
+	que_t*		graph = NULL;
+	CHARSET_INFO*	charset = optim->fts_index_table.charset;
+
+	ut_a(!optim->done);
+
+	/* Get the time limit from the config table. */
+	fts_optimize_time_limit = fts_optimize_get_time_limit(
+		optim->trx, &optim->fts_common_table);
+
+	start_time = ut_time();
+
+	/* Setup the callback to use for fetching the word ilist etc. */
+	fetch.read_arg = optim->words;
+	fetch.read_record = fts_optimize_index_fetch_node;
+
+	fprintf(stderr, "%.*s\n", (int) word->f_len, word->f_str);
+
+	while(!optim->done) {
+		ulint	error;
+		trx_t*	trx = optim->trx;
+		ulint	selected;
+
+		ut_a(ib_vector_size(optim->words) == 0);
+
+		selected = fts_select_index(charset, word->f_str, word->f_len);
+
+		/* Read the index records to optimize. */
+		error = fts_index_fetch_nodes(
+			trx, &graph, &optim->fts_index_table, word,
+			&fetch);
+
+		if (error == DB_SUCCESS) {
+			/* There must be some nodes to read. */
+			ut_a(ib_vector_size(optim->words) > 0);
+
+			/* Optimize the nodes that were read and write
+			back to DB. */
+			error = fts_optimize_compact(optim, index, start_time);
+
+			if (error == DB_SUCCESS) {
+				fts_sql_commit(optim->trx);
+			} else {
+				fts_sql_rollback(optim->trx);
+			}
+		}
+
+		ib_vector_reset(optim->words);
+
+		if (error == DB_SUCCESS) {
+			if (!optim->done) {
+				if (!fts_zip_read_word(optim->zip, word)) {
+					optim->done = TRUE;
+				} else if (selected
+					   != fts_select_index(
+						charset, word->f_str,
+						word->f_len)
+					  && graph) {
+					fts_que_graph_free(graph);
+					graph = NULL;
+				}
+			}
+		} else if (error == DB_LOCK_WAIT_TIMEOUT) {
+			fprintf(stderr, "InnoDB: Warning: lock wait timeout "
+				"during optimize. Retrying!\n");
+
+			trx->error_state = DB_SUCCESS;
+		} else if (error == DB_DEADLOCK) {
+			fprintf(stderr, "InnoDB: Warning: deadlock "
+				"during optimize. Retrying!\n");
+
+			trx->error_state = DB_SUCCESS;
+		} else {
+			optim->done = TRUE;		/* Exit the loop. */
+		}
+	}
+
+	if (graph != NULL) {
+		fts_que_graph_free(graph);
+	}
+}
+
+/**********************************************************************//**
+Select the FTS index to search.
+@return TRUE if last index */
+static
+ibool
+fts_optimize_set_next_word(
+/*=======================*/
+	CHARSET_INFO*	charset,	/*!< in: charset */
+	fts_string_t*	word)		/*!< in: current last word */
+{
+	ulint		selected;
+	ibool		last = FALSE;
+
+	selected = fts_select_next_index(charset, word->f_str, word->f_len);
+
+	/* If this was the last index then reset to start. */
+	if (fts_index_selector[selected].value == 0) {
+		/* Reset the last optimized word to '' if no
+		more words could be read from the FTS index. */
+		word->f_len = 0;
+		*word->f_str = 0;
+
+		last = TRUE;
+	} else {
+		ulint	value = fts_index_selector[selected].value;
+
+		ut_a(value <= 0xff);
+
+		/* Set to the first character of the next slot. */
+		word->f_len = 1;
+		*word->f_str = (byte) value;
+	}
+
+	return(last);
+}
+
+/**********************************************************************//**
+Optimize is complete. Set the completion time, and reset the optimize
+start string for this FTS index to "".
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_index_completed(
+/*=========================*/
+	fts_optimize_t*	optim,	/*!< in: optimize instance */
+	dict_index_t*	index)	/*!< in: table with one FTS index */
+{
+	fts_string_t	word;
+	ulint		error;
+	byte		buf[sizeof(ulint)];
+#ifdef FTS_OPTIMIZE_DEBUG
+	ib_time_t	end_time = ut_time();
+
+	error = fts_optimize_set_index_end_time(optim->trx, index, end_time);
+#endif
+
+	/* If we've reached the end of the index then set the start
+	word to the empty string. */
+
+	word.f_len = 0;
+	word.f_str = buf;
+	*word.f_str = '\0';
+
+	error = fts_config_set_index_value(
+		optim->trx, index, FTS_LAST_OPTIMIZED_WORD, &word);
+
+	if (error != DB_SUCCESS) {
+
+		fprintf(stderr, "InnoDB: Error: (%lu) while "
+			"updating last optimized word!\n", error);
+	}
+
+	return(error);
+}
+
+
+/**********************************************************************//**
+Read the list of words from the FTS auxiliary index that will be
+optimized in this pass.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_index_read_words(
+/*==========================*/
+	fts_optimize_t*	optim,	/*!< in: optimize instance */
+	dict_index_t*	index,	/*!< in: table with one FTS index */
+	fts_string_t*	word)	/*!< in: buffer to use */
+{
+	ulint		error = DB_SUCCESS;
+
+	if (optim->del_list_regenerated) {
+		word->f_len = 0;
+	} else {
+
+		/* Get the last word that was optimized from
+		the config table. */
+		error = fts_config_get_index_value(
+			optim->trx, index, FTS_LAST_OPTIMIZED_WORD, word);
+	}
+
+	/* If record not found then we start from the top. */
+	if (error == DB_RECORD_NOT_FOUND) {
+		word->f_len = 0;
+		error = DB_SUCCESS;
+	}
+
+	while (error == DB_SUCCESS) {
+
+		error = fts_index_fetch_words(
+			optim, word, fts_num_word_optimize);
+
+		if (error == DB_SUCCESS) {
+
+			/* If the search returned an empty set
+			try the next index in the horizontal split. */
+			if (optim->zip->n_words > 0) {
+				break;
+			} else {
+
+				fts_optimize_set_next_word(
+					optim->fts_index_table.charset,
+					word);
+
+				if (word->f_len == 0) {
+					break;
+				}
+			}
+		}
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Run OPTIMIZE on the given FTS index. Note: this can take a very long
+time (hours).
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_index(
+/*===============*/
+	fts_optimize_t*	optim,	/*!< in: optimize instance */
+	dict_index_t*	index)	/*!< in: table with one FTS index */
+{
+	fts_string_t	word;
+	ulint		error;
+	byte		str[FTS_MAX_WORD_LEN + 1];
+
+	/* Set the current index that we have to optimize. */
+	optim->fts_index_table.index_id = index->id;
+	optim->fts_index_table.charset = fts_index_get_charset(index);
+
+	optim->done = FALSE; /* Optimize until !done */
+
+	/* We need to read the last word optimized so that we start from
+	the next word. */
+	word.f_str = str;
+
+	/* We set the length of word to the size of str since we
+	need to pass the max len info to the fts_get_config_value() function. */
+	word.f_len = sizeof(str) - 1;
+
+	memset(word.f_str, 0x0, word.f_len);
+
+	/* Read the words that will be optimized in this pass. */
+	error = fts_optimize_index_read_words(optim, index, &word);
+
+	if (error == DB_SUCCESS) {
+		int	zip_error;
+
+		ut_a(optim->zip->pos == 0);
+		ut_a(optim->zip->zp->total_in == 0);
+		ut_a(optim->zip->zp->total_out == 0);
+
+		zip_error = inflateInit(optim->zip->zp);
+		ut_a(zip_error == Z_OK);
+
+		word.f_len = 0;
+		word.f_str = str;
+
+		/* Read the first word to optimize from the Zip buffer. */
+		if (!fts_zip_read_word(optim->zip, &word)) {
+
+			optim->done = TRUE;
+		} else {
+			fts_optimize_words(optim, index, &word);
+		}
+
+		/* If we couldn't read any records then optimize is
+		complete. Increment the number of indexes that have
+		been optimized and set FTS index optimize state to
+		completed. */
+		if (error == DB_SUCCESS && optim->zip->n_words == 0) {
+
+			error = fts_optimize_index_completed(optim, index);
+
+			if (error == DB_SUCCESS) {
+				++optim->n_completed;
+			}
+		}
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Delete the document ids in the delete, and delete cache tables.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_purge_deleted_doc_ids(
+/*===============================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	ulint		i;
+	pars_info_t*	info;
+	que_t*		graph;
+	fts_update_t*	update;
+	char*		sql_str;
+	doc_id_t	write_doc_id;
+	ulint		error = DB_SUCCESS;
+
+	info = pars_info_create();
+
+	ut_a(ib_vector_size(optim->to_delete->doc_ids) > 0);
+
+	update = static_cast<fts_update_t*>(
+		ib_vector_get(optim->to_delete->doc_ids, 0));
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &write_doc_id, update->doc_id);
+
+	/* This is required for the SQL parser to work. It must be able
+	to find the following variables. So we do it twice. */
+	fts_bind_doc_id(info, "doc_id1", &write_doc_id);
+	fts_bind_doc_id(info, "doc_id2", &write_doc_id);
+
+	/* Since we only replace the table_id and don't construct the full
+	name, we do substitution ourselves. Remember to free sql_str. */
+	sql_str = ut_strreplace(
+		fts_delete_doc_ids_sql, "%s", optim->name_prefix);
+
+	graph = fts_parse_sql(NULL, info, sql_str);
+
+	mem_free(sql_str);
+
+	/* Delete the doc ids that were copied at the start. */
+	for (i = 0; i < ib_vector_size(optim->to_delete->doc_ids); ++i) {
+
+		update = static_cast<fts_update_t*>(ib_vector_get(
+			optim->to_delete->doc_ids, i));
+
+		/* Convert to "storage" byte order. */
+		fts_write_doc_id((byte*) &write_doc_id, update->doc_id);
+
+		fts_bind_doc_id(info, "doc_id1", &write_doc_id);
+
+		fts_bind_doc_id(info, "doc_id2", &write_doc_id);
+
+		error = fts_eval_sql(optim->trx, graph);
+
+		// FIXME: Check whether delete actually succeeded!
+		if (error != DB_SUCCESS) {
+
+			fts_sql_rollback(optim->trx);
+			break;
+		}
+	}
+
+	fts_que_graph_free(graph);
+
+	return(error);
+}
+
+/**********************************************************************//**
+Delete the document ids in the pending delete, and delete tables.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_purge_deleted_doc_id_snapshot(
+/*=======================================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	ulint		error;
+	que_t*		graph;
+	char*		sql_str;
+
+	/* Since we only replace the table_id and don't construct
+	the full name, we do the '%s' substitution ourselves. */
+	sql_str = ut_strreplace(fts_end_delete_sql, "%s", optim->name_prefix);
+
+	/* Delete the doc ids that were copied to delete pending state at
+	the start of optimize. */
+	graph = fts_parse_sql(NULL, NULL, sql_str);
+
+	mem_free(sql_str);
+
+	error = fts_eval_sql(optim->trx, graph);
+	fts_que_graph_free(graph);
+
+	return(error);
+}
+
+/**********************************************************************//**
+Copy the deleted doc ids that will be purged during this optimize run
+to the being deleted FTS auxiliary tables. The transaction is committed
+upon successfull copy and rolled back on DB_DUPLICATE_KEY error.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_being_deleted_count(
+/*=============================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, "BEING_DELETED", FTS_COMMON_TABLE,
+			   optim->table);
+
+	return(fts_get_rows_count(&fts_table));
+}
+
+/*********************************************************************//**
+Copy the deleted doc ids that will be purged during this optimize run
+to the being deleted FTS auxiliary tables. The transaction is committed
+upon successfull copy and rolled back on DB_DUPLICATE_KEY error.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_create_deleted_doc_id_snapshot(
+/*========================================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	ulint		error;
+	que_t*		graph;
+	char*		sql_str;
+
+	/* Since we only replace the table_id and don't construct the
+	full name, we do the substitution ourselves. */
+	sql_str = ut_strreplace(fts_init_delete_sql, "%s", optim->name_prefix);
+
+	/* Move doc_ids that are to be deleted to state being deleted. */
+	graph = fts_parse_sql(NULL, NULL, sql_str);
+
+	mem_free(sql_str);
+
+	error = fts_eval_sql(optim->trx, graph);
+
+	fts_que_graph_free(graph);
+
+	if (error != DB_SUCCESS) {
+		fts_sql_rollback(optim->trx);
+	} else {
+		fts_sql_commit(optim->trx);
+	}
+
+	optim->del_list_regenerated = TRUE;
+
+	return(error);
+}
+
+/*********************************************************************//**
+Read in the document ids that are to be purged during optimize. The
+transaction is committed upon successfully read.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_read_deleted_doc_id_snapshot(
+/*======================================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	ulint		error;
+
+	optim->fts_common_table.suffix = "BEING_DELETED";
+
+	/* Read the doc_ids to delete. */
+	error = fts_table_fetch_doc_ids(
+		optim->trx, &optim->fts_common_table, optim->to_delete);
+
+	if (error == DB_SUCCESS) {
+
+		optim->fts_common_table.suffix = "BEING_DELETED_CACHE";
+
+		/* Read additional doc_ids to delete. */
+		error = fts_table_fetch_doc_ids(
+			optim->trx, &optim->fts_common_table, optim->to_delete);
+	}
+
+	if (error != DB_SUCCESS) {
+
+		fts_doc_ids_free(optim->to_delete);
+		optim->to_delete = NULL;
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Optimze all the FTS indexes, skipping those that have already been
+optimized, since the FTS auxiliary indexes are not guaranteed to be
+of the same cardinality.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_indexes(
+/*=================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	ulint		i;
+	ulint		error = DB_SUCCESS;
+	fts_t*		fts = optim->table->fts;
+
+	/* Optimize the FTS indexes. */
+	for (i = 0; i < ib_vector_size(fts->indexes); ++i) {
+		dict_index_t*	index;
+
+#ifdef	FTS_OPTIMIZE_DEBUG
+		ib_time_t	end_time;
+		ib_time_t	start_time;
+
+		/* Get the start and end optimize times for this index. */
+		error = fts_optimize_get_index_start_time(
+			optim->trx, index, &start_time);
+
+		if (error != DB_SUCCESS) {
+			break;
+		}
+
+		error = fts_optimize_get_index_end_time(
+			optim->trx, index, &end_time);
+
+		if (error != DB_SUCCESS) {
+			break;
+		}
+
+		/* Start time will be 0 only for the first time or after
+		completing the optimization of all FTS indexes. */
+		if (start_time == 0) {
+			start_time = ut_time();
+
+			error = fts_optimize_set_index_start_time(
+				optim->trx, index, start_time);
+		}
+
+		/* Check if this index needs to be optimized or not. */
+		if (ut_difftime(end_time, start_time) < 0) {
+			error = fts_optimize_index(optim, index);
+
+			if (error != DB_SUCCESS) {
+				break;
+			}
+		} else {
+			++optim->n_completed;
+		}
+#endif
+		index = static_cast<dict_index_t*>(
+			ib_vector_getp(fts->indexes, i));
+		error = fts_optimize_index(optim, index);
+	}
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(optim->trx);
+	} else {
+		fts_sql_rollback(optim->trx);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Cleanup the snapshot tables and the master deleted table.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_purge_snapshot(
+/*========================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	ulint		error;
+
+	/* Delete the doc ids from the master deleted tables, that were
+	in the snapshot that was taken at the start of optimize. */
+	error = fts_optimize_purge_deleted_doc_ids(optim);
+
+	if (error == DB_SUCCESS) {
+		/* Destroy the deleted doc id snapshot. */
+		error = fts_optimize_purge_deleted_doc_id_snapshot(optim);
+	}
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(optim->trx);
+	} else {
+		fts_sql_rollback(optim->trx);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Reset the start time to 0 so that a new optimize can be started.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_reset_start_time(
+/*==========================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	ulint		error = DB_SUCCESS;
+#ifdef FTS_OPTIMIZE_DEBUG
+	fts_t*		fts = optim->table->fts;
+
+	/* Optimization should have been completed for all indexes. */
+	ut_a(optim->n_completed == ib_vector_size(fts->indexes));
+
+	for (uint i = 0; i < ib_vector_size(fts->indexes); ++i) {
+		dict_index_t*	index;
+
+		ib_time_t	start_time = 0;
+
+		/* Reset the start time to 0 for this index. */
+		error = fts_optimize_set_index_start_time(
+			optim->trx, index, start_time);
+
+		index = static_cast<dict_index_t*>(
+			ib_vector_getp(fts->indexes, i));
+	}
+#endif
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(optim->trx);
+	} else {
+		fts_sql_rollback(optim->trx);
+	}
+
+	return(error);
+}
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table by a background thread.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_optimize_table_bk(
+/*==================*/
+	fts_slot_t*	slot)	/*!< in: table to optimiza */
+{
+	ulint		error;
+	dict_table_t*	table = slot->table;
+	fts_t*		fts = table->fts;
+
+	/* Avoid optimizing tables that were optimized recently. */
+	if (slot->last_run > 0
+	    && (ut_time() - slot->last_run) < slot->interval_time) {
+
+		return(DB_SUCCESS);
+
+	} else if (fts && fts->cache
+		   && fts->cache->deleted >= FTS_OPTIMIZE_THRESHOLD) {
+
+		error = fts_optimize_table(table);
+
+		if (error == DB_SUCCESS) {
+			slot->state = FTS_STATE_DONE;
+			slot->last_run = 0;
+			slot->completed = ut_time();
+		}
+	} else {
+		error = DB_SUCCESS;
+	}
+
+	/* Note time this run completed. */
+	slot->last_run = ut_time();
+
+	return(error);
+}
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+fts_optimize_table(
+/*===============*/
+	dict_table_t*	table)	/*!< in: table to optimiza */
+{
+	ulint		error = DB_SUCCESS;
+	fts_optimize_t*	optim = NULL;
+	fts_t*		fts = table->fts;
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: FTS start optimize %s\n", table->name);
+
+	optim = fts_optimize_create(table);
+
+	// FIXME: Call this only at the start of optimize, currently we
+	// rely on DB_DUPLICATE_KEY to handle corrupting the snapshot.
+
+	/* Check whether there are still records in BEING_DELETED table */
+	if (fts_optimize_being_deleted_count(optim) == 0) {
+		/* Take a snapshot of the deleted document ids, they are copied
+		to the BEING_ tables. */
+		error = fts_optimize_create_deleted_doc_id_snapshot(optim);
+	}
+
+	/* A duplicate error is OK, since we don't erase the
+	doc ids from the being deleted state until all FTS
+	indexes have been optimized. */
+	if (error == DB_DUPLICATE_KEY) {
+		error = DB_SUCCESS;
+	}
+
+	if (error == DB_SUCCESS) {
+
+		/* These document ids will be filtered out during the
+		index optimization phase. They are in the snapshot that we
+		took above, at the start of the optimize. */
+		error = fts_optimize_read_deleted_doc_id_snapshot(optim);
+
+		if (error == DB_SUCCESS) {
+
+			/* Commit the read of being deleted
+			doc ids transaction. */
+			fts_sql_commit(optim->trx);
+
+			/* We would do optimization only if there
+			are deleted records to be cleaned up */
+			if (ib_vector_size(optim->to_delete->doc_ids) > 0) {
+				error = fts_optimize_indexes(optim);
+			}
+
+		} else {
+			ut_a(optim->to_delete == NULL);
+		}
+
+		/* Only after all indexes have been optimized can we
+		delete the (snapshot) doc ids in the pending delete,
+		and master deleted tables. */
+		if (error == DB_SUCCESS
+		    && optim->n_completed == ib_vector_size(fts->indexes)) {
+
+			if (fts_enable_diag_print) {
+				fprintf(stderr, "FTS_OPTIMIZE: Completed "
+						"Optimize, cleanup DELETED "
+						"table\n");
+			}
+
+			if (ib_vector_size(optim->to_delete->doc_ids) > 0) {
+
+				/* Purge the doc ids that were in the
+				snapshot from the snapshot tables and
+				the master deleted table. */
+				error = fts_optimize_purge_snapshot(optim);
+			}
+
+			if (error == DB_SUCCESS) {
+				/* Reset the start time of all the FTS indexes
+				so that optimize can be restarted. */
+				error = fts_optimize_reset_start_time(optim);
+			}
+		}
+	}
+
+	fts_optimize_free(optim);
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: FTS end optimize %s\n", table->name);
+
+	return(error);
+}
+
+/********************************************************************//**
+Add the table to add to the OPTIMIZER's list.
+@return new message instance */
+static
+fts_msg_t*
+fts_optimize_create_msg(
+/*====================*/
+	fts_msg_type_t	type,		/*!< in: type of message */
+	void*		ptr)		/*!< in: message payload */
+{
+	mem_heap_t*	heap;
+	fts_msg_t*	msg;
+
+	heap = mem_heap_create(sizeof(*msg) + sizeof(ib_list_node_t) + 16);
+	msg = static_cast<fts_msg_t*>(mem_heap_alloc(heap, sizeof(*msg)));
+
+	msg->ptr = ptr;
+	msg->type = type;
+	msg->heap = heap;
+
+	return(msg);
+}
+
+/**********************************************************************//**
+Add the table to add to the OPTIMIZER's list. */
+UNIV_INTERN
+void
+fts_optimize_add_table(
+/*===================*/
+	dict_table_t*	table)			/*!< in: table to add */
+{
+	fts_msg_t*	msg;
+
+	if (!fts_optimize_wq) {
+		return;
+	}
+
+	msg = fts_optimize_create_msg(FTS_MSG_ADD_TABLE, table);
+
+	ib_wqueue_add(fts_optimize_wq, msg, msg->heap);
+}
+
+/**********************************************************************//**
+Optimize a table. */
+UNIV_INTERN
+void
+fts_optimize_do_table(
+/*==================*/
+	dict_table_t*	table)			/*!< in: table to optimize */
+{
+	fts_msg_t*	msg;
+
+	/* Optimizer thread could be shutdown */
+	if (!fts_optimize_wq) {
+		return;
+	}
+
+	msg = fts_optimize_create_msg(FTS_MSG_OPTIMIZE_TABLE, table);
+
+	ib_wqueue_add(fts_optimize_wq, msg, msg->heap);
+}
+
+/**********************************************************************//**
+Remove the table from the OPTIMIZER's list. We do wait for
+acknowledgement from the consumer of the message. */
+UNIV_INTERN
+void
+fts_optimize_remove_table(
+/*======================*/
+	dict_table_t*	table)			/*!< in: table to remove */
+{
+	fts_msg_t*	msg;
+	os_event_t		event;
+	fts_msg_del_t* remove;
+
+	/* if the optimize system not yet initialized, return */
+	if (!fts_optimize_wq) {
+		return;
+	}
+
+	msg = fts_optimize_create_msg(FTS_MSG_DEL_TABLE, NULL);
+
+	/* We will wait on this event until signalled by the consumer. */
+	event = os_event_create(table->name);
+
+	remove = static_cast<fts_msg_del_t*>(
+		mem_heap_alloc(msg->heap, sizeof(*remove)));
+
+	remove->table = table;
+	remove->event = event;
+	msg->ptr = remove;
+
+	ib_wqueue_add(fts_optimize_wq, msg, msg->heap);
+
+	os_event_wait(event);
+
+	os_event_free(event);
+}
+
+/**********************************************************************//**
+Find the slot for a particular table.
+@return slot if found else NULL. */
+static
+fts_slot_t*
+fts_optimize_find_slot(
+/*===================*/
+	ib_vector_t*		tables,		/*!< in: vector of tables */
+	const dict_table_t*	table)		/*!< in: table to add */
+{
+	ulint		i;
+
+	for (i = 0; i < ib_vector_size(tables); ++i) {
+		fts_slot_t*	slot;
+
+		slot = static_cast<fts_slot_t*>(ib_vector_get(tables, i));
+
+		if (slot->table->id == table->id) {
+			return(slot);
+		}
+	}
+
+	return(NULL);
+}
+
+/**********************************************************************//**
+Start optimizing table. */
+static
+void
+fts_optimize_start_table(
+/*=====================*/
+	ib_vector_t*		tables,		/*!< in/out: vector of tables */
+	dict_table_t*		table)		/*!< in: table to optimize */
+{
+	fts_slot_t*	slot;
+
+	slot = fts_optimize_find_slot(tables, table);
+
+	if (slot == NULL) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Error: table %s not registered "
+			"with the optimize thread.\n", table->name);
+	} else {
+		slot->last_run = 0;
+		slot->completed = 0;
+	}
+}
+
+/**********************************************************************//**
+Add the table to the vector if it doesn't already exist. */
+static
+ibool
+fts_optimize_new_table(
+/*===================*/
+	ib_vector_t*	tables,			/*!< in/out: vector of tables */
+	dict_table_t*	table)			/*!< in: table to add */
+{
+	ulint		i;
+	fts_slot_t*	slot;
+	ulint		empty_slot = ULINT_UNDEFINED;
+
+	/* Search for duplicates, also find a free slot if one exists. */
+	for (i = 0; i < ib_vector_size(tables); ++i) {
+
+		slot = static_cast<fts_slot_t*>(
+			ib_vector_get(tables, i));
+
+		if (slot->state == FTS_STATE_EMPTY) {
+			empty_slot = i;
+		} else if (slot->table->id == table->id) {
+			/* Already exists in our optimize queue. */
+			return(FALSE);
+		}
+	}
+
+	/* Reuse old slot. */
+	if (empty_slot != ULINT_UNDEFINED) {
+
+		slot = static_cast<fts_slot_t*>(
+			ib_vector_get(tables, empty_slot));
+
+		ut_a(slot->state == FTS_STATE_EMPTY);
+
+	} else { /* Create a new slot. */
+
+		slot = static_cast<fts_slot_t*>(ib_vector_push(tables, NULL));
+	}
+
+	memset(slot, 0x0, sizeof(*slot));
+
+	slot->table = table;
+	slot->state = FTS_STATE_LOADED;
+	slot->interval_time = FTS_OPTIMIZE_INTERVAL_IN_SECS;
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Remove the table from the vector if it exists. */
+static
+ibool
+fts_optimize_del_table(
+/*===================*/
+	ib_vector_t*	tables,			/*!< in/out: vector of tables */
+	fts_msg_del_t*	msg)			/*!< in: table to delete */
+{
+	ulint		i;
+	dict_table_t*	table = msg->table;
+
+	for (i = 0; i < ib_vector_size(tables); ++i) {
+		fts_slot_t*	slot;
+
+		slot = static_cast<fts_slot_t*>(ib_vector_get(tables, i));
+
+		/* FIXME: Should we assert on this ? */
+		if (slot->state != FTS_STATE_EMPTY
+		    && slot->table->id == table->id) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: FTS Optimize Removing "
+				"table %s\n", table->name);
+
+			slot->table = NULL;
+			slot->state = FTS_STATE_EMPTY;
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Calculate how many of the registered tables need to be optimized.
+@return no. of tables to optimize */
+static
+ulint
+fts_optimize_how_many(
+/*==================*/
+	const ib_vector_t*	tables)		/*!< in: registered tables
+						vector*/
+{
+	ulint		i;
+	ib_time_t	delta;
+	ulint		n_tables = 0;
+	ib_time_t	current_time;
+
+	current_time = ut_time();
+
+	for (i = 0; i < ib_vector_size(tables); ++i) {
+		const fts_slot_t*	slot;
+
+		slot = static_cast<const fts_slot_t*>(
+			ib_vector_get_const(tables, i));
+
+		switch (slot->state) {
+		case FTS_STATE_DONE:
+		case FTS_STATE_LOADED:
+			ut_a(slot->completed <= current_time);
+
+			delta = current_time - slot->completed;
+
+			/* Skip slots that have been optimized recently. */
+			if (delta >= slot->interval_time) {
+				++n_tables;
+			}
+			break;
+
+		case FTS_STATE_RUNNING:
+			ut_a(slot->last_run <= current_time);
+
+			delta = current_time - slot->last_run;
+
+			if (delta > slot->interval_time) {
+				++n_tables;
+			}
+			break;
+
+			/* Slots in a state other than the above
+			are ignored. */
+		case FTS_STATE_EMPTY:
+		case FTS_STATE_SUSPENDED:
+			break;
+		}
+
+	}
+
+	return(n_tables);
+}
+
+#if 0
+/*********************************************************************//**
+Check whether a table needs to be optimized. */
+static
+void
+fts_optimize_need_sync(
+/*===================*/
+	ib_vector_t*	tables)	/*!< in: list of tables */
+{
+	dict_table_t*	table = NULL;
+	fts_slot_t*	slot;
+	ulint		num_table = ib_vector_size(tables);
+
+	if (!num_table) {
+		return;
+	}
+
+	if (fts_optimize_sync_iterator >= num_table) {
+		fts_optimize_sync_iterator = 0;
+	}
+
+	slot = ib_vector_get(tables, fts_optimize_sync_iterator);
+	table = slot->table;
+
+	if (!table) {
+		return;
+	}
+
+	ut_ad(table->fts);
+
+	if (table->fts->cache) {
+		ulint	deleted = table->fts->cache->deleted;
+
+		if (table->fts->cache->added
+		    >= fts_optimize_add_threshold) {
+			fts_sync_table(table);
+		} else if (deleted >= fts_optimize_delete_threshold) {
+			fts_optimize_do_table(table);
+
+			mutex_enter(&table->fts->cache->deleted_lock);
+			table->fts->cache->deleted -= deleted;
+			mutex_exit(&table->fts->cache->deleted_lock);
+		}
+	}
+
+	fts_optimize_sync_iterator++;
+
+	return;
+}
+#endif
+
+/**********************************************************************//**
+Optimize all FTS tables.
+@return Dummy return */
+UNIV_INTERN
+os_thread_ret_t
+fts_optimize_thread(
+/*================*/
+	void*		arg)			/*!< in: work queue*/
+{
+	mem_heap_t*	heap;
+	ib_vector_t*	tables;
+	ib_alloc_t*	heap_alloc;
+	ulint		current = 0;
+	ibool		done = FALSE;
+	ulint		n_tables = 0;
+	os_event_t	exit_event = 0;
+	ulint		n_optimize = 0;
+	ib_wqueue_t*	wq = (ib_wqueue_t*) arg;
+
+	heap = mem_heap_create(sizeof(dict_table_t*) * 64);
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	tables = ib_vector_create(heap_alloc, sizeof(fts_slot_t), 4);
+
+	while(!done && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+		/* If there is no message in the queue and we have tables
+		to optimize then optimize the tables. */
+
+		if (!done
+		    && ib_wqueue_is_empty(wq)
+		    && n_tables > 0
+		    && n_optimize > 0) {
+
+			fts_slot_t*	slot;
+
+			ut_a(ib_vector_size(tables) > 0);
+
+			slot = static_cast<fts_slot_t*>(
+				ib_vector_get(tables, current));
+
+			/* Handle the case of empty slots. */
+			if (slot->state != FTS_STATE_EMPTY) {
+
+				slot->state = FTS_STATE_RUNNING;
+
+				fts_optimize_table_bk(slot);
+			}
+
+			++current;
+
+			/* Wrap around the counter. */
+			if (current >= ib_vector_size(tables)) {
+				n_optimize = fts_optimize_how_many(tables);
+
+				current = 0;
+			}
+
+		} else if (n_optimize == 0 || !ib_wqueue_is_empty(wq)) {
+			fts_msg_t*	msg;
+
+			msg = static_cast<fts_msg_t*>(
+				ib_wqueue_timedwait(wq,
+						    FTS_QUEUE_WAIT_IN_USECS));
+
+			/* Timeout ? */
+			if (msg == NULL) {
+				continue;
+			}
+
+			switch (msg->type) {
+			case FTS_MSG_START:
+				break;
+
+			case FTS_MSG_PAUSE:
+				break;
+
+			case FTS_MSG_STOP:
+				done = TRUE;
+				exit_event = (os_event_t) msg->ptr;
+				break;
+
+			case FTS_MSG_ADD_TABLE:
+				ut_a(!done);
+				if (fts_optimize_new_table(
+					tables,
+					static_cast<dict_table_t*>(
+					msg->ptr))) {
+					++n_tables;
+				}
+				break;
+
+			case FTS_MSG_OPTIMIZE_TABLE:
+				if (!done) {
+					fts_optimize_start_table(
+						tables,
+						static_cast<dict_table_t*>(
+						msg->ptr));
+				}
+				break;
+
+			case FTS_MSG_DEL_TABLE:
+				if (fts_optimize_del_table(
+					tables, static_cast<fts_msg_del_t*>(
+						msg->ptr))) {
+					--n_tables;
+				}
+
+				/* Signal the producer that we have
+				removed the table. */
+				os_event_set(
+					((fts_msg_del_t*) msg->ptr)->event);
+				break;
+
+			default:
+				ut_error;
+			}
+
+			mem_heap_free(msg->heap);
+
+			if (!done) {
+				n_optimize = fts_optimize_how_many(tables);
+			} else {
+				n_optimize = 0;
+			}
+		}
+	}
+
+	/* Server is being shutdown, sync the data from FTS cache to disk
+	if needed */
+	if (n_tables > 0) {
+		ulint	i;
+
+		for (i = 0; i < ib_vector_size(tables); i++) {
+			fts_slot_t*	slot;
+
+			slot = static_cast<fts_slot_t*>(
+				ib_vector_get(tables, i));
+
+			if (slot->state != FTS_STATE_EMPTY) {
+				dict_table_t*	table;
+
+			        table = dict_table_open_on_name_no_stats(
+					slot->table->name, FALSE,
+					DICT_ERR_IGNORE_INDEX_ROOT);
+
+				if (table) {
+
+					if (dict_table_has_fts_index(table)) {
+						fts_sync_table(table);
+					}
+
+					fts_free(table);
+					dict_table_close(table, FALSE);
+				}
+			}
+		}
+	}
+
+	ib_vector_free(tables);
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: FTS optimize thread exiting.\n");
+
+	ib_wqueue_free(wq);
+
+	os_event_set(exit_event);
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. */
+UNIV_INTERN
+void
+fts_optimize_init(void)
+/*===================*/
+{
+	/* For now we only support one optimize thread. */
+	ut_a(fts_optimize_wq == NULL);
+
+	fts_optimize_wq = ib_wqueue_create();
+	ut_a(fts_optimize_wq != NULL);
+
+	os_thread_create(fts_optimize_thread, fts_optimize_wq, NULL);
+}
+
+/**********************************************************************//**
+Check whether the work queue is initialized.
+@return TRUE if optimze queue is initialized. */
+UNIV_INTERN
+ibool
+fts_optimize_is_init(void)
+/*======================*/
+{
+	return(fts_optimize_wq != NULL);
+}
+
+/**********************************************************************//**
+Signal the optimize thread to prepare for shutdown. */
+
+void
+fts_optimize_start_shutdown(void)
+/*=============================*/
+{
+	fts_msg_t*	msg;
+	os_event_t	event;
+
+	/* We tell the OPTIMIZE thread to switch to state done, we
+	can't delete the work queue here because the add thread needs
+	deregister the FTS tables. */
+	event = os_event_create(NULL);
+
+	msg = fts_optimize_create_msg(FTS_MSG_STOP, NULL);
+	msg->ptr = event;
+
+	ib_wqueue_add(fts_optimize_wq, msg, msg->heap);
+
+	os_event_wait(event);
+	os_event_free(event);
+}
+
+/**********************************************************************//**
+Reset the work queue. */
+
+void
+fts_optimize_end(void)
+/*==================*/
+{
+	// FIXME: Potential race condition here: We should wait for
+	// the optimize thread to confirm shutdown.
+	fts_optimize_wq = NULL;
+}
diff --git a/storage/innobase/fts/fts0pars.cc b/storage/innobase/fts/fts0pars.cc
new file mode 100644
index 00000000000..4fdfff5ca42
--- /dev/null
+++ b/storage/innobase/fts/fts0pars.cc
@@ -0,0 +1,1912 @@
+
+/* A Bison parser, made by GNU Bison 2.4.1.  */
+
+/* Skeleton implementation for Bison's Yacc-like parsers in C
+
+      Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+   Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+   simplifying the original so-called "semantic" parser.  */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+   infringing on user name space.  This should be done even for local
+   variables, as they might otherwise be expanded by user macros.
+   There are some unavoidable exceptions within include files to
+   define necessary library symbols; they are noted "INFRINGES ON
+   USER NAME SPACE" below.  */
+
+/* Identify Bison output.  */
+#define YYBISON 1
+
+/* Bison version.  */
+#define YYBISON_VERSION "2.4.1"
+
+/* Skeleton name.  */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers.  */
+#define YYPURE 1
+
+/* Push parsers.  */
+#define YYPUSH 0
+
+/* Pull parsers.  */
+#define YYPULL 1
+
+/* Using locations.  */
+#define YYLSP_NEEDED 0
+
+/* Substitute the variable and function names.  */
+#define yyparse         ftsparse
+#define yylex           ftslex
+#define yyerror         ftserror
+#define yylval          ftslval
+#define yychar          ftschar
+#define yydebug         ftsdebug
+#define yynerrs         ftsnerrs
+
+
+/* Copy the first part of user declarations.  */
+
+/* Line 189 of yacc.c  */
+#line 26 "fts0pars.y"
+
+
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+
+extern	int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern	int fts_blexer(YYSTYPE*, yyscan_t);
+extern	int fts_tlexer(YYSTYPE*, yyscan_t);
+
+typedef int (*fts_scan)();
+
+extern int ftserror(const char* p);
+
+/* Required for reentrant parser */
+#define ftslex	fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+typedef	int	(*fts_scanner_alt)(YYSTYPE* val, yyscan_t yyscanner);
+typedef	int	(*fts_scanner)();
+
+struct fts_lexer_struct {
+	fts_scanner	scanner;
+	void*		yyscanner;
+};
+
+
+
+/* Line 189 of yacc.c  */
+#line 117 "fts0pars.cc"
+
+/* Enabling traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages.  */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+/* Enabling the token table.  */
+#ifndef YYTOKEN_TABLE
+# define YYTOKEN_TABLE 0
+#endif
+
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     FTS_OPER = 258,
+     FTS_TEXT = 259,
+     FTS_TERM = 260,
+     FTS_NUMB = 261
+   };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 214 of yacc.c  */
+#line 61 "fts0pars.y"
+
+	int		oper;
+	char*		token;
+	fts_ast_node_t*	node;
+
+
+
+/* Line 214 of yacc.c  */
+#line 167 "fts0pars.cc"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+/* Copy the second part of user declarations.  */
+
+
+/* Line 264 of yacc.c  */
+#line 179 "fts0pars.cc"
+
+#ifdef short
+# undef short
+#endif
+
+#ifdef YYTYPE_UINT8
+typedef YYTYPE_UINT8 yytype_uint8;
+#else
+typedef unsigned char yytype_uint8;
+#endif
+
+#ifdef YYTYPE_INT8
+typedef YYTYPE_INT8 yytype_int8;
+#elif (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+typedef signed char yytype_int8;
+#else
+typedef short int yytype_int8;
+#endif
+
+#ifdef YYTYPE_UINT16
+typedef YYTYPE_UINT16 yytype_uint16;
+#else
+typedef unsigned short int yytype_uint16;
+#endif
+
+#ifdef YYTYPE_INT16
+typedef YYTYPE_INT16 yytype_int16;
+#else
+typedef short int yytype_int16;
+#endif
+
+#ifndef YYSIZE_T
+# ifdef __SIZE_TYPE__
+#  define YYSIZE_T __SIZE_TYPE__
+# elif defined size_t
+#  define YYSIZE_T size_t
+# elif ! defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+#  include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYSIZE_T size_t
+# else
+#  define YYSIZE_T unsigned int
+# endif
+#endif
+
+#define YYSIZE_MAXIMUM ((YYSIZE_T) -1)
+
+#ifndef YY_
+# if YYENABLE_NLS
+#  if ENABLE_NLS
+#   include <libintl.h> /* INFRINGES ON USER NAME SPACE */
+#   define YY_(msgid) dgettext ("bison-runtime", msgid)
+#  endif
+# endif
+# ifndef YY_
+#  define YY_(msgid) msgid
+# endif
+#endif
+
+/* Suppress unused-variable warnings by "using" E.  */
+#if ! defined lint || defined __GNUC__
+# define YYUSE(e) ((void) (e))
+#else
+# define YYUSE(e) /* empty */
+#endif
+
+/* Identity function, used to suppress warnings about constant conditions.  */
+#ifndef lint
+# define YYID(n) (n)
+#else
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static int
+YYID (int yyi)
+#else
+static int
+YYID (yyi)
+    int yyi;
+#endif
+{
+  return yyi;
+}
+#endif
+
+#if ! defined yyoverflow || YYERROR_VERBOSE
+
+/* The parser invokes alloca or malloc; define the necessary symbols.  */
+
+# ifdef YYSTACK_USE_ALLOCA
+#  if YYSTACK_USE_ALLOCA
+#   ifdef __GNUC__
+#    define YYSTACK_ALLOC __builtin_alloca
+#   elif defined __BUILTIN_VA_ARG_INCR
+#    include <alloca.h> /* INFRINGES ON USER NAME SPACE */
+#   elif defined _AIX
+#    define YYSTACK_ALLOC __alloca
+#   elif defined _MSC_VER
+#    include <malloc.h> /* INFRINGES ON USER NAME SPACE */
+#    define alloca _alloca
+#   else
+#    define YYSTACK_ALLOC alloca
+#    if ! defined _ALLOCA_H && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+#     include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+#     ifndef _STDLIB_H
+#      define _STDLIB_H 1
+#     endif
+#    endif
+#   endif
+#  endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+   /* Pacify GCC's `empty if-body' warning.  */
+#  define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0))
+#  ifndef YYSTACK_ALLOC_MAXIMUM
+    /* The OS might guarantee only one guard page at the bottom of the stack,
+       and a page size can be as small as 4096 bytes.  So we cannot safely
+       invoke alloca (N) if N exceeds 4096.  Use a slightly smaller number
+       to allow for a few compiler-allocated temporary stack slots.  */
+#   define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */
+#  endif
+# else
+#  define YYSTACK_ALLOC YYMALLOC
+#  define YYSTACK_FREE YYFREE
+#  ifndef YYSTACK_ALLOC_MAXIMUM
+#   define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM
+#  endif
+#  if (defined __cplusplus && ! defined _STDLIB_H \
+       && ! ((defined YYMALLOC || defined malloc) \
+	     && (defined YYFREE || defined free)))
+#   include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+#   ifndef _STDLIB_H
+#    define _STDLIB_H 1
+#   endif
+#  endif
+#  ifndef YYMALLOC
+#   define YYMALLOC malloc
+#   if ! defined malloc && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */
+#   endif
+#  endif
+#  ifndef YYFREE
+#   define YYFREE free
+#   if ! defined free && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+void free (void*); /* INFRINGES ON USER NAME SPACE */
+#   endif
+#  endif
+# endif
+#endif /* ! defined yyoverflow || YYERROR_VERBOSE */
+
+
+#if (! defined yyoverflow \
+     && (! defined __cplusplus \
+	 || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member.  */
+union yyalloc
+{
+  yytype_int16 yyss_alloc;
+  YYSTYPE yyvs_alloc;
+};
+
+/* The size of the maximum gap between one aligned stack and the next.  */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large to enough to hold all stacks, each with
+   N elements.  */
+# define YYSTACK_BYTES(N) \
+     ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \
+      + YYSTACK_GAP_MAXIMUM)
+
+/* Copy COUNT objects from FROM to TO.  The source and destination do
+   not overlap.  */
+# ifndef YYCOPY
+#  if defined __GNUC__ && 1 < __GNUC__
+#   define YYCOPY(To, From, Count) \
+      __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
+#  else
+#   define YYCOPY(To, From, Count)		\
+      do					\
+	{					\
+	  YYSIZE_T yyi;				\
+	  for (yyi = 0; yyi < (Count); yyi++)	\
+	    (To)[yyi] = (From)[yyi];		\
+	}					\
+      while (YYID (0))
+#  endif
+# endif
+
+/* Relocate STACK from its old location to the new one.  The
+   local variables YYSIZE and YYSTACKSIZE give the old and new number of
+   elements in the stack, and YYPTR gives the new location of the
+   stack.  Advance YYPTR to a properly aligned location for the next
+   stack.  */
+# define YYSTACK_RELOCATE(Stack_alloc, Stack)				\
+    do									\
+      {									\
+	YYSIZE_T yynewbytes;						\
+	YYCOPY (&yyptr->Stack_alloc, Stack, yysize);			\
+	Stack = &yyptr->Stack_alloc;					\
+	yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+	yyptr += yynewbytes / sizeof (*yyptr);				\
+      }									\
+    while (YYID (0))
+
+#endif
+
+/* YYFINAL -- State number of the termination state.  */
+#define YYFINAL  3
+/* YYLAST -- Last index in YYTABLE.  */
+#define YYLAST   43
+
+/* YYNTOKENS -- Number of terminals.  */
+#define YYNTOKENS  16
+/* YYNNTS -- Number of nonterminals.  */
+#define YYNNTS  8
+/* YYNRULES -- Number of rules.  */
+#define YYNRULES  23
+/* YYNRULES -- Number of states.  */
+#define YYNSTATES  31
+
+/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX.  */
+#define YYUNDEFTOK  2
+#define YYMAXUTOK   261
+
+#define YYTRANSLATE(YYX)						\
+  ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX.  */
+static const yytype_uint8 yytranslate[] =
+{
+       0,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+      12,    13,    14,     7,     2,     8,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+      10,     2,    11,     2,    15,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     9,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     1,     2,     3,     4,
+       5,     6
+};
+
+#if YYDEBUG
+/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
+   YYRHS.  */
+static const yytype_uint8 yyprhs[] =
+{
+       0,     0,     3,     5,     6,     9,    12,    16,    21,    23,
+      25,    28,    32,    36,    39,    44,    47,    49,    51,    53,
+      55,    57,    59,    61
+};
+
+/* YYRHS -- A `-1'-separated list of the rules' RHS.  */
+static const yytype_int8 yyrhs[] =
+{
+      17,     0,    -1,    18,    -1,    -1,    18,    20,    -1,    18,
+      19,    -1,    12,    18,    13,    -1,    21,    12,    18,    13,
+      -1,    22,    -1,    23,    -1,    22,    14,    -1,    23,    15,
+       6,    -1,    21,    22,    14,    -1,    21,    22,    -1,    21,
+      23,    15,     6,    -1,    21,    23,    -1,     8,    -1,     7,
+      -1,     9,    -1,    10,    -1,    11,    -1,     5,    -1,     6,
+      -1,     4,    -1
+};
+
+/* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
+static const yytype_uint8 yyrline[] =
+{
+       0,    79,    79,    85,    89,    99,   111,   115,   124,   128,
+     132,   136,   141,   147,   152,   159,   165,   169,   173,   177,
+     181,   186,   191,   198
+};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+   First, the terminals, then, starting at YYNTOKENS, nonterminals.  */
+static const char *const yytname[] =
+{
+  "$end", "error", "$undefined", "FTS_OPER", "FTS_TEXT", "FTS_TERM",
+  "FTS_NUMB", "'+'", "'-'", "'~'", "'<'", "'>'", "'('", "')'", "'*'",
+  "'@'", "$accept", "query", "expr_lst", "sub_expr", "expr", "prefix",
+  "term", "text", 0
+};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
+   token YYLEX-NUM.  */
+static const yytype_uint16 yytoknum[] =
+{
+       0,   256,   257,   258,   259,   260,   261,    43,    45,   126,
+      60,    62,    40,    41,    42,    64
+};
+# endif
+
+/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives.  */
+static const yytype_uint8 yyr1[] =
+{
+       0,    16,    17,    18,    18,    18,    19,    19,    20,    20,
+      20,    20,    20,    20,    20,    20,    21,    21,    21,    21,
+      21,    22,    22,    23
+};
+
+/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN.  */
+static const yytype_uint8 yyr2[] =
+{
+       0,     2,     1,     0,     2,     2,     3,     4,     1,     1,
+       2,     3,     3,     2,     4,     2,     1,     1,     1,     1,
+       1,     1,     1,     1
+};
+
+/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state
+   STATE-NUM when YYTABLE doesn't specify something else to do.  Zero
+   means the default is an error.  */
+static const yytype_uint8 yydefact[] =
+{
+       3,     0,     2,     1,    23,    21,    22,    17,    16,    18,
+      19,    20,     3,     5,     4,     0,     8,     9,     0,     3,
+      13,    15,    10,     0,     6,     0,    12,     0,    11,     7,
+      14
+};
+
+/* YYDEFGOTO[NTERM-NUM].  */
+static const yytype_int8 yydefgoto[] =
+{
+      -1,     1,     2,    13,    14,    15,    16,    17
+};
+
+/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+   STATE-NUM.  */
+#define YYPACT_NINF -5
+static const yytype_int8 yypact[] =
+{
+      -5,    33,    16,    -5,    -5,    -5,    -5,    -5,    -5,    -5,
+      -5,    -5,    -5,    -5,    -5,    25,    21,    19,    -4,    -5,
+      22,    23,    -5,    34,    -5,     6,    -5,    35,    -5,    -5,
+      -5
+};
+
+/* YYPGOTO[NTERM-NUM].  */
+static const yytype_int8 yypgoto[] =
+{
+      -5,    -5,    20,    -5,    -5,    -5,    27,    28
+};
+
+/* YYTABLE[YYPACT[STATE-NUM]].  What to do in state STATE-NUM.  If
+   positive, shift that token.  If negative, reduce the rule which
+   number is the opposite.  If zero, do what YYDEFACT says.
+   If YYTABLE_NINF, syntax error.  */
+#define YYTABLE_NINF -1
+static const yytype_uint8 yytable[] =
+{
+       4,     5,     6,     7,     8,     9,    10,    11,    12,    24,
+       4,     5,     6,     7,     8,     9,    10,    11,    12,    29,
+       4,     5,     6,     7,     8,     9,    10,    11,    12,     4,
+       5,     6,    18,     3,    23,    22,    26,    19,    27,    25,
+      28,    30,    20,    21
+};
+
+static const yytype_uint8 yycheck[] =
+{
+       4,     5,     6,     7,     8,     9,    10,    11,    12,    13,
+       4,     5,     6,     7,     8,     9,    10,    11,    12,    13,
+       4,     5,     6,     7,     8,     9,    10,    11,    12,     4,
+       5,     6,    12,     0,    15,    14,    14,    12,    15,    19,
+       6,     6,    15,    15
+};
+
+/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+   symbol of state STATE-NUM.  */
+static const yytype_uint8 yystos[] =
+{
+       0,    17,    18,     0,     4,     5,     6,     7,     8,     9,
+      10,    11,    12,    19,    20,    21,    22,    23,    18,    12,
+      22,    23,    14,    15,    13,    18,    14,    15,     6,    13,
+       6
+};
+
+#define yyerrok		(yyerrstatus = 0)
+#define yyclearin	(yychar = YYEMPTY)
+#define YYEMPTY		(-2)
+#define YYEOF		0
+
+#define YYACCEPT	goto yyacceptlab
+#define YYABORT		goto yyabortlab
+#define YYERROR		goto yyerrorlab
+
+
+/* Like YYERROR except do call yyerror.  This remains here temporarily
+   to ease the transition to the new meaning of YYERROR, for GCC.
+   Once GCC version 2 has supplanted version 1, this can go.  */
+
+#define YYFAIL		goto yyerrlab
+
+#define YYRECOVERING()  (!!yyerrstatus)
+
+#define YYBACKUP(Token, Value)					\
+do								\
+  if (yychar == YYEMPTY && yylen == 1)				\
+    {								\
+      yychar = (Token);						\
+      yylval = (Value);						\
+      yytoken = YYTRANSLATE (yychar);				\
+      YYPOPSTACK (1);						\
+      goto yybackup;						\
+    }								\
+  else								\
+    {								\
+      yyerror (YY_("syntax error: cannot back up")); \
+      YYERROR;							\
+    }								\
+while (YYID (0))
+
+
+#define YYTERROR	1
+#define YYERRCODE	256
+
+
+/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
+   If N is 0, then set CURRENT to the empty location which ends
+   the previous symbol: RHS[0] (always defined).  */
+
+#define YYRHSLOC(Rhs, K) ((Rhs)[K])
+#ifndef YYLLOC_DEFAULT
+# define YYLLOC_DEFAULT(Current, Rhs, N)				\
+    do									\
+      if (YYID (N))                                                    \
+	{								\
+	  (Current).first_line   = YYRHSLOC (Rhs, 1).first_line;	\
+	  (Current).first_column = YYRHSLOC (Rhs, 1).first_column;	\
+	  (Current).last_line    = YYRHSLOC (Rhs, N).last_line;		\
+	  (Current).last_column  = YYRHSLOC (Rhs, N).last_column;	\
+	}								\
+      else								\
+	{								\
+	  (Current).first_line   = (Current).last_line   =		\
+	    YYRHSLOC (Rhs, 0).last_line;				\
+	  (Current).first_column = (Current).last_column =		\
+	    YYRHSLOC (Rhs, 0).last_column;				\
+	}								\
+    while (YYID (0))
+#endif
+
+
+/* YY_LOCATION_PRINT -- Print the location on the stream.
+   This macro was not mandated originally: define only if we know
+   we won't break user code: when these are the locations we know.  */
+
+#ifndef YY_LOCATION_PRINT
+# if YYLTYPE_IS_TRIVIAL
+#  define YY_LOCATION_PRINT(File, Loc)			\
+     fprintf (File, "%d.%d-%d.%d",			\
+	      (Loc).first_line, (Loc).first_column,	\
+	      (Loc).last_line,  (Loc).last_column)
+# else
+#  define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+# endif
+#endif
+
+
+/* YYLEX -- calling `yylex' with the right arguments.  */
+
+#ifdef YYLEX_PARAM
+# define YYLEX yylex (&yylval, YYLEX_PARAM)
+#else
+# define YYLEX yylex (&yylval)
+#endif
+
+/* Enable debugging if requested.  */
+#if YYDEBUG
+
+# ifndef YYFPRINTF
+#  include <stdio.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYFPRINTF fprintf
+# endif
+
+# define YYDPRINTF(Args)			\
+do {						\
+  if (yydebug)					\
+    YYFPRINTF Args;				\
+} while (YYID (0))
+
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)			  \
+do {									  \
+  if (yydebug)								  \
+    {									  \
+      YYFPRINTF (stderr, "%s ", Title);					  \
+      yy_symbol_print (stderr,						  \
+		  Type, Value); \
+      YYFPRINTF (stderr, "\n");						  \
+    }									  \
+} while (YYID (0))
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT.  |
+`--------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_value_print (yyoutput, yytype, yyvaluep)
+    FILE *yyoutput;
+    int yytype;
+    YYSTYPE const * const yyvaluep;
+#endif
+{
+  if (!yyvaluep)
+    return;
+# ifdef YYPRINT
+  if (yytype < YYNTOKENS)
+    YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
+# else
+  YYUSE (yyoutput);
+# endif
+  switch (yytype)
+    {
+      default:
+	break;
+    }
+}
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT.  |
+`--------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_print (yyoutput, yytype, yyvaluep)
+    FILE *yyoutput;
+    int yytype;
+    YYSTYPE const * const yyvaluep;
+#endif
+{
+  if (yytype < YYNTOKENS)
+    YYFPRINTF (yyoutput, "token %s (", yytname[yytype]);
+  else
+    YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]);
+
+  yy_symbol_value_print (yyoutput, yytype, yyvaluep);
+  YYFPRINTF (yyoutput, ")");
+}
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included).                                                   |
+`------------------------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop)
+#else
+static void
+yy_stack_print (yybottom, yytop)
+    yytype_int16 *yybottom;
+    yytype_int16 *yytop;
+#endif
+{
+  YYFPRINTF (stderr, "Stack now");
+  for (; yybottom <= yytop; yybottom++)
+    {
+      int yybot = *yybottom;
+      YYFPRINTF (stderr, " %d", yybot);
+    }
+  YYFPRINTF (stderr, "\n");
+}
+
+# define YY_STACK_PRINT(Bottom, Top)				\
+do {								\
+  if (yydebug)							\
+    yy_stack_print ((Bottom), (Top));				\
+} while (YYID (0))
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced.  |
+`------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_reduce_print (YYSTYPE *yyvsp, int yyrule)
+#else
+static void
+yy_reduce_print (yyvsp, yyrule)
+    YYSTYPE *yyvsp;
+    int yyrule;
+#endif
+{
+  int yynrhs = yyr2[yyrule];
+  int yyi;
+  unsigned long int yylno = yyrline[yyrule];
+  YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n",
+	     yyrule - 1, yylno);
+  /* The symbols being reduced.  */
+  for (yyi = 0; yyi < yynrhs; yyi++)
+    {
+      YYFPRINTF (stderr, "   $%d = ", yyi + 1);
+      yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi],
+		       &(yyvsp[(yyi + 1) - (yynrhs)])
+		       		       );
+      YYFPRINTF (stderr, "\n");
+    }
+}
+
+# define YY_REDUCE_PRINT(Rule)		\
+do {					\
+  if (yydebug)				\
+    yy_reduce_print (yyvsp, Rule); \
+} while (YYID (0))
+
+/* Nonzero means print parse trace.  It is left uninitialized so that
+   multiple parsers can coexist.  */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args)
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks.  */
+#ifndef	YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+   if the built-in stack extension method is used).
+
+   Do not make this value too large; the results are undefined if
+   YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH)
+   evaluated with infinite-precision integer arithmetic.  */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
+
+
+
+#if YYERROR_VERBOSE
+
+# ifndef yystrlen
+#  if defined __GLIBC__ && defined _STRING_H
+#   define yystrlen strlen
+#  else
+/* Return the length of YYSTR.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static YYSIZE_T
+yystrlen (const char *yystr)
+#else
+static YYSIZE_T
+yystrlen (yystr)
+    const char *yystr;
+#endif
+{
+  YYSIZE_T yylen;
+  for (yylen = 0; yystr[yylen]; yylen++)
+    continue;
+  return yylen;
+}
+#  endif
+# endif
+
+# ifndef yystpcpy
+#  if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE
+#   define yystpcpy stpcpy
+#  else
+/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
+   YYDEST.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static char *
+yystpcpy (char *yydest, const char *yysrc)
+#else
+static char *
+yystpcpy (yydest, yysrc)
+    char *yydest;
+    const char *yysrc;
+#endif
+{
+  char *yyd = yydest;
+  const char *yys = yysrc;
+
+  while ((*yyd++ = *yys++) != '\0')
+    continue;
+
+  return yyd - 1;
+}
+#  endif
+# endif
+
+# ifndef yytnamerr
+/* Copy to YYRES the contents of YYSTR after stripping away unnecessary
+   quotes and backslashes, so that it's suitable for yyerror.  The
+   heuristic is that double-quoting is unnecessary unless the string
+   contains an apostrophe, a comma, or backslash (other than
+   backslash-backslash).  YYSTR is taken from yytname.  If YYRES is
+   null, do not copy; instead, return the length of what the result
+   would have been.  */
+static YYSIZE_T
+yytnamerr (char *yyres, const char *yystr)
+{
+  if (*yystr == '"')
+    {
+      YYSIZE_T yyn = 0;
+      char const *yyp = yystr;
+
+      for (;;)
+	switch (*++yyp)
+	  {
+	  case '\'':
+	  case ',':
+	    goto do_not_strip_quotes;
+
+	  case '\\':
+	    if (*++yyp != '\\')
+	      goto do_not_strip_quotes;
+	    /* Fall through.  */
+	  default:
+	    if (yyres)
+	      yyres[yyn] = *yyp;
+	    yyn++;
+	    break;
+
+	  case '"':
+	    if (yyres)
+	      yyres[yyn] = '\0';
+	    return yyn;
+	  }
+    do_not_strip_quotes: ;
+    }
+
+  if (! yyres)
+    return yystrlen (yystr);
+
+  return yystpcpy (yyres, yystr) - yyres;
+}
+# endif
+
+/* Copy into YYRESULT an error message about the unexpected token
+   YYCHAR while in state YYSTATE.  Return the number of bytes copied,
+   including the terminating null byte.  If YYRESULT is null, do not
+   copy anything; just return the number of bytes that would be
+   copied.  As a special case, return 0 if an ordinary "syntax error"
+   message will do.  Return YYSIZE_MAXIMUM if overflow occurs during
+   size calculation.  */
+static YYSIZE_T
+yysyntax_error (char *yyresult, int yystate, int yychar)
+{
+  int yyn = yypact[yystate];
+
+  if (! (YYPACT_NINF < yyn && yyn <= YYLAST))
+    return 0;
+  else
+    {
+      int yytype = YYTRANSLATE (yychar);
+      YYSIZE_T yysize0 = yytnamerr (0, yytname[yytype]);
+      YYSIZE_T yysize = yysize0;
+      YYSIZE_T yysize1;
+      int yysize_overflow = 0;
+      enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
+      char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM];
+      int yyx;
+
+# if 0
+      /* This is so xgettext sees the translatable formats that are
+	 constructed on the fly.  */
+      YY_("syntax error, unexpected %s");
+      YY_("syntax error, unexpected %s, expecting %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s or %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s");
+# endif
+      char *yyfmt;
+      char const *yyf;
+      static char const yyunexpected[] = "syntax error, unexpected %s";
+      static char const yyexpecting[] = ", expecting %s";
+      static char const yyor[] = " or %s";
+      char yyformat[sizeof yyunexpected
+		    + sizeof yyexpecting - 1
+		    + ((YYERROR_VERBOSE_ARGS_MAXIMUM - 2)
+		       * (sizeof yyor - 1))];
+      char const *yyprefix = yyexpecting;
+
+      /* Start YYX at -YYN if negative to avoid negative indexes in
+	 YYCHECK.  */
+      int yyxbegin = yyn < 0 ? -yyn : 0;
+
+      /* Stay within bounds of both yycheck and yytname.  */
+      int yychecklim = YYLAST - yyn + 1;
+      int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+      int yycount = 1;
+
+      yyarg[0] = yytname[yytype];
+      yyfmt = yystpcpy (yyformat, yyunexpected);
+
+      for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+	if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
+	  {
+	    if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
+	      {
+		yycount = 1;
+		yysize = yysize0;
+		yyformat[sizeof yyunexpected - 1] = '\0';
+		break;
+	      }
+	    yyarg[yycount++] = yytname[yyx];
+	    yysize1 = yysize + yytnamerr (0, yytname[yyx]);
+	    yysize_overflow |= (yysize1 < yysize);
+	    yysize = yysize1;
+	    yyfmt = yystpcpy (yyfmt, yyprefix);
+	    yyprefix = yyor;
+	  }
+
+      yyf = YY_(yyformat);
+      yysize1 = yysize + yystrlen (yyf);
+      yysize_overflow |= (yysize1 < yysize);
+      yysize = yysize1;
+
+      if (yysize_overflow)
+	return YYSIZE_MAXIMUM;
+
+      if (yyresult)
+	{
+	  /* Avoid sprintf, as that infringes on the user's name space.
+	     Don't have undefined behavior even if the translation
+	     produced a string with the wrong number of "%s"s.  */
+	  char *yyp = yyresult;
+	  int yyi = 0;
+	  while ((*yyp = *yyf) != '\0')
+	    {
+	      if (*yyp == '%' && yyf[1] == 's' && yyi < yycount)
+		{
+		  yyp += yytnamerr (yyp, yyarg[yyi++]);
+		  yyf += 2;
+		}
+	      else
+		{
+		  yyp++;
+		  yyf++;
+		}
+	    }
+	}
+      return yysize;
+    }
+}
+#endif /* YYERROR_VERBOSE */
+
+
+/*-----------------------------------------------.
+| Release the memory associated to this symbol.  |
+`-----------------------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
+#else
+static void
+yydestruct (yymsg, yytype, yyvaluep)
+    const char *yymsg;
+    int yytype;
+    YYSTYPE *yyvaluep;
+#endif
+{
+  YYUSE (yyvaluep);
+
+  if (!yymsg)
+    yymsg = "Deleting";
+  YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
+
+  switch (yytype)
+    {
+
+      default:
+	break;
+    }
+}
+
+/* Prevent warnings from -Wmissing-prototypes.  */
+#ifdef YYPARSE_PARAM
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void *YYPARSE_PARAM);
+#else
+int yyparse ();
+#endif
+#else /* ! YYPARSE_PARAM */
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void);
+#else
+int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
+
+
+
+
+
+/*-------------------------.
+| yyparse or yypush_parse.  |
+`-------------------------*/
+
+#ifdef YYPARSE_PARAM
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void *YYPARSE_PARAM)
+#else
+int
+yyparse (YYPARSE_PARAM)
+    void *YYPARSE_PARAM;
+#endif
+#else /* ! YYPARSE_PARAM */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void)
+#else
+int
+yyparse ()
+
+#endif
+#endif
+{
+/* The lookahead symbol.  */
+int yychar;
+
+/* The semantic value of the lookahead symbol.  */
+YYSTYPE yylval;
+
+    /* Number of syntax errors so far.  */
+    int yynerrs;
+
+    int yystate;
+    /* Number of tokens to shift before error messages enabled.  */
+    int yyerrstatus;
+
+    /* The stacks and their tools:
+       `yyss': related to states.
+       `yyvs': related to semantic values.
+
+       Refer to the stacks thru separate pointers, to allow yyoverflow
+       to reallocate them elsewhere.  */
+
+    /* The state stack.  */
+    yytype_int16 yyssa[YYINITDEPTH];
+    yytype_int16 *yyss;
+    yytype_int16 *yyssp;
+
+    /* The semantic value stack.  */
+    YYSTYPE yyvsa[YYINITDEPTH];
+    YYSTYPE *yyvs;
+    YYSTYPE *yyvsp;
+
+    YYSIZE_T yystacksize;
+
+  int yyn;
+  int yyresult;
+  /* Lookahead token as an internal (translated) token number.  */
+  int yytoken;
+  /* The variables used to return semantic value and location from the
+     action routines.  */
+  YYSTYPE yyval;
+
+#if YYERROR_VERBOSE
+  /* Buffer for error messages, and its allocated size.  */
+  char yymsgbuf[128];
+  char *yymsg = yymsgbuf;
+  YYSIZE_T yymsg_alloc = sizeof yymsgbuf;
+#endif
+
+#define YYPOPSTACK(N)   (yyvsp -= (N), yyssp -= (N))
+
+  /* The number of symbols on the RHS of the reduced rule.
+     Keep to zero when no symbol should be popped.  */
+  int yylen = 0;
+
+  yytoken = 0;
+  yyss = yyssa;
+  yyvs = yyvsa;
+  yystacksize = YYINITDEPTH;
+
+  YYDPRINTF ((stderr, "Starting parse\n"));
+
+  yystate = 0;
+  yyerrstatus = 0;
+  yynerrs = 0;
+  yychar = YYEMPTY; /* Cause a token to be read.  */
+
+  /* Initialize stack pointers.
+     Waste one element of value and location stack
+     so that they stay on the same level as the state stack.
+     The wasted elements are never initialized.  */
+  yyssp = yyss;
+  yyvsp = yyvs;
+
+  goto yysetstate;
+
+/*------------------------------------------------------------.
+| yynewstate -- Push a new state, which is found in yystate.  |
+`------------------------------------------------------------*/
+ yynewstate:
+  /* In all cases, when you get here, the value and location stacks
+     have just been pushed.  So pushing a state here evens the stacks.  */
+  yyssp++;
+
+ yysetstate:
+  *yyssp = yystate;
+
+  if (yyss + yystacksize - 1 <= yyssp)
+    {
+      /* Get the current used size of the three stacks, in elements.  */
+      YYSIZE_T yysize = yyssp - yyss + 1;
+
+#ifdef yyoverflow
+      {
+	/* Give user a chance to reallocate the stack.  Use copies of
+	   these so that the &'s don't force the real ones into
+	   memory.  */
+	YYSTYPE *yyvs1 = yyvs;
+	yytype_int16 *yyss1 = yyss;
+
+	/* Each stack pointer address is followed by the size of the
+	   data in use in that stack, in bytes.  This used to be a
+	   conditional around just the two extra args, but that might
+	   be undefined if yyoverflow is a macro.  */
+	yyoverflow (YY_("memory exhausted"),
+		    &yyss1, yysize * sizeof (*yyssp),
+		    &yyvs1, yysize * sizeof (*yyvsp),
+		    &yystacksize);
+
+	yyss = yyss1;
+	yyvs = yyvs1;
+      }
+#else /* no yyoverflow */
+# ifndef YYSTACK_RELOCATE
+      goto yyexhaustedlab;
+# else
+      /* Extend the stack our own way.  */
+      if (YYMAXDEPTH <= yystacksize)
+	goto yyexhaustedlab;
+      yystacksize *= 2;
+      if (YYMAXDEPTH < yystacksize)
+	yystacksize = YYMAXDEPTH;
+
+      {
+	yytype_int16 *yyss1 = yyss;
+	union yyalloc *yyptr =
+	  (union yyalloc*) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+	if (! yyptr)
+	  goto yyexhaustedlab;
+	YYSTACK_RELOCATE (yyss_alloc, yyss);
+	YYSTACK_RELOCATE (yyvs_alloc, yyvs);
+#  undef YYSTACK_RELOCATE
+	if (yyss1 != yyssa)
+	  YYSTACK_FREE (yyss1);
+      }
+# endif
+#endif /* no yyoverflow */
+
+      yyssp = yyss + yysize - 1;
+      yyvsp = yyvs + yysize - 1;
+
+      YYDPRINTF ((stderr, "Stack size increased to %lu\n",
+		  (unsigned long int) yystacksize));
+
+      if (yyss + yystacksize - 1 <= yyssp)
+	YYABORT;
+    }
+
+  YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+
+  if (yystate == YYFINAL)
+    YYACCEPT;
+
+  goto yybackup;
+
+/*-----------.
+| yybackup.  |
+`-----------*/
+yybackup:
+
+  /* Do appropriate processing given the current state.  Read a
+     lookahead token if we need one and don't already have one.  */
+
+  /* First try to decide what to do without reference to lookahead token.  */
+  yyn = yypact[yystate];
+  if (yyn == YYPACT_NINF)
+    goto yydefault;
+
+  /* Not known => get a lookahead token if don't already have one.  */
+
+  /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol.  */
+  if (yychar == YYEMPTY)
+    {
+      YYDPRINTF ((stderr, "Reading a token: "));
+      yychar = YYLEX;
+    }
+
+  if (yychar <= YYEOF)
+    {
+      yychar = yytoken = YYEOF;
+      YYDPRINTF ((stderr, "Now at end of input.\n"));
+    }
+  else
+    {
+      yytoken = YYTRANSLATE (yychar);
+      YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+    }
+
+  /* If the proper action on seeing token YYTOKEN is to reduce or to
+     detect an error, take that action.  */
+  yyn += yytoken;
+  if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+    goto yydefault;
+  yyn = yytable[yyn];
+  if (yyn <= 0)
+    {
+      if (yyn == 0 || yyn == YYTABLE_NINF)
+	goto yyerrlab;
+      yyn = -yyn;
+      goto yyreduce;
+    }
+
+  /* Count tokens shifted since error; after three, turn off error
+     status.  */
+  if (yyerrstatus)
+    yyerrstatus--;
+
+  /* Shift the lookahead token.  */
+  YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+
+  /* Discard the shifted token.  */
+  yychar = YYEMPTY;
+
+  yystate = yyn;
+  *++yyvsp = yylval;
+
+  goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state.  |
+`-----------------------------------------------------------*/
+yydefault:
+  yyn = yydefact[yystate];
+  if (yyn == 0)
+    goto yyerrlab;
+  goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- Do a reduction.  |
+`-----------------------------*/
+yyreduce:
+  /* yyn is the number of a rule to reduce with.  */
+  yylen = yyr2[yyn];
+
+  /* If YYLEN is nonzero, implement the default value of the action:
+     `$$ = $1'.
+
+     Otherwise, the following line sets YYVAL to garbage.
+     This behavior is undocumented and Bison
+     users should not rely upon it.  Assigning to YYVAL
+     unconditionally makes the parser a bit smaller, and it avoids a
+     GCC warning that YYVAL may be used uninitialized.  */
+  yyval = yyvsp[1-yylen];
+
+
+  YY_REDUCE_PRINT (yyn);
+  switch (yyn)
+    {
+        case 2:
+
+/* Line 1455 of yacc.c  */
+#line 79 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(1) - (1)].node);
+		((fts_ast_state_t*) state)->root = (yyval.node);
+	;}
+    break;
+
+  case 3:
+
+/* Line 1455 of yacc.c  */
+#line 85 "fts0pars.y"
+    {
+		(yyval.node) = NULL;
+	;}
+    break;
+
+  case 4:
+
+/* Line 1455 of yacc.c  */
+#line 89 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(1) - (2)].node);
+
+		if (!(yyval.node)) {
+			(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(2) - (2)].node));
+		} else {
+			fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+		}
+	;}
+    break;
+
+  case 5:
+
+/* Line 1455 of yacc.c  */
+#line 99 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(1) - (2)].node);
+		(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+
+		if (!(yyval.node)) {
+			(yyval.node) = fts_ast_create_node_subexp_list(state, (yyvsp[(2) - (2)].node));
+		} else {
+			fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+		}
+	;}
+    break;
+
+  case 6:
+
+/* Line 1455 of yacc.c  */
+#line 111 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(2) - (3)].node);
+	;}
+    break;
+
+  case 7:
+
+/* Line 1455 of yacc.c  */
+#line 115 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_subexp_list(state, (yyvsp[(1) - (4)].node));
+
+		if ((yyvsp[(3) - (4)].node)) {
+			fts_ast_add_node((yyval.node), (yyvsp[(3) - (4)].node));
+		}
+	;}
+    break;
+
+  case 8:
+
+/* Line 1455 of yacc.c  */
+#line 124 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(1) - (1)].node);
+	;}
+    break;
+
+  case 9:
+
+/* Line 1455 of yacc.c  */
+#line 128 "fts0pars.y"
+    {
+		(yyval.node) = (yyvsp[(1) - (1)].node);
+	;}
+    break;
+
+  case 10:
+
+/* Line 1455 of yacc.c  */
+#line 132 "fts0pars.y"
+    {
+		fts_ast_term_set_wildcard((yyvsp[(1) - (2)].node));
+	;}
+    break;
+
+  case 11:
+
+/* Line 1455 of yacc.c  */
+#line 136 "fts0pars.y"
+    {
+		fts_ast_term_set_distance((yyvsp[(1) - (3)].node), strtoul((yyvsp[(3) - (3)].token), NULL, 10));
+		free((yyvsp[(3) - (3)].token));
+	;}
+    break;
+
+  case 12:
+
+/* Line 1455 of yacc.c  */
+#line 141 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (3)].node));
+		fts_ast_add_node((yyval.node), (yyvsp[(2) - (3)].node));
+		fts_ast_term_set_wildcard((yyvsp[(2) - (3)].node));
+	;}
+    break;
+
+  case 13:
+
+/* Line 1455 of yacc.c  */
+#line 147 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+		fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+	;}
+    break;
+
+  case 14:
+
+/* Line 1455 of yacc.c  */
+#line 152 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node));
+		fts_ast_add_node((yyval.node), (yyvsp[(2) - (4)].node));
+		fts_ast_term_set_distance((yyvsp[(2) - (4)].node), strtoul((yyvsp[(4) - (4)].token), NULL, 10));
+		free((yyvsp[(4) - (4)].token));
+	;}
+    break;
+
+  case 15:
+
+/* Line 1455 of yacc.c  */
+#line 159 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+		fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+	;}
+    break;
+
+  case 16:
+
+/* Line 1455 of yacc.c  */
+#line 165 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_oper(state, FTS_IGNORE);
+	;}
+    break;
+
+  case 17:
+
+/* Line 1455 of yacc.c  */
+#line 169 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_oper(state, FTS_EXIST);
+	;}
+    break;
+
+  case 18:
+
+/* Line 1455 of yacc.c  */
+#line 173 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_oper(state, FTS_NEGATE);
+	;}
+    break;
+
+  case 19:
+
+/* Line 1455 of yacc.c  */
+#line 177 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_oper(state, FTS_DECR_RATING);
+	;}
+    break;
+
+  case 20:
+
+/* Line 1455 of yacc.c  */
+#line 181 "fts0pars.y"
+    {
+		(yyval.node) = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+	;}
+    break;
+
+  case 21:
+
+/* Line 1455 of yacc.c  */
+#line 186 "fts0pars.y"
+    {
+		(yyval.node)  = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token));
+		free((yyvsp[(1) - (1)].token));
+	;}
+    break;
+
+  case 22:
+
+/* Line 1455 of yacc.c  */
+#line 191 "fts0pars.y"
+    {
+		(yyval.node)  = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token));
+		free((yyvsp[(1) - (1)].token));
+	;}
+    break;
+
+  case 23:
+
+/* Line 1455 of yacc.c  */
+#line 198 "fts0pars.y"
+    {
+		(yyval.node)  = fts_ast_create_node_text(state, (yyvsp[(1) - (1)].token));
+		free((yyvsp[(1) - (1)].token));
+	;}
+    break;
+
+
+
+/* Line 1455 of yacc.c  */
+#line 1617 "fts0pars.cc"
+      default: break;
+    }
+  YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
+
+  YYPOPSTACK (yylen);
+  yylen = 0;
+  YY_STACK_PRINT (yyss, yyssp);
+
+  *++yyvsp = yyval;
+
+  /* Now `shift' the result of the reduction.  Determine what state
+     that goes to, based on the state we popped back to and the rule
+     number reduced by.  */
+
+  yyn = yyr1[yyn];
+
+  yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
+  if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
+    yystate = yytable[yystate];
+  else
+    yystate = yydefgoto[yyn - YYNTOKENS];
+
+  goto yynewstate;
+
+
+/*------------------------------------.
+| yyerrlab -- here on detecting error |
+`------------------------------------*/
+yyerrlab:
+  /* If not already recovering from an error, report this error.  */
+  if (!yyerrstatus)
+    {
+      ++yynerrs;
+#if ! YYERROR_VERBOSE
+      yyerror (YY_("syntax error"));
+#else
+      {
+	YYSIZE_T yysize = yysyntax_error (0, yystate, yychar);
+	if (yymsg_alloc < yysize && yymsg_alloc < YYSTACK_ALLOC_MAXIMUM)
+	  {
+	    YYSIZE_T yyalloc = 2 * yysize;
+	    if (! (yysize <= yyalloc && yyalloc <= YYSTACK_ALLOC_MAXIMUM))
+	      yyalloc = YYSTACK_ALLOC_MAXIMUM;
+	    if (yymsg != yymsgbuf)
+	      YYSTACK_FREE (yymsg);
+	    yymsg = (char*) YYSTACK_ALLOC (yyalloc);
+	    if (yymsg)
+	      yymsg_alloc = yyalloc;
+	    else
+	      {
+		yymsg = yymsgbuf;
+		yymsg_alloc = sizeof yymsgbuf;
+	      }
+	  }
+
+	if (0 < yysize && yysize <= yymsg_alloc)
+	  {
+	    (void) yysyntax_error (yymsg, yystate, yychar);
+	    yyerror (yymsg);
+	  }
+	else
+	  {
+	    yyerror (YY_("syntax error"));
+	    if (yysize != 0)
+	      goto yyexhaustedlab;
+	  }
+      }
+#endif
+    }
+
+
+
+  if (yyerrstatus == 3)
+    {
+      /* If just tried and failed to reuse lookahead token after an
+	 error, discard it.  */
+
+      if (yychar <= YYEOF)
+	{
+	  /* Return failure if at end of input.  */
+	  if (yychar == YYEOF)
+	    YYABORT;
+	}
+      else
+	{
+	  yydestruct ("Error: discarding",
+		      yytoken, &yylval);
+	  yychar = YYEMPTY;
+	}
+    }
+
+  /* Else will try to reuse lookahead token after shifting the error
+     token.  */
+  goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR.  |
+`---------------------------------------------------*/
+yyerrorlab:
+
+  /* Pacify compilers like GCC when the user code never invokes
+     YYERROR and the label yyerrorlab therefore never appears in user
+     code.  */
+  if (/*CONSTCOND*/ 0)
+     goto yyerrorlab;
+
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYERROR.  */
+  YYPOPSTACK (yylen);
+  yylen = 0;
+  YY_STACK_PRINT (yyss, yyssp);
+  yystate = *yyssp;
+  goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR.  |
+`-------------------------------------------------------------*/
+yyerrlab1:
+  yyerrstatus = 3;	/* Each real token shifted decrements this.  */
+
+  for (;;)
+    {
+      yyn = yypact[yystate];
+      if (yyn != YYPACT_NINF)
+	{
+	  yyn += YYTERROR;
+	  if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+	    {
+	      yyn = yytable[yyn];
+	      if (0 < yyn)
+		break;
+	    }
+	}
+
+      /* Pop the current state because it cannot handle the error token.  */
+      if (yyssp == yyss)
+	YYABORT;
+
+
+      yydestruct ("Error: popping",
+		  yystos[yystate], yyvsp);
+      YYPOPSTACK (1);
+      yystate = *yyssp;
+      YY_STACK_PRINT (yyss, yyssp);
+    }
+
+  *++yyvsp = yylval;
+
+
+  /* Shift the error token.  */
+  YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+
+  yystate = yyn;
+  goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here.  |
+`-------------------------------------*/
+yyacceptlab:
+  yyresult = 0;
+  goto yyreturn;
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here.  |
+`-----------------------------------*/
+yyabortlab:
+  yyresult = 1;
+  goto yyreturn;
+
+#if !defined(yyoverflow) || YYERROR_VERBOSE
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here.  |
+`-------------------------------------------------*/
+yyexhaustedlab:
+  yyerror (YY_("memory exhausted"));
+  yyresult = 2;
+  /* Fall through.  */
+#endif
+
+yyreturn:
+  if (yychar != YYEMPTY)
+     yydestruct ("Cleanup: discarding lookahead",
+		 yytoken, &yylval);
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYABORT or YYACCEPT.  */
+  YYPOPSTACK (yylen);
+  YY_STACK_PRINT (yyss, yyssp);
+  while (yyssp != yyss)
+    {
+      yydestruct ("Cleanup: popping",
+		  yystos[*yyssp], yyvsp);
+      YYPOPSTACK (1);
+    }
+#ifndef yyoverflow
+  if (yyss != yyssa)
+    YYSTACK_FREE (yyss);
+#endif
+#if YYERROR_VERBOSE
+  if (yymsg != yymsgbuf)
+    YYSTACK_FREE (yymsg);
+#endif
+  /* Make sure YYID is used.  */
+  return YYID (yyresult);
+}
+
+
+
+/* Line 1675 of yacc.c  */
+#line 203 "fts0pars.y"
+
+
+/********************************************************************
+*/
+int
+ftserror(
+/*=====*/
+	const char*	p)
+{
+	fprintf(stderr, "%s\n", p);
+	return(0);
+}
+
+/********************************************************************
+Create a fts_lexer_t instance.*/
+
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,
+	const byte*	query,
+	ulint		query_len)
+{
+	fts_lexer_t*	fts_lexer = static_cast<fts_lexer_t*>(
+		ut_malloc(sizeof(fts_lexer_t)));
+
+	if (boolean_mode) {
+		fts0blex_init(&fts_lexer->yyscanner);
+		fts0b_scan_bytes((char*) query, query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = (fts_scan) fts_blexer;
+		/* FIXME: Debugging */
+		/* fts0bset_debug(1 , fts_lexer->yyscanner); */
+	} else {
+		fts0tlex_init(&fts_lexer->yyscanner);
+		fts0t_scan_bytes((char*) query, query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = (fts_scan) fts_tlexer;
+	}
+
+	return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer)
+{
+	if (fts_lexer->scanner == (fts_scan) fts_blexer) {
+		fts0blex_destroy(fts_lexer->yyscanner);
+	} else {
+		fts0tlex_destroy(fts_lexer->yyscanner);
+	}
+
+	ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropaiate scanner.*/
+
+int
+fts_lexer(
+/*======*/
+	YYSTYPE*	val,
+	fts_lexer_t*	fts_lexer)
+{
+	fts_scanner_alt func_ptr;
+
+	func_ptr = (fts_scanner_alt) fts_lexer->scanner;
+
+	return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+	fts_ast_state_t*	state)
+{
+	return(ftsparse(state));
+}
+
diff --git a/storage/innobase/fts/fts0pars.y b/storage/innobase/fts/fts0pars.y
new file mode 100644
index 00000000000..fe644d84eea
--- /dev/null
+++ b/storage/innobase/fts/fts0pars.y
@@ -0,0 +1,285 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011,  Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0pars.y
+ * FTS parser: input file for the GNU Bison parser generator
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+
+extern	int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern	int fts_blexer(YYSTYPE*, yyscan_t);
+extern	int fts_tlexer(YYSTYPE*, yyscan_t);
+
+typedef int (*fts_scan)();
+
+extern int ftserror(const char* p);
+
+/* Required for reentrant parser */
+#define ftslex	fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+typedef	int	(*fts_scanner_alt)(YYSTYPE* val, yyscan_t yyscanner);
+typedef	int	(*fts_scanner)();
+
+struct fts_lexer_struct {
+	fts_scanner	scanner;
+	void*		yyscanner;
+};
+
+%}
+
+%union {
+	int		oper;
+	char*		token;
+	fts_ast_node_t*	node;
+};
+
+/* Enable re-entrant parser */
+%pure_parser
+
+%token<oper>	FTS_OPER
+%token<token>	FTS_TEXT FTS_TERM FTS_NUMB
+
+%type<node>	prefix term text expr sub_expr expr_lst query
+
+%nonassoc	'+' '-' '~' '<' '>'
+
+%%
+
+query	: expr_lst	{
+		$$ = $1;
+		((fts_ast_state_t*) state)->root = $$;
+	}
+	;
+
+expr_lst: /* Empty */	{
+		$$ = NULL;
+	}
+
+	| expr_lst expr	{
+		$$ = $1;
+
+		if (!$$) {
+			$$ = fts_ast_create_node_list(state, $2);
+		} else {
+			fts_ast_add_node($$, $2);
+		}
+	}
+
+	| expr_lst sub_expr		{
+		$$ = $1;
+		$$ = fts_ast_create_node_list(state, $1);
+
+		if (!$$) {
+			$$ = fts_ast_create_node_subexp_list(state, $2);
+		} else {
+			fts_ast_add_node($$, $2);
+		}
+	}
+	;
+
+sub_expr: '(' expr_lst ')'		{
+		$$ = $2;
+	}
+
+	| prefix '(' expr_lst ')'	{
+		$$ = fts_ast_create_node_subexp_list(state, $1);
+
+		if ($3) {
+			fts_ast_add_node($$, $3);
+		}
+	}
+	;
+
+expr	: term		{
+		$$ = $1;
+	}
+
+	| text		{
+		$$ = $1;
+	}
+
+	| term '*' {
+		fts_ast_term_set_wildcard($1);
+	}
+
+	| text '@' FTS_NUMB {
+		fts_ast_term_set_distance($1, strtoul($3, NULL, 10));
+		free($3);
+	}
+
+	| prefix term '*' {
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+		fts_ast_term_set_wildcard($2);
+	}
+
+	| prefix term	{
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+	}
+
+	| prefix text '@' FTS_NUMB {
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+		fts_ast_term_set_distance($2, strtoul($4, NULL, 10));
+		free($4);
+	}
+
+	| prefix text {
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+	}
+	;
+
+prefix	: '-'		{
+		$$ = fts_ast_create_node_oper(state, FTS_IGNORE);
+	}
+
+	| '+'		{
+		$$ = fts_ast_create_node_oper(state, FTS_EXIST);
+	}
+
+	| '~'		{
+		$$ = fts_ast_create_node_oper(state, FTS_NEGATE);
+	}
+
+	| '<'		{
+		$$ = fts_ast_create_node_oper(state, FTS_DECR_RATING);
+	}
+
+	| '>'		{
+		$$ = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+	}
+	;
+
+term	: FTS_TERM	{
+		$$  = fts_ast_create_node_term(state, $1);
+		free($1);
+	}
+
+	| FTS_NUMB	{
+		$$  = fts_ast_create_node_term(state, $1);
+		free($1);
+	}
+
+	;
+
+text	: FTS_TEXT	{
+		$$  = fts_ast_create_node_text(state, $1);
+		free($1);
+	}
+	;
+%%
+
+/********************************************************************
+*/
+int
+ftserror(
+/*=====*/
+	const char*	p)
+{
+	fprintf(stderr, "%s\n", p);
+	return(0);
+}
+
+/********************************************************************
+Create a fts_lexer_t instance.*/
+
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,
+	const byte*	query,
+	ulint		query_len)
+{
+	fts_lexer_t*	fts_lexer = static_cast<fts_lexer_t*>(
+		ut_malloc(sizeof(fts_lexer_t)));
+
+	if (boolean_mode) {
+		fts0blex_init(&fts_lexer->yyscanner);
+		fts0b_scan_bytes((char*) query, query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = (fts_scan) fts_blexer;
+		/* FIXME: Debugging */
+		/* fts0bset_debug(1 , fts_lexer->yyscanner); */
+	} else {
+		fts0tlex_init(&fts_lexer->yyscanner);
+		fts0t_scan_bytes((char*) query, query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = (fts_scan) fts_tlexer;
+	}
+
+	return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer)
+{
+	if (fts_lexer->scanner == (fts_scan) fts_blexer) {
+		fts0blex_destroy(fts_lexer->yyscanner);
+	} else {
+		fts0tlex_destroy(fts_lexer->yyscanner);
+	}
+
+	ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropaiate scanner.*/
+
+int
+fts_lexer(
+/*======*/
+	YYSTYPE*	val,
+	fts_lexer_t*	fts_lexer)
+{
+	fts_scanner_alt func_ptr;
+
+	func_ptr = (fts_scanner_alt) fts_lexer->scanner;
+
+	return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+	fts_ast_state_t*	state)
+{
+	return(ftsparse(state));
+}
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
new file mode 100644
index 00000000000..58b429a8406
--- /dev/null
+++ b/storage/innobase/fts/fts0que.cc
@@ -0,0 +1,3760 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0que.cc
+Full Text Search functionality.
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+*******************************************************/
+
+#include "ut0rbt.h"
+#include "row0sel.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0types.h"
+#include "ha_prototypes.h"
+#include <ctype.h>
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
+
+#define RANK_DOWNGRADE		(-1.0F)
+#define RANK_UPGRADE		(1.0F)
+
+/* Maximum number of words supported in a proximity search.
+FIXME, this limitation can be removed easily. Need to see
+if we want to enforce such limitation */
+#define MAX_PROXIMITY_ITEM	128
+
+/* Coeffecient to use for normalize relevance ranking. */
+static const double FTS_NORMALIZE_COEFF = 0.0115F;
+
+// FIXME: Need to have a generic iterator that traverses the ilist.
+
+/* For parsing the search phrase */
+static const char* FTS_PHRASE_DELIMITER = "\t ";
+
+typedef struct fts_match_struct fts_match_t;
+typedef	struct fts_query_struct fts_query_t;
+typedef struct fts_phrase_struct fts_phrase_t;
+typedef struct fts_select_struct fts_select_t;
+typedef struct fts_doc_freq_struct fts_doc_freq_t;
+typedef struct fts_word_freq_struct fts_word_freq_t;
+
+/** State of an FTS query. */
+struct fts_query_struct {
+	mem_heap_t*	heap;		/*!< Heap to use for allocations */
+
+	trx_t*		trx;		/*!< The query transaction */
+
+	dict_index_t*	index;		/*!< The FTS index to search */
+					/*!< FTS auxiliary common table def */
+	fts_table_t	fts_common_table;
+
+	fts_table_t	fts_index_table;/*!< FTS auxiliary index table def */
+
+	fts_doc_ids_t*	deleted;	/*!< Deleted doc ids that need to be
+					filtered from the output */
+
+	fts_ast_node_t*	root;		/*!< Abstract syntax tree */
+
+	fts_ast_node_t* cur_node;	/*!< Current tree node */
+
+	ib_rbt_t*       doc_ids;	/*!< The current set of matching
+					doc ids, elements are of
+					type fts_ranking_t */
+
+	ib_rbt_t*	intersection;	/*!< The doc ids that were found in
+					doc_ids, this tree will become
+					the new doc_ids, elements are of type
+					fts_ranking_t */
+
+					/*!< Prepared statement to read the
+					nodes from the FTS INDEX */
+	que_t*		read_nodes_graph;
+
+	fts_ast_oper_t	oper;		/*!< Current boolean mode operator */
+
+					/*!< TRUE if we want to collect the
+					word positions within the document */
+	ibool		collect_positions;
+
+	ulint		flags;		/*!< Specify the full text search type,
+					such as  boolean search, phrase
+					search, proximity search etc. */
+
+	ulint		distance;	/*!< The proximity distance of a
+					phrase search. */
+
+					/*!< These doc ids are used as a
+					boundary condition when searching the
+					FTS index rows */
+
+	doc_id_t	lower_doc_id;	/*!< Lowest doc id in doc_ids */
+
+	doc_id_t	upper_doc_id;	/*!< Highest doc id in doc_ids */
+
+	ibool		boolean_mode;	/*!< TRUE if boolean mode query */
+
+	ib_vector_t*	matched;	/*!< Array of matching documents
+					(fts_match_t) to search for a phrase */
+
+	ib_vector_t**	match_array;	/*!< Used for proximity search, contains
+					position info for each matched word
+					in the word list */
+
+	ulint		total_docs;	/*!< The total number of documents */
+
+	ulint		total_words;	/*!< The total number of words */
+
+	ulint		error;		/*!< Error code if any, that is
+					encountered during query processing */
+
+	ib_rbt_t*	word_freqs;	/*!< RB tree of word frequencies per
+					document, its elements are of type
+					fts_word_freq_t */
+
+	ibool		inited;		/*!< Flag to test whether the query
+					processing has started or not */
+	ibool		multi_exist;	/*!< multiple FTS_EXIST oper */
+};
+
+/** For phrase matching, first we collect the documents and the positions
+then we match. */
+struct fts_match_struct {
+	doc_id_t	doc_id;		/*!< Document id */
+
+	ulint		start;		/*!< Start the phrase match from
+					this offset within the positions
+					vector. */
+
+	ib_vector_t*	positions;	/*!< Offsets of a word in a
+					document */
+};
+
+/** For matching tokens in a phrase search. We use this data structure in
+the callback that determines whether a document should be accepted or
+rejected for a phrase search. */
+struct fts_select_struct {
+	doc_id_t	doc_id;		/*!< The document id to match */
+
+	ulint		min_pos;	/*!< For found to be TRUE at least
+					one position must be greater than
+					min_pos. */
+
+	ibool		found;		/*!< TRUE if found */
+
+	fts_word_freq_t*
+			word_freq;	/*!< Word frequency instance of the
+					current word being looked up in
+					the FTS index */
+};
+
+/** The match positions and tokesn to match */
+struct fts_phrase_struct {
+	ibool		found;		/*!< Match result */
+
+	const fts_match_t*
+			match;		/*!< Positions within text */
+
+	const ib_vector_t*
+			tokens;		/*!< Tokens to match */
+
+	ulint		distance;	/*!< For matching on proximity
+					distance. Can be 0 for exact match */
+	CHARSET_INFO*	charset;	/*!< Phrase match charset */
+	mem_heap_t*     heap;		/*!< Heap for word processing */
+	ulint		zip_size;	/*!< row zip size */
+};
+
+/** For storing the frequncy of a word/term in a document */
+struct fts_doc_freq_struct {
+	doc_id_t	doc_id;		/*!< Document id */
+	ulint		freq;		/*!< Frequency of a word in a document */
+};
+
+/** To determine the word frequency per document. */
+struct fts_word_freq_struct {
+	byte*		word;		/*!< Word for which we need the freq,
+					it's allocated on the query heap */
+
+	ib_rbt_t*	doc_freqs;	/*!< RB Tree for storing per document
+					word frequencies. The elements are
+					of type fts_doc_freq_t */
+	ulint		doc_count;	/*!< Total number of documents that
+					contain this word */
+	double		idf;		/*!< Inverse document frequency */
+};
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record.
+@return always TRUE */
+static
+ibool
+fts_query_index_fetch_nodes(
+/*========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg);	/*!< in: pointer to ib_vector_t */
+
+/********************************************************************
+Read and filter nodes.
+@return fts_node_t instance */
+static
+void
+fts_query_filter_doc_ids(
+/*=====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	const byte*	word,		/*!< in: the current word */
+	fts_word_freq_t*word_freq,	/*!< in/out: word frequency */
+	const fts_node_t*
+			node,		/*!< in: current FTS node */
+	void*		data,		/*!< in: doc id ilist */
+	ulint		len,		/*!< in: doc id ilist size */
+	ibool		calc_doc_count);/*!< in: whether to remember doc
+					count */
+
+#if 0
+/*****************************************************************//***
+Find a doc_id in a word's ilist.
+@return TRUE if found. */
+static
+ibool
+fts_query_find_doc_id(
+/*==================*/
+	fts_select_t*	select,		/*!< in/out: search the doc id selected,
+					update the frequency if found. */
+	void*		data,		/*!< in: doc id ilist */
+	ulint		len);		/*!< in: doc id ilist size */
+#endif
+
+/*************************************************************//**
+This function implements a simple "blind" query expansion search:
+words in documents found in the first search pass will be used as
+search arguments to search the document again, thus "expand"
+the search result set.
+@return DB_SUCCESS if success, otherwise the error code */
+static
+ulint
+fts_expand_query(
+/*=============*/
+	dict_index_t*	index,		/*!< in: FTS index to search */
+	fts_query_t*	query);		/*!< in: query result, to be freed
+					by the client */
+/*************************************************************//**
+This function finds documents that contain all words in a
+phrase or proximity search. And if proximity search, verify
+the words are close to each other enough, as in specified distance.
+This function is called for phrase and proximity search.
+@return TRUE if documents are found, FALSE if otherwise */
+static
+ibool
+fts_check_phrase_proximity(
+/*=======================*/
+	fts_query_t*	query,		/*!< in:  query instance */
+	ib_vector_t*	tokens);	/*!< in: Tokens contain words */
+/*************************************************************//**
+This function check the words in result document are close to each
+other enough (within proximity rnage). This is used for proximity search.
+@return TRUE if words are close to each other, FALSE if otherwise */
+static
+ulint
+fts_proximity_check_position(
+/*=========================*/
+	fts_match_t**	match,		/*!< in: query instance */
+	ulint		num_match,	/*!< in: number of matching
+					items */
+	ulint		distance);	/*!< in: distance value
+					for proximity search */
+#if 0
+/********************************************************************
+Get the total number of words in a documents. */
+static
+ulint
+fts_query_terms_in_document(
+/*========================*/
+					/*!< out: DB_SUCCESS if all went well
+					else error code */
+	fts_query_t*	query,		/*!< in: FTS query state */
+	doc_id_t	doc_id,		/*!< in: the word to check */
+	ulint*		total);		/*!< out: total words in document */
+#endif
+
+/********************************************************************
+Compare two fts_doc_freq_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_freq_doc_id_cmp(
+/*================*/
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2)			/*!< in: id2 */
+{
+	const fts_doc_freq_t*	fq1 = (const fts_doc_freq_t*) p1;
+	const fts_doc_freq_t*	fq2 = (const fts_doc_freq_t*) p2;
+
+	return((int) (fq1->doc_id - fq2->doc_id));
+}
+
+#if 0
+/*******************************************************************//**
+Print the table used for calculating LCS. */
+static
+void
+fts_print_lcs_table(
+/*================*/
+	const ulint*	table,		/*!< in: array to print */
+	ulint		n_rows,		/*!< in: total no. of rows */
+	ulint		n_cols)		/*!< in: total no. of cols */
+{
+	ulint		i;
+
+	for (i = 0; i < n_rows; ++i) {
+		ulint	j;
+
+		printf("\n");
+
+		for (j = 0; j < n_cols; ++j) {
+
+			printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
+		}
+	}
+}
+
+/********************************************************************
+Find the longest common subsequence between the query string and
+the document. */
+static
+ulint
+fts_query_lcs(
+/*==========*/
+					/*!< out: LCS (length) between
+					two ilists */
+	const	ulint*	p1,		/*!< in: word positions of query */
+	ulint	len_p1,			/*!< in: no. of elements in p1 */
+	const	ulint*	p2,		/*!< in: word positions within document */
+	ulint	len_p2)			/*!< in: no. of elements in p2 */
+{
+	int	i;
+	ulint	len = 0;
+	ulint	r = len_p1;
+	ulint	c = len_p2;
+	ulint	size = (r + 1) * (c + 1) * sizeof(ulint);
+	ulint*	table = (ulint*) ut_malloc(size);
+
+	/* Traverse the table backwards, from the last row to the first and
+	also from the last column to the first. We compute the smaller
+	common subsequeces first, then use the caluclated values to determine
+	the longest common subsequence. The result will be in TABLE[0][0]. */
+	for (i = r; i >= 0; --i) {
+		int	j;
+
+		for (j = c; j >= 0; --j) {
+
+			if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
+
+				FTS_ELEM(table, c, i, j) = 0;
+
+			} else if (p1[i] == p2[j]) {
+
+				FTS_ELEM(table, c, i, j) = FTS_ELEM(
+					table, c, i + 1, j + 1) + 1;
+
+			} else {
+
+				ulint	value;
+
+				value = ut_max(
+					FTS_ELEM(table, c, i + 1, j),
+					FTS_ELEM(table, c, i, j + 1));
+
+				FTS_ELEM(table, c, i, j) = value;
+			}
+		}
+	}
+
+	len = FTS_ELEM(table, c, 0, 0);
+
+	fts_print_lcs_table(table, r, c);
+	printf("\nLen=%lu\n", len);
+
+	ut_free(table);
+
+	return(len);
+}
+#endif
+
+/*******************************************************************//**
+Compare two byte* arrays.
+@return 0 if p1 == p2, < 0 if p1 <  p2, > 0 if p1 >  p2 */
+static
+int
+fts_query_strcmp(
+/*=============*/
+	const void*	p1,		/*!< in: pointer to elem */
+	const void*	p2)		/*!< in: pointer to elem */
+{
+	void* temp = const_cast<void*>(p2);
+
+	return(strcmp(static_cast<const char*>(p1),
+		      *(static_cast <char**>(temp))));
+}
+
+/*******************************************************************//**
+Compare two fts_ranking_t instance on their rank value and doc ids in
+descending order on the rank and ascending order on doc id.
+@return 0 if p1 == p2, < 0 if p1 <  p2, > 0 if p1 >  p2 */
+static
+int
+fts_query_compare_rank(
+/*===================*/
+	const void*	p1,		/*!< in: pointer to elem */
+	const void*	p2)		/*!< in: pointer to elem */
+{
+	const fts_ranking_t*	r1 = (const fts_ranking_t*) p1;
+	const fts_ranking_t*	r2 = (const fts_ranking_t*) p2;
+
+	if (r2->rank < r1->rank) {
+		return(-1);
+	} else if (r2->rank == r1->rank) {
+
+		if (r1->doc_id < r2->doc_id) {
+			return(1);
+		} else if (r1->doc_id > r2->doc_id) {
+			return(1);
+		}
+
+		return(0);
+	}
+
+	return(1);
+}
+
+#ifdef FTS_UTF8_DEBUG
+/*******************************************************************//**
+Convert string to lowercase.
+@return lower case string, callers responsibility to delete using
+ut_free() */
+static
+byte*
+fts_tolower(
+/*========*/
+	const byte*	src,		/*!< in: src string */
+	ulint		len)		/*!< in: src string length */
+{
+	fts_string_t	str;
+	byte*		lc_str = ut_malloc(len + 1);
+
+	str.f_len = len;
+	str.f_str = lc_str;
+
+	memcpy(str.f_str, src, len);
+
+	/* Make sure the last byte is NUL terminated */
+	str.f_str[len] = '\0';
+
+	fts_utf8_tolower(&str);
+
+	return(lc_str);
+}
+
+/*******************************************************************//**
+Do a case insensitive search. Doesn't check for NUL byte end marker
+only relies on len. Convert str2 to lower case before comparing.
+@return 0 if p1 == p2, < 0 if p1 <  p2, > 0 if p1 >  p2 */
+static
+int
+fts_utf8_strcmp(
+/*============*/
+	const fts_string_t*
+			str1,		/*!< in: should be lower case*/
+
+	fts_string_t*	str2)		/*!< in: any case. We will use the length
+					of this string during compare as it
+					should be the min of the two strings */
+{
+	byte		b = str2->f_str[str2->f_len];
+
+	ut_a(str2->f_len <= str1->f_len);
+
+	/* We need to write a NUL byte at the end of the string because the
+	string is converted to lowercase by a MySQL function which doesn't
+	care about the length. */
+	str2->f_str[str2->f_len] = 0;
+
+	fts_utf8_tolower(str2);
+
+	/* Restore the value we replaced above. */
+	str2->f_str[str2->f_len] = b;
+
+	return(memcmp(str1->f_str, str2->f_str, str2->f_len));
+}
+#endif
+
+/*******************************************************************//**
+Add a word if it doesn't exist, to the term freq RB tree. We store
+a pointer to the word that is passed in as the argument.
+@return pointer to word */
+static
+fts_word_freq_t*
+fts_query_add_word_freq(
+/*====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	const byte*	word)		/*!< in: term/word to add */
+{
+	ib_rbt_bound_t		parent;
+
+	/* Lookup the word in our rb tree and add if it doesn't exist. */
+	if (rbt_search(query->word_freqs, &parent, word) != 0) {
+		fts_word_freq_t	word_freq;
+		ulint		len = ut_strlen((char*) word) + 1;
+
+		memset(&word_freq, 0, sizeof(word_freq));
+
+		word_freq.word = static_cast<byte*>(
+			mem_heap_alloc(query->heap, len));
+
+		/* Need to copy the NUL character too. */
+		memcpy(word_freq.word, word, len);
+
+		word_freq.doc_count = 0;
+
+		word_freq.doc_freqs = rbt_create(
+			sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
+
+		parent.last = rbt_add_node(
+			query->word_freqs, &parent, &word_freq);
+	}
+
+	return(rbt_value(fts_word_freq_t, parent.last));
+}
+
+/*******************************************************************//**
+Add a doc id if it doesn't exist, to the doc freq RB tree.
+@return pointer to word */
+static
+fts_doc_freq_t*
+fts_query_add_doc_freq(
+/*===================*/
+	ib_rbt_t*	doc_freqs,	/*!< in: rb tree of fts_doc_freq_t */
+	doc_id_t	doc_id)		/*!< in: doc id to add */
+{
+	ib_rbt_bound_t	parent;
+
+	/* Lookup the doc id in our rb tree and add if it doesn't exist. */
+	if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
+		fts_doc_freq_t	doc_freq;
+
+		memset(&doc_freq, 0, sizeof(doc_freq));
+
+		doc_freq.freq = 0;
+		doc_freq.doc_id = doc_id;
+
+		parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
+	}
+
+	return(rbt_value(fts_doc_freq_t, parent.last));
+}
+
+/*******************************************************************//**
+Add the doc id to the query set only if it's not in the
+deleted array. */
+static
+void
+fts_query_union_doc_id(
+/*===================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	doc_id_t	doc_id,		/*!< in: the doc id to add */
+	fts_rank_t	rank)		/*!< in: if non-zero, it is the
+					rank associated with the doc_id */
+{
+	ib_rbt_bound_t	parent;
+	ulint		size = ib_vector_size(query->deleted->doc_ids);
+	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
+
+	/* Check if the doc id is deleted and it's not already in our set. */
+	if (fts_bsearch(array, 0, size, doc_id) < 0
+	    && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
+
+		fts_ranking_t	ranking;
+
+		ranking.rank = rank;
+		ranking.doc_id = doc_id;
+		ranking.words = rbt_create(sizeof(byte*), fts_query_strcmp);
+
+		rbt_add_node(query->doc_ids, &parent, &ranking);
+	}
+}
+
+/*******************************************************************//**
+Remove the doc id from the query set only if it's not in the
+deleted set. */
+static
+void
+fts_query_remove_doc_id(
+/*====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	doc_id_t	doc_id)		/*!< in: the doc id to add */
+{
+	ib_rbt_bound_t	parent;
+	ulint		size = ib_vector_size(query->deleted->doc_ids);
+	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
+
+	/* Check if the doc id is deleted and it's in our set. */
+	if (fts_bsearch(array, 0, size, doc_id) < 0
+	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+
+		fts_ranking_t*	ranking;
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+		rbt_free(ranking->words);
+
+		ut_free(rbt_remove_node(query->doc_ids, parent.last));
+	}
+}
+
+/*******************************************************************//**
+Find the doc id in the query set but not in the deleted set, artificialy
+downgrade or upgrade its ranking by a value and make/initialize its ranking
+under or above its normal range 0 to 1. This is used for Boolean Search
+operator such as Negation operator, which makes word's contribution to the
+row's relevance to be negative */
+static
+void
+fts_query_change_ranking(
+/*====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	doc_id_t	doc_id,		/*!< in: the doc id to add */
+	ibool		downgrade)	/*!< in: Whether to downgrade ranking */
+{
+	ib_rbt_bound_t	parent;
+	ulint		size = ib_vector_size(query->deleted->doc_ids);
+	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
+
+	/* Check if the doc id is deleted and it's in our set. */
+	if (fts_bsearch(array, 0, size, doc_id) < 0
+	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+
+		fts_ranking_t*	ranking;
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+
+		ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
+
+		/* Allow at most 2 adjustment by RANK_DOWNGRADE (-0.5)
+		and RANK_UPGRADE (0.5) */
+		if (ranking->rank >= 1.0F) {
+			ranking->rank = 1.0F;
+		} else if (ranking->rank <= -1.0F) {
+			ranking->rank = -1.0F;
+		}
+	}
+}
+
+/*******************************************************************//**
+Check the doc id in the query set only if it's not in the
+deleted array. The doc ids that were found are stored in
+another rb tree (fts_query_t::intersect). */
+static
+void
+fts_query_intersect_doc_id(
+/*=======================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	doc_id_t	doc_id,		/*!< in: the doc id to add */
+	fts_rank_t	rank)		/*!< in: if non-zero, it is the
+					rank associated with the doc_id */
+{
+	ib_rbt_bound_t	parent;
+	ulint		size = ib_vector_size(query->deleted->doc_ids);
+	fts_update_t*	array = (fts_update_t*) query->deleted->doc_ids->data;
+	fts_ranking_t*	ranking;
+
+	/* Check if the doc id is deleted and it's in our set */
+	if (fts_bsearch(array, 0, size, doc_id) < 0) {
+		/* If this is the first FTS_EXIST we encountered, all of its
+		value must be in intersect list */
+		if (!query->multi_exist) {
+			fts_ranking_t	new_ranking;
+
+			if (rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+				ranking = rbt_value(fts_ranking_t, parent.last);
+				rank += (ranking->rank > 0)
+					? ranking->rank : RANK_UPGRADE;
+				if (rank >= 1.0F) {
+					rank = 1.0F;
+				}
+			}
+
+			new_ranking.rank = rank;
+			new_ranking.doc_id = doc_id;
+			new_ranking.words = rbt_create(
+				sizeof(byte*), fts_query_strcmp);
+			ranking = &new_ranking;
+
+			if (rbt_search(query->intersection, &parent,
+				       ranking) != 0) {
+				rbt_add_node(query->intersection,
+					     &parent, ranking);
+			} else {
+				rbt_free(new_ranking.words);
+			}
+		} else {
+
+			if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
+				return;
+			}
+
+			ranking = rbt_value(fts_ranking_t, parent.last);
+
+			ranking->rank = rank;
+
+			if (ranking->words != NULL
+			    && rbt_search(query->intersection, &parent,
+					  ranking) != 0) {
+				rbt_add_node(query->intersection, &parent,
+					     ranking);
+
+				/* Note that the intersection has taken
+				ownership of the ranking data. */
+				ranking->words = NULL;
+			}
+		}
+	}
+}
+
+/*******************************************************************//**
+Free the document ranking rb tree. */
+static
+void
+fts_query_free_doc_ids(
+/*===================*/
+	ib_rbt_t*	doc_ids)	/*!< in: rb tree to free */
+{
+	const ib_rbt_node_t*	node;
+
+	for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
+
+		fts_ranking_t*	ranking;
+
+		ranking = rbt_value(fts_ranking_t, node);
+
+		if (ranking->words) {
+			rbt_free(ranking->words);
+			ranking->words = NULL;
+		}
+
+		ut_free(rbt_remove_node(doc_ids, node));
+	}
+
+	rbt_free(doc_ids);
+}
+
+/*******************************************************************//**
+Add the word to the documents "list" of matching words from
+the query. We make a copy of the word from the query heap. */
+static
+void
+fts_query_add_word_to_document(
+/*===========================*/
+	fts_query_t*		query,	/*!< in: query to update */
+	doc_id_t		doc_id,	/*!< in: the document to update */
+	const byte*		word)	/*!< in: the token to add */
+{
+	ib_rbt_bound_t		parent;
+	fts_ranking_t*		ranking = NULL;
+
+	/* First we search the intersection RB tree as it could have
+	taken ownership of the words rb tree instance. */
+	if (query->intersection
+	    && rbt_search(query->intersection, &parent, &doc_id) == 0) {
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+	}
+
+	if (ranking == NULL
+	    && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+	}
+
+	if (ranking != NULL) {
+		ulint	len;
+		byte*	term;
+
+		len = ut_strlen((char*) word) + 1;
+
+		term = static_cast<byte*>(mem_heap_alloc(query->heap, len));
+
+		/* Need to copy the NUL character too. */
+		memcpy(term, (char*) word, len);
+
+		/* The current set must have ownership of the RB tree. */
+		ut_a(ranking->words != NULL);
+
+		/* If the word doesn't exist in the words "list" we add it. */
+		if (rbt_search(ranking->words, &parent, term) != 0) {
+			rbt_add_node(ranking->words, &parent, &term);
+		}
+	}
+}
+
+/*******************************************************************//**
+Check the node ilist. */
+static
+void
+fts_query_check_node(
+/*=================*/
+	fts_query_t*		query,	/*!< in: query to update */
+	const fts_string_t*	token,	/*!< in: the token to search */
+	const fts_node_t*	node)	/*!< in: node to check */
+{
+	/* Skip nodes whose doc ids are out range. */
+	if (query->oper == FTS_EXIST
+	    && ((query->upper_doc_id > 0
+		&& node->first_doc_id > query->upper_doc_id)
+		|| (query->lower_doc_id > 0
+		    && node->last_doc_id < query->lower_doc_id))) {
+
+		/* Ignore */
+
+	} else {
+		int		ret;
+		ib_rbt_bound_t	parent;
+		ulint		ilist_size = node->ilist_size;
+		fts_word_freq_t*word_freqs;
+
+		/* The word must exist. */
+		ret = rbt_search(query->word_freqs, &parent, token->f_str);
+		ut_a(ret == 0);
+
+		word_freqs = rbt_value(fts_word_freq_t, parent.last);
+
+		fts_query_filter_doc_ids(
+			query, token->f_str, word_freqs, node,
+			node->ilist, ilist_size, TRUE);
+	}
+}
+
+/*****************************************************************//**
+Search index cache for word with wildcard match.
+@return number of words matched */
+static
+ulint
+fts_cache_find_wildcard(
+/*====================*/
+	fts_query_t*		query,		/*!< in: query instance */
+	const fts_index_cache_t*index_cache,	/*!< in: cache to search */
+	const fts_string_t*	token)		/*!< in: token to search */
+{
+	ib_rbt_bound_t		parent;
+	const ib_vector_t*	nodes = NULL;
+	fts_string_t		srch_text;
+	byte			term[FTS_MAX_WORD_LEN + 1];
+	ulint			num_word = 0;
+
+	srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
+			? token->f_len - 1
+			: token->f_len;
+
+	strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
+	term[srch_text.f_len] = '\0';
+	srch_text.f_str = term;
+
+	/* Lookup the word in the rb tree */
+	if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
+			   innobase_fts_text_cmp_prefix) == 0) {
+		const fts_tokenizer_word_t*     word;
+		ulint				i;
+		const ib_rbt_node_t*		cur_node;
+		ibool				forward = FALSE;
+
+		word = rbt_value(fts_tokenizer_word_t, parent.last);
+		cur_node = parent.last;
+
+		while (innobase_fts_text_cmp_prefix(
+			index_cache->charset, &srch_text, &word->text) == 0) {
+
+			nodes = word->nodes;
+
+			for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
+				int                     ret;
+				const fts_node_t*       node;
+				ib_rbt_bound_t          freq_parent;
+				fts_word_freq_t*	word_freqs;
+
+				node = static_cast<const fts_node_t*>(
+					ib_vector_get_const(nodes, i));
+
+				ret = rbt_search(query->word_freqs,
+						 &freq_parent,
+						 srch_text.f_str);
+
+				ut_a(ret == 0);
+
+				word_freqs = rbt_value(
+					fts_word_freq_t,
+					freq_parent.last);
+
+				fts_query_filter_doc_ids(
+					query, srch_text.f_str,
+					word_freqs, node,
+					node->ilist, node->ilist_size, TRUE);
+			}
+
+			num_word++;
+
+			if (!forward) {
+				cur_node = rbt_prev(
+					index_cache->words, cur_node);
+			} else {
+cont_search:
+				cur_node = rbt_next(
+					index_cache->words, cur_node);
+			}
+
+			if (!cur_node) {
+				break;
+			}
+
+			word = rbt_value(fts_tokenizer_word_t, cur_node);
+		}
+
+		if (!forward) {
+			forward = TRUE;
+			cur_node = parent.last;
+			goto cont_search;
+		}
+	}
+
+	return(num_word);
+}
+
+/*****************************************************************//**
+Set difference.
+@return DB_SUCCESS if all went well */
+static
+ulint
+fts_query_difference(
+/*=================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	token)	/*!< in: token to search */
+{
+	ulint			n_doc_ids= 0;
+	trx_t*			trx = query->trx;
+	dict_table_t*		table = query->index->table;
+
+	ut_a(query->oper == FTS_IGNORE);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	fprintf(stderr, "DIFFERENCE: Searching: '%.*s'\n",
+		(int) token->f_len, token->f_str);
+#endif
+
+	if (query->doc_ids) {
+		n_doc_ids = rbt_size(query->doc_ids);
+	}
+
+	/* There is nothing we can substract from an empty set. */
+	if (query->doc_ids && !rbt_empty(query->doc_ids)) {
+		ulint			i;
+		fts_fetch_t		fetch;
+		const ib_vector_t*	nodes;
+		const fts_index_cache_t*index_cache;
+		que_t*			graph = NULL;
+		fts_cache_t*		cache = table->fts->cache;
+
+		rw_lock_x_lock(&cache->lock);
+
+		index_cache = fts_find_index_cache(cache, query->index);
+
+		/* Must find the index cache */
+		ut_a(index_cache != NULL);
+
+		/* Search the cache for a matching word first. */
+		nodes = fts_cache_find_word(index_cache, token);
+
+		for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
+			const fts_node_t*	node;
+
+			node = static_cast<const fts_node_t*>(
+				ib_vector_get_const(nodes, i));
+
+			fts_query_check_node(query, token, node);
+		}
+
+		rw_lock_x_unlock(&cache->lock);
+
+		/* Setup the callback args for filtering and
+		consolidating the ilist. */
+		fetch.read_arg = query;
+		fetch.read_record = fts_query_index_fetch_nodes;
+
+		query->error = fts_index_fetch_nodes(
+			trx, &graph, &query->fts_index_table, token, &fetch);
+
+		fts_que_graph_free(graph);
+	}
+
+	/* The size can't increase. */
+	ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Intersect the token doc ids with the current set.
+@return DB_SUCCESS if all went well */
+static
+ulint
+fts_query_intersect(
+/*================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	token)	/*!< in: the token to search */
+{
+	ulint			n_doc_ids = 0;
+	trx_t*			trx = query->trx;
+	dict_table_t*		table = query->index->table;
+
+	ut_a(query->oper == FTS_EXIST);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	fprintf(stderr, "INTERSECT: Searching: '%.*s'\n",
+		(int) token->f_len, token->f_str);
+#endif
+
+	if (!query->inited) {
+
+		ut_a(rbt_empty(query->doc_ids));
+
+		/* Since this is the first time we need to convert this
+		intersection query into a union query. Otherwise we
+		will end up with an empty set. */
+		query->oper = FTS_NONE;
+		query->inited = TRUE;
+	}
+
+	if (query->doc_ids) {
+		n_doc_ids = rbt_size(query->doc_ids);
+	}
+
+	/* If the words set is not empty or this is the first time. */
+
+	if (!rbt_empty(query->doc_ids) || query->oper == FTS_NONE) {
+		ulint			i;
+		fts_fetch_t		fetch;
+		const ib_vector_t*	nodes;
+		const fts_index_cache_t*index_cache;
+		que_t*			graph = NULL;
+		fts_cache_t*		cache = table->fts->cache;
+
+		ut_a(!query->intersection);
+
+		/* Only if this is not the first time. */
+		if (query->oper != FTS_NONE) {
+
+			/* Create the rb tree that will hold the doc ids of
+			the intersection. */
+			query->intersection = rbt_create(
+				sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+		}
+
+		/* This is to avoid decompressing the ilist if the
+		node's ilist doc ids are out of range. */
+		if (!rbt_empty(query->doc_ids) && query->multi_exist) {
+			const ib_rbt_node_t*	node;
+			doc_id_t*		doc_id;
+
+			node = rbt_first(query->doc_ids);
+			doc_id = rbt_value(doc_id_t, node);
+			query->lower_doc_id = *doc_id;
+
+			node = rbt_last(query->doc_ids);
+			doc_id = rbt_value(doc_id_t, node);
+			query->upper_doc_id = *doc_id;
+
+		} else {
+			query->lower_doc_id = 0;
+			query->upper_doc_id = 0;
+		}
+
+		/* Search the cache for a matching word first. */
+
+		rw_lock_x_lock(&cache->lock);
+
+		/* Search for the index specific cache. */
+		index_cache = fts_find_index_cache(cache, query->index);
+
+		/* Must find the index cache. */
+		ut_a(index_cache != NULL);
+
+		if (query->cur_node->term.wildcard) {
+			/* Wildcard search the index cache */
+			fts_cache_find_wildcard(query, index_cache, token);
+		} else {
+			nodes = fts_cache_find_word(index_cache, token);
+
+			for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
+				const fts_node_t*	node;
+
+				node = static_cast<const fts_node_t*>(
+					ib_vector_get_const(nodes, i));
+
+				fts_query_check_node(query, token, node);
+			}
+		}
+
+		rw_lock_x_unlock(&cache->lock);
+
+		/* Setup the callback args for filtering and
+		consolidating the ilist. */
+		fetch.read_arg = query;
+		fetch.read_record = fts_query_index_fetch_nodes;
+
+		query->error = fts_index_fetch_nodes(
+			trx, &graph, &query->fts_index_table, token, &fetch);
+
+		fts_que_graph_free(graph);
+
+		if (query->error == DB_SUCCESS) {
+			if (query->oper == FTS_EXIST) {
+
+				/* The size can't increase. */
+				ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
+			}
+
+			/* Make the intesection (rb tree) the current doc id
+			set and free the old set. */
+			if (query->intersection) {
+				fts_query_free_doc_ids(query->doc_ids);
+				query->doc_ids = query->intersection;
+				query->intersection = NULL;
+			}
+
+			/* Reset the set operation to intersect. */
+			query->oper = FTS_EXIST;
+		}
+	}
+
+	if (!query->multi_exist) {
+		query->multi_exist = TRUE;
+	}
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Query index cache.
+@return DB_SUCCESS if all went well */
+static
+ulint
+fts_query_cache(
+/*============*/
+	fts_query_t*		query,	/*!< in/out: query instance */
+	const fts_string_t*	token)	/*!< in: token to search */
+{
+	const fts_index_cache_t*index_cache;
+	dict_table_t*		table = query->index->table;
+	fts_cache_t*		cache = table->fts->cache;
+
+	/* Search the cache for a matching word first. */
+	rw_lock_x_lock(&cache->lock);
+
+	/* Search for the index specific cache. */
+	index_cache = fts_find_index_cache(cache, query->index);
+
+	/* Must find the index cache. */
+	ut_a(index_cache != NULL);
+
+	if (query->cur_node->term.wildcard
+	    && query->flags != FTS_PROXIMITY
+	    && query->flags != FTS_PHRASE) {
+		/* Wildcard search the index cache */
+		fts_cache_find_wildcard(query, index_cache, token);
+	} else {
+		const ib_vector_t*      nodes;
+		ulint			i;
+
+		nodes = fts_cache_find_word(index_cache, token);
+
+		for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
+			const fts_node_t*	node;
+
+			node = static_cast<const fts_node_t*>(
+				ib_vector_get_const(nodes, i));
+
+			fts_query_check_node(query, token, node);
+		}
+	}
+
+	rw_lock_x_unlock(&cache->lock);
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Set union.
+@return DB_SUCCESS if all went well */
+static
+ulint
+fts_query_union(
+/*============*/
+	fts_query_t*		query,	/*!< in: query instance */
+	fts_string_t*		token)	/*!< in: token to search */
+{
+	fts_fetch_t		fetch;
+	ulint			n_doc_ids = 0;
+	trx_t*			trx = query->trx;
+	que_t*			graph = NULL;
+
+	ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
+	     query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	fprintf(stderr, "UNION: Searching: '%.*s'\n",
+		(int) token->f_len, token->f_str);
+#endif
+
+	query->error = DB_SUCCESS;
+
+	if (query->doc_ids) {
+		n_doc_ids = rbt_size(query->doc_ids);
+	}
+
+	if (token->f_len == 0) {
+		return(query->error);
+	}
+
+	/* Single '%' would confuse parser in pars_like_rebind(). In addition,
+	our wildcard search only supports prefix search */
+	if (*token->f_str == '%') {
+		if (token->f_len == 1) {
+			return(query->error);
+		}
+		token->f_str++;
+		token->f_len--;
+	}
+
+	fts_query_cache(query, token);
+
+	/* Setup the callback args for filtering and
+	consolidating the ilist. */
+	fetch.read_arg = query;
+	fetch.read_record = fts_query_index_fetch_nodes;
+
+	/* Read the nodes from disk. */
+	query->error = fts_index_fetch_nodes(
+		trx, &graph, &query->fts_index_table, token, &fetch);
+
+	fts_que_graph_free(graph);
+
+	if (query->error == DB_SUCCESS) {
+
+		/* The size can't decrease. */
+		ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
+
+		/* Calulate the number of doc ids that were added to
+		the current doc id set. */
+		if (query->doc_ids) {
+			n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
+		}
+
+		/* In case there were no matching docs then we reset the
+		state, otherwise intersection will not be able to detect
+		that it's being called for the first time. */
+		if (!rbt_empty(query->doc_ids)) {
+			query->inited = TRUE;
+		}
+	}
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Depending upon the current query operator process the doc id. */
+static
+void
+fts_query_process_doc_id(
+/*=====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	doc_id_t	doc_id,		/*!< in: doc id to process */
+	fts_rank_t	rank)		/*!< in: if non-zero, it is the
+					rank associated with the doc_id */
+{
+	switch (query->oper) {
+	case FTS_NONE:
+		fts_query_union_doc_id(query, doc_id, rank);
+		break;
+
+	case FTS_EXIST:
+		fts_query_intersect_doc_id(query, doc_id, rank);
+		break;
+
+	case FTS_IGNORE:
+		fts_query_remove_doc_id(query, doc_id);
+		break;
+
+	case FTS_NEGATE:
+		fts_query_change_ranking(query, doc_id, TRUE);
+		break;
+
+	case FTS_DECR_RATING:
+		fts_query_union_doc_id(query, doc_id, rank);
+		fts_query_change_ranking(query, doc_id, TRUE);
+		break;
+
+	case FTS_INCR_RATING:
+		fts_query_union_doc_id(query, doc_id, rank);
+		fts_query_change_ranking(query, doc_id, FALSE);
+		break;
+
+	default:
+		ut_error;
+	}
+}
+
+/*****************************************************************//**
+Merge two result sets. */
+static
+void
+fts_merge_doc_ids(
+/*==============*/
+	fts_query_t*	query,		/*!< in,out: query instance */
+	const ib_rbt_t*	doc_ids)	/*!< in: result set to merge */
+{
+	const ib_rbt_node_t*	node;
+
+	ut_a(!rbt_empty(doc_ids));
+	ut_a(!query->intersection);
+
+	/* To process FTS_EXIST operation (intersection), we need
+	to create a new result set for fts_query_intersect(). */
+	if (query->oper == FTS_EXIST) {
+
+		query->intersection = rbt_create(
+			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+	}
+
+	/* Merge the elements to the result set. */
+	for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
+		fts_ranking_t*		ranking;
+
+		ranking = rbt_value(fts_ranking_t, node);
+
+		fts_query_process_doc_id(
+			query, ranking->doc_id, ranking->rank);
+	}
+
+	/* If it is an intersection operation, reset query->doc_ids
+	to query->intersection and free the old result list. */
+	if (query->oper == FTS_EXIST && query->intersection != NULL) {
+		fts_query_free_doc_ids(query->doc_ids);
+		query->doc_ids = query->intersection;
+		query->intersection = NULL;
+	}
+}
+
+/*****************************************************************//**
+Skip non-whitespace in a string. Move ptr to the next word boundary.
+@return pointer to first whitespace character or end */
+UNIV_INLINE
+byte*
+fts_query_skip_word(
+/*================*/
+	byte*		ptr,		/*!< in: start of scan */
+	const byte*	end)		/*!< in: pointer to end of string */
+{
+	/* TODO: Does this have to be UTF-8 too ? */
+	while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
+		++ptr;
+	}
+
+	return(ptr);
+}
+
+/*****************************************************************//**
+Check whether the remaining terms in the phrase match the text.
+@return TRUE if matched else FALSE */
+static
+ibool
+fts_query_match_phrase_terms(
+/*=========================*/
+	fts_phrase_t*	phrase,		/*!< in: phrase to match */
+	byte**		start,		/*!< in/out: text to search, we can't
+					make this const becase we need to
+					first convert the string to
+					lowercase */
+	const byte*	end,		/*!< in: pointer to the end of
+					the string to search */
+	mem_heap_t*	heap)		/*!< in: heap */
+{
+	ulint			i;
+	byte*			ptr = *start;
+	const ib_vector_t*	tokens = phrase->tokens;
+	ulint			distance = phrase->distance;
+
+	/* We check only from the second term onwards, since the first
+	must have matched otherwise we wouldn't be here. */
+	for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
+		fts_string_t		match;
+		fts_string_t		cmp_str;
+		const fts_string_t*	token;
+		int			result;
+		ulint			ret;
+		ulint			offset;
+
+		ret = innobase_mysql_fts_get_token(
+			phrase->charset, ptr, (byte*) end,
+			&match, &offset);
+
+		if (match.f_len > 0) {
+			/* Get next token to match. */
+			token = static_cast<const fts_string_t*>(
+				ib_vector_get_const(tokens, i));
+
+			fts_utf8_string_dup(&cmp_str, &match, heap);
+
+			result = innobase_fts_text_case_cmp(
+				phrase->charset, token, &cmp_str);
+
+			/* Skip the rest of the tokens if this one doesn't
+			match and the proximity distance is exceeded. */
+			if (result
+			    && (distance == ULINT_UNDEFINED
+				|| distance == 0)) {
+
+				break;
+			}
+
+			/* This token matched move to the next token. */
+			if (result == 0) {
+				/* Advance the text to search by the length
+				of the last token. */
+				ptr += ret;
+
+				/* Advance to the next token. */
+				++i;
+			} else {
+
+				ut_a(distance != ULINT_UNDEFINED);
+
+				ptr = fts_query_skip_word(ptr, end);
+			}
+
+			/* Distance can be 0 for exact matches. */
+			if (distance != ULINT_UNDEFINED && distance > 0) {
+				--distance;
+			}
+		} else {
+			ptr += ret;
+		}
+	}
+
+	*start = ptr;
+
+	/* Can't be greater than the number of elements. */
+	ut_a(i <= ib_vector_size(tokens));
+
+	/* This is the case for multiple words. */
+	if (i == ib_vector_size(tokens)) {
+		phrase->found = TRUE;
+	}
+
+	return(phrase->found);
+}
+
+/*****************************************************************//**
+Callback function to fetch and search the document.
+@return TRUE if matched else FALSE */
+static
+ibool
+fts_query_match_phrase(
+/*===================*/
+	fts_phrase_t*	phrase,		/*!< in: phrase to match */
+	byte*		start,		/*!< in: text to search, we can't make
+					this const becase we need to first
+					convert the string to lowercase */
+	ulint		cur_len,	/*!< in: length of text */
+	ulint		prev_len,	/*!< in: total length for searched
+					doc fields*/
+	mem_heap_t*	heap)		/* heap */
+{
+	ulint			i;
+	const fts_string_t*	first;
+	const byte*		end = start + cur_len;
+	const ib_vector_t*	tokens = phrase->tokens;
+	const ib_vector_t*	positions = phrase->match->positions;
+
+	ut_a(!phrase->found);
+	ut_a(phrase->match->doc_id > 0);
+	ut_a(ib_vector_size(tokens) > 0);
+	ut_a(ib_vector_size(positions) > 0);
+
+	first = static_cast<const fts_string_t*>(
+		ib_vector_get_const(tokens, 0));
+
+	ut_a(phrase->match->start < ib_vector_size(positions));
+
+	for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
+		ulint		pos;
+		fts_string_t	match;
+		fts_string_t	cmp_str;
+		byte*		ptr = start;
+		ulint		ret;
+		ulint		offset;
+
+		pos = *(ulint*) ib_vector_get_const(positions, i);
+
+		if (pos == ULINT_UNDEFINED) {
+			break;
+		}
+
+		if (pos < prev_len) {
+			continue;
+		}
+
+		/* Document positions are calculated from the beginning
+		of the first field, need to save the length for each
+		searched field to adjust the doc position when search
+		phrases. */
+		pos -= prev_len;
+		ptr = match.f_str = start + pos;
+
+		/* Within limits ? */
+		if (ptr >= end) {
+			break;
+		}
+
+		ret = innobase_mysql_fts_get_token(
+			phrase->charset, start + pos, (byte*) end,
+			&match, &offset);
+
+		if (match.f_len == 0) {
+			break;
+		}
+
+		fts_utf8_string_dup(&cmp_str, &match, heap);
+
+		if (innobase_fts_text_case_cmp(
+			phrase->charset, first, &cmp_str) == 0) {
+
+			/* This is the case for the single word
+			in the phrase. */
+			if (ib_vector_size(phrase->tokens) == 1) {
+				phrase->found = TRUE;
+				break;
+			}
+
+			ptr += ret;
+
+			/* Match the remaining terms in the phrase. */
+			if (fts_query_match_phrase_terms(phrase, &ptr,
+							 end, heap)) {
+				break;
+			}
+		}
+	}
+
+	return(phrase->found);
+}
+
+/*****************************************************************//**
+Callback function to fetch and search the document.
+@return whether the phrase is found */
+static
+ibool
+fts_query_fetch_document(
+/*=====================*/
+	void*		row,		/*!< in:  sel_node_t* */
+	void*		user_arg)	/*!< in:  fts_doc_t* */
+{
+
+	que_node_t*	exp;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	fts_phrase_t*	phrase = static_cast<fts_phrase_t*>(user_arg);
+	ulint		prev_len = 0;
+
+	exp = node->select_list;
+
+	phrase->found = FALSE;
+
+	while (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		void*		data = NULL;
+		ulint		cur_len;
+
+		if (dfield_is_ext(dfield)) {
+			data = btr_copy_externally_stored_field(
+				&cur_len, static_cast<const byte*>(data),
+				phrase->zip_size,
+				dfield_get_len(dfield), phrase->heap);
+		} else {
+			data = dfield_get_data(dfield);
+			cur_len = dfield_get_len(dfield);
+		}
+
+		if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
+			phrase->found =
+				fts_query_match_phrase(
+					phrase, static_cast<byte*>(data),
+					cur_len, prev_len, phrase->heap);
+		}
+
+		if (phrase->found) {
+			break;
+		}
+
+		/* Document positions are calculated from the beginning
+		of the first field, need to save the length for each
+		searched field to adjust the doc position when search
+		phrases. */
+		prev_len += cur_len + 1;
+		exp = que_node_get_next(exp);
+	}
+
+	return(phrase->found);
+}
+
+#if 0
+/********************************************************************
+Callback function to check whether a record was found or not. */
+static
+ibool
+fts_query_select(
+/*=============*/
+	void*		row,		/*!< in:  sel_node_t* */
+	void*		user_arg)	/*!< in:  fts_doc_t* */
+{
+	int		i;
+	que_node_t*	exp;
+	sel_node_t*	node = row;
+	fts_select_t*	select = user_arg;
+
+	ut_a(select->word_freq);
+	ut_a(select->word_freq->doc_freqs);
+
+	exp = node->select_list;
+
+	for (i = 0; exp && !select->found; ++i) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		void*		data = dfield_get_data(dfield);
+		ulint		len = dfield_get_len(dfield);
+
+		switch (i) {
+		case 0: /* DOC_COUNT */
+			if (len != UNIV_SQL_NULL && len != 0) {
+
+				select->word_freq->doc_count +=
+					mach_read_from_4(data);
+			}
+			break;
+
+		case 1: /* ILIST */
+			if (len != UNIV_SQL_NULL && len != 0) {
+
+				fts_query_find_doc_id(select, data, len);
+			}
+			break;
+
+		default:
+			ut_error;
+		}
+
+		exp = que_node_get_next(exp);
+	}
+
+	return(FALSE);
+}
+
+/********************************************************************
+Read the rows from the FTS index, that match word and where the
+doc id is between first and last doc id. */
+static
+ulint
+fts_query_find_term(
+/*================*/
+					/*!< out: DB_SUCCESS if all went well
+					else error code */
+	fts_query_t*		query,	/*!< in: FTS query state */
+	que_t**			graph,	/*!< in: prepared statement */
+	const fts_string_t*	word,	/*!< in: the word to fetch */
+	doc_id_t		doc_id,	/*!< in: doc id to match */
+	ulint*			min_pos,/*!< in/out: pos found must be
+					 greater than this minimum value. */
+	ibool*			found)	/*!< out: TRUE if found else FALSE */
+{
+	pars_info_t*		info;
+	ulint			error;
+	fts_select_t		select;
+	doc_id_t		match_doc_id;
+	trx_t*			trx = query->trx;
+
+	trx->op_info = "fetching FTS index matching nodes";
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		info = pars_info_create();
+	}
+
+	select.found = FALSE;
+	select.doc_id = doc_id;
+	select.min_pos = *min_pos;
+	select.word_freq = fts_query_add_word_freq(query, word->f_str);
+
+	pars_info_bind_function(info, "my_func", fts_query_select, &select);
+	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &match_doc_id, doc_id);
+
+	fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
+
+	fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
+
+	if (!*graph) {
+		ulint		selected;
+
+		selected = fts_select_index(*word->f_str);
+
+		query->fts_index_table.suffix = fts_get_suffix(selected);
+
+		*graph = fts_parse_sql(
+			&query->fts_index_table,
+			info,
+			"DECLARE FUNCTION my_func;\n"
+			"DECLARE CURSOR c IS"
+			" SELECT doc_count, ilist\n"
+			" FROM %s\n"
+			" WHERE word LIKE :word AND "
+			"	first_doc_id <= :min_doc_id AND "
+			"	last_doc_id >= :max_doc_id\n"
+			" ORDER BY first_doc_id;\n"
+			"BEGIN\n"
+			"\n"
+			"OPEN c;\n"
+			"WHILE 1 = 1 LOOP\n"
+			"  FETCH c INTO my_func();\n"
+			"  IF c % NOTFOUND THEN\n"
+			"    EXIT;\n"
+			"  END IF;\n"
+			"END LOOP;\n"
+			"CLOSE c;");
+	}
+
+	for(;;) {
+		error = fts_eval_sql(trx, *graph);
+
+		if (error == DB_SUCCESS) {
+
+			break;				/* Exit the loop. */
+		} else {
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, " InnoDB: Warning: lock wait "
+					"timeout reading FTS index. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, " InnoDB: Error: %lu "
+					"while reading FTS index.\n", error);
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	/* Value to return */
+	*found = select.found;
+
+	if (*found) {
+		*min_pos = select.min_pos;
+	}
+
+	return(error);
+}
+
+/********************************************************************
+Callback aggregator for int columns. */
+static
+ibool
+fts_query_sum(
+/*==========*/
+					/*!< out: always returns TRUE */
+	void*		row,		/*!< in:  sel_node_t* */
+	void*		user_arg)	/*!< in:  ulint* */
+{
+
+	que_node_t*	exp;
+	sel_node_t*	node = row;
+	ulint*		total = user_arg;
+
+	exp = node->select_list;
+
+	while (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		void*		data = dfield_get_data(dfield);
+		ulint		len = dfield_get_len(dfield);
+
+		if (len != UNIV_SQL_NULL && len != 0) {
+			*total += mach_read_from_4(data);
+		}
+
+		exp = que_node_get_next(exp);
+	}
+
+	return(TRUE);
+}
+
+/********************************************************************
+Calculate the total documents that contain a particular word (term). */
+static
+ulint
+fts_query_total_docs_containing_term(
+/*=================================*/
+					/*!< out: DB_SUCCESS if all went well
+					else error code */
+	fts_query_t*		query,	/*!< in: FTS query state */
+	const fts_string_t*	word,	/*!< in: the word to check */
+	ulint*			total)	/*!< out: documents containing word */
+{
+	pars_info_t*		info;
+	ulint			error;
+	que_t*			graph;
+	ulint			selected;
+	trx_t*			trx = query->trx;
+
+	trx->op_info = "fetching FTS index document count";
+
+	*total = 0;
+
+	info = pars_info_create();
+
+	pars_info_bind_function(info, "my_func", fts_query_sum, total);
+	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+	selected = fts_select_index(*word->f_str);
+
+	query->fts_index_table.suffix = fts_get_suffix(selected);
+
+	graph = fts_parse_sql(
+		&query->fts_index_table,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT doc_count\n"
+		" FROM %s\n"
+		" WHERE word = :word "
+		" ORDER BY first_doc_id;\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	for(;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+
+			break;				/* Exit the loop. */
+		} else {
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, " InnoDB: Warning: lock wait "
+					"timeout reading FTS index. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, " InnoDB: Error: %lu "
+					"while reading FTS index.\n", error);
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	fts_que_graph_free(graph);
+
+	return(error);
+}
+
+/********************************************************************
+Get the total number of words in a documents. */
+static
+ulint
+fts_query_terms_in_document(
+/*========================*/
+					/*!< out: DB_SUCCESS if all went well
+					else error code */
+	fts_query_t*	query,		/*!< in: FTS query state */
+	doc_id_t	doc_id,		/*!< in: the word to check */
+	ulint*		total)		/*!< out: total words in document */
+{
+	pars_info_t*	info;
+	ulint		error;
+	que_t*		graph;
+	doc_id_t	read_doc_id;
+	trx_t*		trx = query->trx;
+
+	trx->op_info = "fetching FTS document term count";
+
+	*total = 0;
+
+	info = pars_info_create();
+
+	pars_info_bind_function(info, "my_func", fts_query_sum, total);
+
+	/* Convert to "storage" byte order. */
+	fts_write_doc_id((byte*) &read_doc_id, doc_id);
+	fts_bind_doc_id(info, "doc_id", &read_doc_id);
+
+	query->fts_index_table.suffix = "DOC_ID";
+
+	graph = fts_parse_sql(
+		&query->fts_index_table,
+		info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT count\n"
+		" FROM %s\n"
+		" WHERE doc_id = :doc_id "
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	for(;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+
+			break;				/* Exit the loop. */
+		} else {
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, " InnoDB: Warning: lock wait "
+					"timeout reading FTS doc id table. "
+					"Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, " InnoDB: Error: %lu "
+					"while reading FTS doc id table.\n",
+					error);
+
+				break;			/* Exit the loop. */
+			}
+		}
+	}
+
+	fts_que_graph_free(graph);
+
+	return(error);
+}
+#endif
+
+/*****************************************************************//**
+Retrieve the document and match the phrase tokens.
+@return TRUE if matches else FALSE */
+static
+ulint
+fts_query_match_document(
+/*=====================*/
+	ib_vector_t*	tokens,		/*!< in: phrase tokens */
+	fts_get_doc_t*	get_doc,	/*!< in: table and prepared statements */
+	fts_match_t*	match,		/*!< in: doc id and positions */
+	ulint		distance,	/*!< in: proximity distance */
+	ibool*		found)		/*!< out: TRUE if phrase found */
+{
+	ulint		error;
+	fts_phrase_t	phrase;
+
+	memset(&phrase, 0x0, sizeof(phrase));
+
+	phrase.match = match;		/* Positions to match */
+	phrase.tokens = tokens;		/* Tokens to match */
+	phrase.distance = distance;
+	phrase.charset = get_doc->index_cache->charset;
+	phrase.zip_size = dict_table_zip_size(
+		get_doc->index_cache->index->table);
+	phrase.heap = mem_heap_create(512);
+
+	*found = phrase.found = FALSE;
+
+	error = fts_doc_fetch_by_doc_id(
+		get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
+		fts_query_fetch_document, &phrase);
+
+	if (error != DB_SUCCESS) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "InnoDB: Error: (%lu) matching document.\n",
+			error);
+	} else {
+		*found = phrase.found;
+	}
+
+	mem_heap_free(phrase.heap);
+
+	return(error);
+}
+
+/*****************************************************************//**
+Iterate over the matched document ids and search the for the
+actual phrase in the text.
+@return DB_SUCCESS if all OK */
+static
+ulint
+fts_query_search_phrase(
+/*====================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	ib_vector_t*		tokens)	/*!< in: tokens to search */
+{
+	ulint			i;
+	fts_get_doc_t		get_doc;
+	ulint			n_matched;
+	// FIXME: Debug code
+	ulint			searched = 0;
+	fts_cache_t*		cache = query->index->table->fts->cache;
+
+	n_matched = ib_vector_size(query->matched);
+
+	/* Setup the doc retrieval infrastructure. */
+	memset(&get_doc, 0x0, sizeof(get_doc));
+
+	rw_lock_x_lock(&cache->lock);
+
+	// FIXME: We shouldn't have to cast here.
+	get_doc.index_cache = (fts_index_cache_t*)
+	fts_find_index_cache(cache, query->index);
+
+	/* Must find the index cache */
+	ut_a(get_doc.index_cache != NULL);
+
+	rw_lock_x_unlock(&cache->lock);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " Start phrase search\n");
+#endif
+
+	/* Read the document from disk and do the actual
+	match, matching documents will be added to the current
+	doc id set. */
+	for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
+		fts_match_t*	match;
+		ibool		found = FALSE;
+
+		match = static_cast<fts_match_t*>(
+			ib_vector_get(query->matched, i));
+
+		/* Skip the document ids that were filtered out by
+		an earlier pass. */
+		if (match->doc_id != 0) {
+
+			// FIXME: Debug code
+			++searched;
+
+			query->error = fts_query_match_document(
+				tokens, &get_doc,
+				match, query->distance, &found);
+
+			if (query->error == DB_SUCCESS && found) {
+				ulint	z;
+
+				fts_query_process_doc_id(query,
+							 match->doc_id, 0);
+				for (z = 0; z < ib_vector_size(tokens); z++) {
+					fts_string_t*   token;
+					token = static_cast<fts_string_t*>(
+						ib_vector_get(tokens, z));
+					fts_query_add_word_to_document(
+						query, match->doc_id,
+						token->f_str);
+				}
+			}
+		}
+	}
+
+	/* Free the prepared statement. */
+	if (get_doc.get_document_graph) {
+		fts_que_graph_free(get_doc.get_document_graph);
+		get_doc.get_document_graph = NULL;
+	}
+
+	// FIXME: Debug code
+	ut_print_timestamp(stderr);
+	printf(" End: %lu, %lu\n", searched, ib_vector_size(query->matched));
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Text/Phrase search.
+@return count of doc ids added */
+static
+ulint
+fts_query_phrase_search(
+/*====================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	phrase)	/*!< in: token to search */
+{
+	char*			src;
+	char*			state;	/* strtok_r internal state */
+	ib_vector_t*		tokens;
+	mem_heap_t*		heap = mem_heap_create(sizeof(fts_string_t));
+	char*			utf8 = strdup((char*) phrase->f_str);
+	ib_alloc_t*		heap_alloc;
+	ulint			num_token;
+
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+
+	if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
+		query->flags = FTS_PROXIMITY;
+	} else {
+		query->flags = FTS_PHRASE;
+	}
+
+	/* Split the phrase into tokens. */
+	for (src = utf8; /* No op */; src = NULL) {
+		fts_string_t*	token = static_cast<fts_string_t*>(
+			ib_vector_push(tokens, NULL));
+
+		token->f_str = (byte*) strtok_r(
+			src, FTS_PHRASE_DELIMITER, &state);
+
+		if (token->f_str) {
+			/* Add the word to the RB tree so that we can
+			calculate it's frequencey within a document. */
+			fts_query_add_word_freq(query, token->f_str);
+
+			token->f_len = ut_strlen((char*) token->f_str);
+		} else {
+			ib_vector_pop(tokens);
+			break;
+		}
+	}
+
+	num_token = ib_vector_size(tokens);
+
+	/* Ignore empty strings. */
+	if (num_token > 0) {
+		fts_string_t*	token;
+		fts_fetch_t	fetch;
+		trx_t*		trx = query->trx;
+		fts_ast_oper_t	oper = query->oper;
+		que_t*		graph = NULL;
+		ulint		i;
+
+		/* Create the rb tree for storing the words read form disk. */
+		if (!query->inited) {
+
+			/* Since this is the first time, we need to convert
+			this intersection query into a union query. Otherwise
+			we will end up with an empty set. */
+			if (query->oper == FTS_EXIST) {
+				query->oper = FTS_NONE;
+			}
+
+			query->inited = TRUE;
+		}
+
+		/* Create the vector for storing matching document ids
+		and the positions of the first token of the phrase. */
+		if (!query->matched) {
+			ib_alloc_t*	heap_alloc;
+
+			heap_alloc = ib_heap_allocator_create(heap);
+
+			if (!(query->flags & FTS_PROXIMITY)
+			    && !(query->flags & FTS_PHRASE)) {
+				query->matched = ib_vector_create(
+					heap_alloc, sizeof(fts_match_t),
+					64);
+			} else {
+				ut_a(num_token < MAX_PROXIMITY_ITEM);
+				query->match_array =
+					(ib_vector_t**) mem_heap_alloc(
+						heap,
+						num_token *
+						sizeof(query->matched));
+
+				for (i = 0; i < num_token; i++) {
+					query->match_array[i] =
+					ib_vector_create(
+						heap_alloc, sizeof(fts_match_t),
+						64);
+				}
+
+				query->matched = query->match_array[0];
+			}
+		}
+
+		/* Setup the callback args for filtering and consolidating
+		the ilist. */
+		fetch.read_arg = query;
+		fetch.read_record = fts_query_index_fetch_nodes;
+
+		for (i = 0; i < num_token; i++) {
+			/* Search for the first word from the phrase. */
+			token = static_cast<fts_string_t*>(
+				ib_vector_get(tokens, i));
+
+			if (query->flags & FTS_PROXIMITY
+			    || query->flags & FTS_PHRASE) {
+				query->matched = query->match_array[i];
+			}
+
+			fts_index_fetch_nodes(
+				trx, &graph, &query->fts_index_table,
+				token, &fetch);
+
+			fts_que_graph_free(graph);
+			graph = NULL;
+
+			fts_query_cache(query, token);
+
+			if (!(query->flags & FTS_PHRASE)
+			    && !(query->flags & FTS_PROXIMITY)) {
+				break;
+			}
+
+			/* If any of the token can't be found,
+			no need to continue match */
+			if (ib_vector_is_empty(query->match_array[i])) {
+				goto func_exit;
+			}
+		}
+
+		if (num_token == 1
+		    && !ib_vector_is_empty(query->match_array[0])) {
+			fts_match_t*    match;
+			ulint		n_matched;
+
+			n_matched = ib_vector_size(query->match_array[0]);
+
+			for (i = 0; i < n_matched; i++) {
+				match = static_cast<fts_match_t*>(
+					ib_vector_get(
+						query->match_array[0], i));
+
+				fts_query_process_doc_id(
+					query, match->doc_id, 0);
+
+				fts_query_add_word_to_document(
+					query, match->doc_id, token->f_str);
+			}
+			query->oper = oper;
+			goto func_exit;
+		}
+
+		/* If we are doing proximity search, verify the distance
+		between all words, and check they are in specified distance. */
+		if (query->flags & FTS_PROXIMITY) {
+			fts_check_phrase_proximity(query, tokens);
+		} else {
+			ibool	matched;
+
+			/* Phrase Search case:
+			We filter out the doc ids that don't contain
+			all the tokens in the phrase. It's cheaper to
+			search the ilist than bringing the documents in
+			and then doing a search through the text. Isolated
+			testing shows this also helps in mitigating disruption
+			of the buffer cache. */
+			matched = fts_check_phrase_proximity(query, tokens);
+			query->matched = query->match_array[0];
+
+			/* Read the actual text in and search for the phrase. */
+			if (matched) {
+				query->error = DB_SUCCESS;
+				query->error = fts_query_search_phrase(
+					query, tokens);
+			}
+		}
+
+		/* Restore original operation. */
+		query->oper = oper;
+	}
+
+func_exit:
+	free(utf8);
+	mem_heap_free(heap);
+
+	/* Don't need it anymore. */
+	query->matched = NULL;
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Find the word and evaluate.
+@return DB_SUCCESS if all went well */
+static
+ulint
+fts_query_execute(
+/*==============*/
+	fts_query_t*		query,	/*!< in: query instance */
+	fts_string_t*		token)	/*!< in: token to search */
+{
+	switch (query->oper) {
+	case FTS_NONE:
+	case FTS_NEGATE:
+	case FTS_INCR_RATING:
+	case FTS_DECR_RATING:
+		query->error = fts_query_union(query, token);
+		break;
+
+	case FTS_EXIST:
+		query->error = fts_query_intersect(query, token);
+		break;
+
+	case FTS_IGNORE:
+		query->error = fts_query_difference(query, token);
+		break;
+
+	default:
+		ut_error;
+	}
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Create a wildcard string. It's the responsibility of the caller to
+free the byte* pointer. It's allocated using ut_malloc().
+@return ptr to allocated memory */
+static
+byte*
+fts_query_get_token(
+/*================*/
+	fts_ast_node_t*	node,		/*!< in: the current sub tree */
+	fts_string_t*	token)		/*!< in: token to create */
+{
+	ulint		str_len;
+	byte*		new_ptr = NULL;
+
+	str_len = ut_strlen((char*) node->term.ptr);
+
+	ut_a(node->type == FTS_AST_TERM);
+
+	token->f_len = str_len;
+	token->f_str = node->term.ptr;
+
+	if (node->term.wildcard) {
+
+		token->f_str = static_cast<byte*>(ut_malloc(str_len + 2));
+		token->f_len = str_len + 1;
+
+		/* Need to copy the NUL character too. */
+		memcpy(token->f_str, node->term.ptr, str_len + 1);
+
+		token->f_str[str_len] = '%';
+		token->f_str[token->f_len] = 0;
+
+		new_ptr = token->f_str;
+	}
+
+	return(new_ptr);
+}
+
+/*****************************************************************//**
+Visit every node of the AST. */
+static
+ulint
+fts_query_visitor(
+/*==============*/
+	fts_ast_oper_t	oper,		/*!< in: current operator */
+	fts_ast_node_t*	node,		/*!< in: The root of the current subtree*/
+	void*		arg)		/*!< in: callback arg*/
+{
+	byte*		ptr;
+	fts_string_t	token;
+	fts_query_t*	query = static_cast<fts_query_t*>(arg);
+
+	ut_a(node);
+
+	token.f_n_char = 0;
+
+	query->oper = oper;
+
+	query->cur_node = node;
+
+	switch (node->type) {
+	case FTS_AST_TEXT:
+		token.f_str = node->text.ptr;
+		token.f_len = ut_strlen((char*) token.f_str);
+
+		/* "first second third" is treated as first & second
+		& third. Create the rb tree that will hold the doc ids
+		of the intersection. */
+		if (!query->intersection && query->oper == FTS_EXIST) {
+
+			query->intersection = rbt_create(
+				sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+		}
+
+		/* Set the current proximity distance. */
+		query->distance = node->text.distance;
+
+		/* Force collection of doc ids and the positions. */
+		query->collect_positions = TRUE;
+
+		query->error = fts_query_phrase_search(query, &token);
+
+		query->collect_positions = FALSE;
+
+		/* Make the intesection (rb tree) the current doc id
+		set and free the old set. */
+		if (query->intersection) {
+			fts_query_free_doc_ids(query->doc_ids);
+			query->doc_ids = query->intersection;
+			query->intersection = NULL;
+		}
+
+		break;
+
+	case FTS_AST_TERM:
+
+		/* Add the word to our RB tree that will be used to
+		calculate this terms per document frequency. */
+		fts_query_add_word_freq(query, node->term.ptr);
+
+		ptr = fts_query_get_token(node, &token);
+		query->error = fts_query_execute(query, &token);
+
+		if (ptr) {
+			ut_free(ptr);
+		}
+		break;
+
+	default:
+		ut_error;
+	}
+
+	return(query->error);
+}
+
+/*****************************************************************//**
+Process (nested) sub-expression, create a new result set to store the
+sub-expression result by processing nodes under current sub-expression
+list. Merge the sub-expression result with that of parent expression list. */
+
+ulint
+fts_ast_visit_sub_exp(
+/*==================*/
+						/*!< out: DB_SUCCESS if all
+						went well */
+	fts_ast_node_t*		node,		/*!< in,out: current root node */
+	fts_ast_callback	visitor,	/*!< in: callback function */
+	void*			arg)		/*!< in,out: arg for callback */
+{
+	fts_ast_oper_t		cur_oper;
+	fts_query_t*		query = static_cast<fts_query_t*>(arg);
+	ib_rbt_t*		parent_doc_ids;
+	ib_rbt_t*		subexpr_doc_ids;
+	ulint			error = DB_SUCCESS;
+	ibool			inited = query->inited;
+
+	ut_a(node->type == FTS_AST_SUBEXP_LIST);
+
+	node = node->list.head;
+
+	if (!node || !node->next) {
+		return(error);
+	}
+
+	cur_oper = node->oper;
+
+	/* Save current result set */
+	parent_doc_ids = query->doc_ids;
+
+	/* Create new result set to store the sub-expression result. We
+	will merge this result set with the parent after processing. */
+	query->doc_ids = rbt_create(sizeof(fts_ranking_t),
+				    fts_ranking_doc_id_cmp);
+
+	/* Reset the query start flag because the sub-expression result
+	set is independent of any previous results. The state flag
+	reset is needed for not making an intersect operation on an empty
+	set in the first call to fts_query_intersect() for the first term. */
+	query->inited = FALSE;
+
+	/* Process nodes in current sub-expression and store its
+	result set in query->doc_ids we created above. */
+	error = fts_ast_visit(FTS_NONE, node->next, visitor, arg);
+
+	/* Reinstate parent node state and prepare for merge. */
+	query->inited = inited;
+	query->oper = cur_oper;
+	subexpr_doc_ids = query->doc_ids;
+
+	/* Restore current result set. */
+	query->doc_ids = parent_doc_ids;
+
+	if (query->oper == FTS_EXIST && !query->inited) {
+		ut_a(rbt_empty(query->doc_ids));
+		/* Since this is the first time we need to convert this
+		intersection query into a union query. Otherwise we
+		will end up with an empty set. */
+		query->oper = FTS_NONE;
+		query->inited = TRUE;
+	}
+
+	/* Merge the sub-expression result with the parent result set. */
+	if (error == DB_SUCCESS && !rbt_empty(subexpr_doc_ids)) {
+		fts_merge_doc_ids(query, subexpr_doc_ids);
+	}
+
+	if (query->oper == FTS_EXIST) {
+		query->multi_exist = TRUE;
+	}
+
+	/* Free current result set. Result already merged into parent. */
+	fts_query_free_doc_ids(subexpr_doc_ids);
+
+	return(error);
+}
+
+#if 0
+/*****************************************************************//***
+Check if the doc id exists in the ilist.
+@return TRUE if doc id found */
+static
+ulint
+fts_query_find_doc_id(
+/*==================*/
+	fts_select_t*	select,		/*!< in/out: contains the doc id to
+					find, we update the word freq if
+					document found */
+	void*		data,		/*!< in: doc id ilist */
+	ulint		len)		/*!< in: doc id ilist size */
+{
+	byte*		ptr = data;
+	doc_id_t	doc_id = 0;
+	ulint		decoded = 0;
+
+	/* Decode the ilist and search for selected doc_id. We also
+	calculate the frequency of the word in the document if found. */
+	while (decoded < len && !select->found) {
+		ulint		freq = 0;
+		ulint		min_pos = 0;
+		ulint		last_pos = 0;
+		ulint		pos = fts_decode_vlc(&ptr);
+
+		/* Add the delta. */
+		doc_id += pos;
+
+		while (*ptr) {
+			++freq;
+			last_pos += fts_decode_vlc(&ptr);
+
+			/* Only if min_pos is not set and the current
+			term exists in a position greater than the
+			min_pos of the previous term. */
+			if (min_pos == 0 && last_pos > select->min_pos) {
+				min_pos = last_pos;
+			}
+		}
+
+		/* Skip the end of word position marker. */
+		++ptr;
+
+		/* Bytes decoded so far. */
+		decoded = ptr - (byte*) data;
+
+		/* A word may exist in the document but we only consider a
+		match if it exists in a position that is greater than the
+		position of the previous term. */
+		if (doc_id == select->doc_id && min_pos > 0) {
+			fts_doc_freq_t*	doc_freq;
+
+			/* Add the doc id to the doc freq rb tree, if
+			the doc id doesn't exist it will be created. */
+			doc_freq = fts_query_add_doc_freq(
+				select->word_freq->doc_freqs, doc_id);
+
+			/* Avoid duplicating the frequency tally */
+			if (doc_freq->freq == 0) {
+				doc_freq->freq = freq;
+			}
+
+			select->found = TRUE;
+			select->min_pos = min_pos;
+		}
+	}
+
+	return(select->found);
+}
+#endif
+
+/*****************************************************************//**
+Read and filter nodes.
+@return fts_node_t instance */
+static
+void
+fts_query_filter_doc_ids(
+/*=====================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	const byte*	word,		/*!< in: the current word */
+	fts_word_freq_t*word_freq,	/*!< in/out: word frequency */
+	const fts_node_t*
+			node,		/*!< in: current FTS node */
+	void*		data,		/*!< in: doc id ilist */
+	ulint		len,		/*!< in: doc id ilist size */
+	ibool		calc_doc_count)	/*!< in: whether to remember doc count */
+{
+	byte*		ptr = static_cast<byte*>(data);
+	doc_id_t	doc_id = 0;
+	ulint		decoded = 0;
+	ib_rbt_t*	doc_freqs = word_freq->doc_freqs;
+
+	/* Decode the ilist and add the doc ids to the query doc_id set. */
+	while (decoded < len) {
+		ulint		freq = 0;
+		fts_doc_freq_t*	doc_freq;
+		fts_match_t*	match = NULL;
+		ulint		last_pos = 0;
+		ulint		pos = fts_decode_vlc(&ptr);
+
+		/* Some sanity checks. */
+		if (doc_id == 0) {
+			ut_a(pos == node->first_doc_id);
+		}
+
+		/* Add the delta. */
+		doc_id += pos;
+
+		if (calc_doc_count) {
+			word_freq->doc_count++;
+		}
+
+		/* We simply collect the matching instances here. */
+		if (query->collect_positions) {
+			ib_alloc_t*	heap_alloc;
+
+			/* Create a new fts_match_t instance. */
+			match = static_cast<fts_match_t*>(
+				ib_vector_push(query->matched, NULL));
+
+			match->start = 0;
+			match->doc_id = doc_id;
+			heap_alloc = ib_vector_allocator(query->matched);
+
+			/* Allocate from the same heap as the
+			parent container. */
+			match->positions = ib_vector_create(
+				heap_alloc, sizeof(ulint), 64);
+		}
+
+		/* Unpack the positions within the document. */
+		while (*ptr) {
+			last_pos += fts_decode_vlc(&ptr);
+
+			/* Collect the matching word positions, for phrase
+			matching later. */
+			if (query->collect_positions) {
+				ib_vector_push(match->positions, &last_pos);
+			}
+
+			++freq;
+		}
+
+		/* End of list marker. */
+		last_pos = (ulint) -1;
+
+		if (query->collect_positions) {
+			ut_a(match != NULL);
+			ib_vector_push(match->positions, &last_pos);
+		}
+
+		/* Add the doc id to the doc freq rb tree, if the doc id
+		doesn't exist it will be created. */
+		doc_freq = fts_query_add_doc_freq(doc_freqs, doc_id);
+
+		/* Avoid duplicating frequency tally. */
+		if (doc_freq->freq == 0) {
+			doc_freq->freq = freq;
+		}
+
+		/* Skip the end of word position marker. */
+		++ptr;
+
+		/* Bytes decoded so far */
+		decoded = ptr - (byte*) data;
+
+		/* We simply collect the matching documents and the
+		positions here and match later. */
+		if (!query->collect_positions) {
+			fts_query_process_doc_id(query, doc_id, 0);
+		}
+
+		/* Add the word to the document's matched RB tree. */
+		fts_query_add_word_to_document(query, doc_id, word);
+	}
+
+	/* Some sanity checks. */
+	ut_a(doc_id == node->last_doc_id);
+}
+
+/*****************************************************************//**
+Read the FTS INDEX row. */
+static
+void
+fts_query_read_node(
+/*================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	word,	/*!< in: current word */
+	que_node_t*		exp)	/*!< in: query graph node */
+{
+	int			i;
+	int			ret;
+	fts_node_t		node;
+	ib_rbt_bound_t		parent;
+	fts_word_freq_t*	word_freq;
+	ibool			skip = FALSE;
+	byte			term[FTS_MAX_WORD_LEN + 1];
+
+	ut_a(query->cur_node->type == FTS_AST_TERM ||
+	     query->cur_node->type == FTS_AST_TEXT);
+
+	/* Need to consider the wildcard search case, the word frequency
+	is created on the search string not the actual word. So we need
+	to assign the frequency on search string behalf. */
+	if (query->cur_node->type == FTS_AST_TERM
+	    && query->cur_node->term.wildcard) {
+
+		/* These cast are safe since we only care about the
+		terminating NUL character as an end of string marker. */
+		ut_strcpy((char*) term, (char*) query->cur_node->term.ptr);
+	} else {
+		/* Need to copy the NUL character too. */
+		memcpy(term, word->f_str, word->f_len);
+		term[word->f_len] = 0;
+	}
+
+	/* Lookup the word in our rb tree, it must exist. */
+	ret = rbt_search(query->word_freqs, &parent, term);
+
+	ut_a(ret == 0);
+
+	word_freq = rbt_value(fts_word_freq_t, parent.last);
+
+	/* Start from 1 since the first column has been read by the caller.
+	Also, we rely on the order of the columns projected, to filter
+	out ilists that are out of range and we always want to read
+	the doc_count irrespective of the suitablility of the row. */
+
+	for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
+
+		dfield_t*	dfield = que_node_get_val(exp);
+		byte*		data = static_cast<byte*>(
+			dfield_get_data(dfield));
+		ulint		len = dfield_get_len(dfield);
+
+		ut_a(len != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT. */
+
+		switch (i) {
+		case 1: /* DOC_COUNT */
+			word_freq->doc_count += mach_read_from_4(data);
+			break;
+
+		case 2: /* FIRST_DOC_ID */
+			node.first_doc_id = fts_read_doc_id(data);
+
+			/* Skip nodes whose doc ids are out range. */
+			if (query->oper == FTS_EXIST
+			    && query->upper_doc_id > 0
+			    && node.first_doc_id > query->upper_doc_id) {
+				skip = TRUE;
+			}
+			break;
+
+		case 3: /* LAST_DOC_ID */
+			node.last_doc_id = fts_read_doc_id(data);
+
+			/* Skip nodes whose doc ids are out range. */
+			if (query->oper == FTS_EXIST
+			    && query->lower_doc_id > 0
+			    && node.last_doc_id < query->lower_doc_id) {
+				skip = TRUE;
+			}
+			break;
+
+		case 4: /* ILIST */
+
+			fts_query_filter_doc_ids(
+				query, word_freq->word, word_freq,
+				&node, data, len, FALSE);
+
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	if (!skip) {
+		/* Make sure all columns were read. */
+
+		ut_a(i == 5);
+	}
+}
+
+/*****************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record.
+@return always returns TRUE */
+static
+ibool
+fts_query_index_fetch_nodes(
+/*========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to fts_fetch_t */
+{
+	fts_string_t	key;
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
+	fts_query_t*	query = static_cast<fts_query_t*>(fetch->read_arg);
+	que_node_t*	exp = sel_node->select_list;
+	dfield_t*	dfield = que_node_get_val(exp);
+	void*		data = dfield_get_data(dfield);
+	ulint		dfield_len = dfield_get_len(dfield);
+
+	key.f_str = static_cast<byte*>(data);
+	key.f_len = dfield_len;
+
+	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
+
+	fts_query_read_node(query, &key, que_node_get_next(exp));
+
+	return(TRUE);
+}
+
+/*****************************************************************//**
+Calculate the inverse document frequency (IDF) for all the terms. */
+static
+void
+fts_query_calculate_idf(
+/*====================*/
+	fts_query_t*	query)	/*!< in: Query state */
+{
+	const ib_rbt_node_t* node;
+	double		total_docs = query->total_docs;
+
+	/* We need to free any instances of fts_doc_freq_t that we
+	may have allocated. */
+	for (node = rbt_first(query->word_freqs);
+	     node;
+	     node = rbt_next(query->word_freqs, node)) {
+
+		fts_word_freq_t*	word_freq;
+
+		word_freq = rbt_value(fts_word_freq_t, node);
+
+		if (word_freq->doc_count > 0) {
+			if (total_docs == (double) word_freq->doc_count) {
+				/* QP assume ranking > 0 if we find
+				a match. Since Log10(1) = 0, we cannot
+				make IDF a zero value if do find a
+				word in all documents. So let's make
+				it an arbitrary very small number */
+				word_freq->idf = log10(1.0001);
+			} else {
+				word_freq->idf = log10(
+					total_docs
+					/ (double) word_freq->doc_count);
+			}
+		}
+
+		fprintf(stderr,"'%s' -> %lu/%lu %6.5lf\n",
+		       word_freq->word,
+		       query->total_docs, word_freq->doc_count,
+		       word_freq->idf);
+	}
+}
+
+/*****************************************************************//**
+Calculate the ranking of the document. */
+static
+void
+fts_query_calculate_ranking(
+/*========================*/
+	const fts_query_t*	query,		/*!< in: query state */
+	fts_ranking_t*		ranking)	/*!< in: Document to rank */
+{
+	const ib_rbt_node_t*	node;
+
+	/* At this stage, ranking->rank should not exceed the 1.0
+	bound */
+	ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
+
+	for (node = rbt_first(ranking->words);
+	     node;
+	     node = rbt_first(ranking->words)) {
+
+		int			ret;
+		const byte*		word;
+		const byte**		wordp;
+		ib_rbt_bound_t		parent;
+		double			weight;
+		fts_doc_freq_t*		doc_freq;
+		fts_word_freq_t*	word_freq;
+
+		wordp = rbt_value(const byte*, node);
+		word = *wordp;
+
+		ret = rbt_search(query->word_freqs, &parent, word);
+
+		/* It must exist. */
+		ut_a(ret == 0);
+
+		word_freq = rbt_value(fts_word_freq_t, parent.last);
+
+		ret = rbt_search(
+			word_freq->doc_freqs, &parent, &ranking->doc_id);
+
+		/* It must exist. */
+		ut_a(ret == 0);
+
+		doc_freq = rbt_value(fts_doc_freq_t, parent.last);
+
+		weight = (double) doc_freq->freq * word_freq->idf;
+
+		ranking->rank += (fts_rank_t) (weight * word_freq->idf);
+
+		ut_free(rbt_remove_node(ranking->words, node));
+	}
+}
+
+/*****************************************************************//**
+Add ranking to the result set. */
+static
+void
+fts_query_add_ranking(
+/*==================*/
+	ib_rbt_t*		ranking_tree,	/*!< in: ranking tree */
+	const fts_ranking_t*	new_ranking)	/*!< in: ranking of a document */
+{
+	ib_rbt_bound_t		parent;
+
+	/* Lookup the ranking in our rb tree and add if it doesn't exist. */
+	if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
+		fts_ranking_t*	ranking;
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+
+		ranking->rank += new_ranking->rank;
+
+		ut_a(ranking->words == NULL);
+	} else {
+		rbt_add_node(ranking_tree, &parent, new_ranking);
+	}
+}
+
+/*****************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value, 0 if no ranking value
+present. */
+float
+fts_retrieve_ranking(
+/*=================*/
+	fts_result_t*	result,	/*!< in: FTS result structure */
+	doc_id_t	doc_id)	/*!< in: doc_id of the item to retrieve */
+{
+	ib_rbt_bound_t		parent;
+	fts_ranking_t		new_ranking;
+
+	if (!result || !result->rankings_by_id) {
+		return(0);
+	}
+
+	new_ranking.doc_id = doc_id;
+
+	/* Lookup the ranking in our rb tree */
+	if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
+		fts_ranking_t*  ranking;
+
+		ranking = rbt_value(fts_ranking_t, parent.last);
+
+		return (ranking->rank);
+	}
+
+	return(0);
+}
+
+/*****************************************************************//**
+Create the result and copy the data to it. */
+static
+fts_result_t*
+fts_query_prepare_result(
+/*=====================*/
+	const fts_query_t*	query,	/*!< in: Query state */
+	fts_result_t*		result)	/*!< in: result this can contain
+					data from a previous search on
+					another FTS index */
+{
+	const ib_rbt_node_t*	node;
+
+	ut_a(rbt_size(query->doc_ids) > 0);
+
+	if (result == NULL) {
+		result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
+
+		memset(result, 0x0, sizeof(*result));
+
+		result->rankings_by_id = rbt_create(
+			sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+	}
+
+	for (node = rbt_first(query->doc_ids);
+	     node;
+	     node = rbt_next(query->doc_ids, node)) {
+
+		fts_ranking_t*	ranking;
+
+		ranking = rbt_value(fts_ranking_t, node);
+		fts_query_calculate_ranking(query, ranking);
+
+		// FIXME: I think we may requre this information to improve the
+		// ranking of doc ids which have more word matches from
+		// different FTS indexes.
+
+		/* We don't need these anymore free the resources. */
+		ut_a(rbt_empty(ranking->words));
+		rbt_free(ranking->words);
+		ranking->words = NULL;
+
+		fts_query_add_ranking(result->rankings_by_id, ranking);
+	}
+
+	return(result);
+}
+
+/*****************************************************************//**
+Get the result of the query. Calculate the similarity coefficient. */
+static
+fts_result_t*
+fts_query_get_result(
+/*=================*/
+	const fts_query_t*	query,	/*!< in: query instance */
+	fts_result_t*		result)	/*!< in: result */
+{
+	if (rbt_size(query->doc_ids) > 0) {
+		/* Copy the doc ids to the result. */
+		result = fts_query_prepare_result(query, result);
+	} else {
+		/* Create an empty result instance. */
+		result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
+		memset(result, 0, sizeof(*result));
+	}
+
+	return(result);
+}
+
+/*****************************************************************//**
+FTS Query free resources and reset. */
+static
+void
+fts_query_free(
+/*===========*/
+	fts_query_t*	query)		/*!< in: query instance to free*/
+{
+
+	if (query->read_nodes_graph) {
+		fts_que_graph_free(query->read_nodes_graph);
+	}
+
+	if (query->root) {
+		fts_ast_free_node(query->root);
+	}
+
+	if (query->deleted) {
+		fts_doc_ids_free(query->deleted);
+	}
+
+	if (query->doc_ids) {
+		fts_query_free_doc_ids(query->doc_ids);
+	}
+
+	if (query->word_freqs) {
+		const ib_rbt_node_t*	node;
+
+		/* We need to free any instances of fts_doc_freq_t that we
+		may have allocated. */
+		for (node = rbt_first(query->word_freqs);
+		     node;
+		     node = rbt_next(query->word_freqs, node)) {
+
+			fts_word_freq_t*	word_freq;
+
+			word_freq = rbt_value(fts_word_freq_t, node);
+
+			/* We need to cast away the const. */
+			rbt_free(word_freq->doc_freqs);
+		}
+
+		rbt_free(query->word_freqs);
+	}
+
+	ut_a(!query->intersection);
+
+	if (query->heap) {
+		mem_heap_free(query->heap);
+	}
+
+	memset(query, 0, sizeof(*query));
+}
+
+/*****************************************************************//**
+Parse the query using flex/bison. */
+static
+fts_ast_node_t*
+fts_query_parse(
+/*============*/
+	fts_query_t*	query,		/*!< in: query instance */
+	byte*		query_str,	/*!< in: query string */
+	ulint		query_len)	/*!< in: query string length */
+{
+	int		error;
+	fts_ast_state_t state;
+	ibool		mode = query->boolean_mode;
+
+	memset(&state, 0x0, sizeof(state));
+
+	/* Setup the scanner to use, this depends on the mode flag. */
+	state.lexer = fts_lexer_create(mode, query_str, query_len);
+	error = fts_parse(&state);
+	fts_lexer_free(state.lexer);
+	state.lexer = NULL;
+
+	/* Error during parsing ? */
+	if (error) {
+		/* Free the nodes that were allocated during parsing. */
+		fts_ast_state_free(&state);
+	} else {
+		query->root = state.root;
+	}
+
+	return(state.root);
+}
+
+
+/*******************************************************************//**
+FTS Query entry point.
+@return DB_SUCCESS if successful otherwise error code */
+UNIV_INTERN
+ulint
+fts_query(
+/*======*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: The FTS index to search */
+	uint		flags,		/*!< in: FTS search mode */
+	const byte*	query_str,	/*!< in: FTS query */
+	ulint		query_len,	/*!< in: FTS query string len
+					in bytes */
+	fts_result_t**	result)		/*!< in/out: result doc ids */
+{
+	fts_query_t	query;
+	ulint		error;
+	byte*		lc_query_str;
+	ulint		lc_query_str_len;
+	ulint		result_len;
+	ibool		boolean_mode;
+	trx_t*		query_trx;
+	CHARSET_INFO*	charset;
+	ulint		start_time_ms;
+
+	boolean_mode = flags & FTS_BOOL;
+
+	*result = NULL;
+	memset(&query, 0x0, sizeof(query));
+	query_trx = trx_allocate_for_background();
+	query_trx->op_info = "FTS query";
+
+	start_time_ms = ut_time_ms();
+
+	query.trx = query_trx;
+	query.index = index;
+	query.inited = FALSE;
+	query.boolean_mode = boolean_mode;
+	query.deleted = fts_doc_ids_create();
+	query.cur_node = NULL;
+
+	query.fts_common_table.type = FTS_COMMON_TABLE;
+	query.fts_common_table.table_id = index->table->id;
+	query.fts_common_table.parent = index->table->name;
+
+	charset = fts_index_get_charset(index);
+
+	query.fts_index_table.type = FTS_INDEX_TABLE;
+	query.fts_index_table.index_id = index->id;
+	query.fts_index_table.table_id = index->table->id;
+	query.fts_index_table.parent = index->table->name;
+	query.fts_index_table.charset = charset;
+
+
+	/* Setup the RB tree that will be used to collect per term
+	statistics. */
+	query.word_freqs = rbt_create_arg_cmp(
+		sizeof(fts_word_freq_t), innobase_fts_string_cmp, charset);
+
+	query.total_docs = fts_get_total_document_count(index->table);
+
+	error = fts_get_total_word_count(trx, query.index, &query.total_words);
+
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+#ifdef	FTS_INTERNAL_DIAG_PRINT
+	fprintf(stderr, "Total docs: %lu Total words: %lu\n",
+		query.total_docs, query.total_words);
+#endif
+
+	query.fts_common_table.suffix = "DELETED";
+
+	/* Read the deleted doc_ids, we need these for filtering. */
+	error = fts_table_fetch_doc_ids(
+		NULL, &query.fts_common_table, query.deleted);
+
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	query.fts_common_table.suffix = "DELETED_CACHE";
+
+	error = fts_table_fetch_doc_ids(
+		NULL, &query.fts_common_table, query.deleted);
+
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	/* Get the deleted doc ids that are in the cache. */
+	fts_cache_append_deleted_doc_ids(
+		index->table->fts->cache, query.deleted->doc_ids);
+
+	/* Sort the vector so that we can do a binary search over the ids. */
+	ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
+
+	/* Convert the query string to lower case before parsing. We own
+	the ut_malloc'ed result and so remember to free it before return. */
+
+	lc_query_str_len = query_len * charset->casedn_multiply + 1;
+	lc_query_str = static_cast<byte*>(ut_malloc(lc_query_str_len));
+
+	result_len = innobase_fts_casedn_str(
+		charset, (char*) query_str, query_len,
+		(char*) lc_query_str, lc_query_str_len);
+
+	ut_ad(result_len < lc_query_str_len);
+
+	lc_query_str[result_len] = 0;
+
+	query.heap = mem_heap_create(128);
+
+	/* Create the rb tree for the doc id (current) set. */
+	query.doc_ids = rbt_create(
+		sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+	/* Parse the input query string. */
+	if (fts_query_parse(&query, lc_query_str, query_len)) {
+		fts_ast_node_t*	ast = query.root;
+
+		/* Traverse the Abstract Syntax Tree (AST) and execute
+		the query. */
+		query.error = fts_ast_visit(
+			FTS_NONE, ast, fts_query_visitor, &query);
+
+		/* If query expansion is requested, extend the search
+		with first search pass result */
+		if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
+			 query.error = fts_expand_query(index, &query);
+		}
+
+		/* Calculate the inverse document frequency of the terms. */
+		fts_query_calculate_idf(&query);
+
+		/* Copy the result from the query state, so that we can
+		return it to the caller. */
+		if (query.error == DB_SUCCESS) {
+			*result = fts_query_get_result(&query, *result);
+		}
+
+		error = query.error;
+	} else {
+		/* still return an empty result set */
+		*result = static_cast<fts_result_t*>(
+			ut_malloc(sizeof(**result)));
+		memset(*result, 0, sizeof(**result));
+	}
+
+	ut_free(lc_query_str);
+
+	if (fts_enable_diag_print && (*result)) {
+		ulint	diff_time = ut_time_ms() - start_time_ms;
+		fprintf(stderr, "FTS Search Processing time: %ld secs:"
+				" %ld millisec: row(s) %d \n",
+			diff_time / 1000, diff_time % 1000,
+			(*result)->rankings_by_id
+				? (int) rbt_size((*result)->rankings_by_id)
+				: -1);
+	}
+
+func_exit:
+	fts_query_free(&query);
+
+	trx_free_for_background(query_trx);
+
+	return(error);
+}
+
+/*****************************************************************//**
+FTS Query free result, returned by fts_query(). */
+
+void
+fts_query_free_result(
+/*==================*/
+	fts_result_t*	result)		/*!< in: result instance to free.*/
+{
+	if (result) {
+		if (result->rankings_by_id != NULL) {
+			rbt_free(result->rankings_by_id);
+			result->rankings_by_id = NULL;
+		}
+		if (result->rankings_by_rank != NULL) {
+			rbt_free(result->rankings_by_rank);
+			result->rankings_by_rank = NULL;
+		}
+
+		ut_free(result);
+		result = NULL;
+	}
+}
+
+/*****************************************************************//**
+FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
+
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+	fts_result_t*	result)		/*!< out: result instance to sort.*/
+{
+	const ib_rbt_node_t*	node;
+	ib_rbt_t*		ranked;
+
+	ut_a(result->rankings_by_id != NULL);
+	if (result->rankings_by_rank) {
+		rbt_free(result->rankings_by_rank);
+	}
+
+	ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
+
+	/* We need to free any instances of fts_doc_freq_t that we
+	may have allocated. */
+	for (node = rbt_first(result->rankings_by_id);
+	     node;
+	     node = rbt_next(result->rankings_by_id, node)) {
+
+		fts_ranking_t*	ranking;
+
+		ranking = rbt_value(fts_ranking_t, node);
+
+		ut_a(ranking->words == NULL);
+
+		rbt_insert(ranked, ranking, ranking);
+	}
+
+	/* Reset the current node too. */
+	result->current = NULL;
+	result->rankings_by_rank = ranked;
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+A debug function to print result doc_id set. */
+static
+void
+fts_print_doc_id(
+/*=============*/
+	ib_rbt_t*	doc_ids)	/*!< in : tree that stores doc_ids.*/
+{
+	const ib_rbt_node_t*	node;
+	const ib_rbt_node_t*	node_word;
+
+	/* Iterate each member of the doc_id set */
+	for (node = rbt_first(doc_ids);
+	     node;
+	     node = rbt_next(doc_ids, node)) {
+		fts_ranking_t*	ranking;
+		ranking = rbt_value(fts_ranking_t, node);
+
+		fprintf(stderr, "doc_ids info, doc_id: %ld \n",
+			(ulint) ranking->doc_id);
+
+		for (node_word = rbt_first(ranking->words);
+		     node_word;
+		     node_word = rbt_next(ranking->words, node_word)) {
+
+			const byte** value;
+
+			value = rbt_value(const byte*, node_word);
+
+			fprintf(stderr, "doc_ids info, value: %s \n", *value);
+		}
+	}
+}
+#endif
+
+/*************************************************************//**
+This function implements a simple "blind" query expansion search:
+words in documents found in the first search pass will be used as
+search arguments to search the document again, thus "expand"
+the search result set.
+@return DB_SUCCESS if success, otherwise the error code */
+static
+ulint
+fts_expand_query(
+/*=============*/
+	dict_index_t*	index,		/*!< in: FTS index to search */
+	fts_query_t*	query)		/*!< in: FTS query instance */
+{
+	const ib_rbt_node_t*	node;
+	const ib_rbt_node_t*	token_node;
+	fts_doc_t		result_doc;
+	ulint			error = DB_SUCCESS;
+	const fts_index_cache_t*index_cache;
+
+	/* If no doc is found in first search pass, return */
+	if (!rbt_size(query->doc_ids)) {
+		return(error);
+	}
+
+	/* Init "result_doc", to hold words from the first search pass */
+	fts_doc_init(&result_doc);
+
+	rw_lock_x_lock(&index->table->fts->cache->lock);
+	index_cache = fts_find_index_cache(index->table->fts->cache, index);
+	rw_lock_x_unlock(&index->table->fts->cache->lock);
+
+	ut_a(index_cache);
+
+	result_doc.tokens = rbt_create_arg_cmp(
+		sizeof(fts_token_t), innobase_fts_text_cmp,
+		index_cache->charset);
+
+	result_doc.charset = index_cache->charset;
+
+#ifdef UNIV_DEBUG
+	fts_print_doc_id(query->doc_ids);
+#endif
+
+	for (node = rbt_first(query->doc_ids);
+	     node;
+	     node = rbt_next(query->doc_ids, node)) {
+
+		fts_ranking_t*	ranking;
+		const ib_rbt_node_t*	node_word;
+
+		ranking = rbt_value(fts_ranking_t, node);
+
+		/* Fetch the documents with the doc_id from the
+		result of first seach pass. Since we do not
+		store document-to-word mapping, we need to
+		fetch the original document and parse them.
+		Future optimization could be done here if we
+		support some forms of document-to-word mapping */
+		fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
+					FTS_FETCH_DOC_BY_ID_EQUAL,
+					fts_query_expansion_fetch_doc,
+					&result_doc);
+
+		/* Remove words that have already been searched in the
+		first pass */
+		for (node_word = rbt_first(ranking->words);
+		     node_word;
+		     node_word = rbt_next(ranking->words, node_word)) {
+			fts_string_t	str;
+			ibool		ret;
+			const byte**	strp;
+
+			strp = rbt_value(const byte*, node_word);
+			/* FIXME: We are discarding a const qualifier here. */
+			str.f_str = (byte*) *strp;
+			str.f_len = ut_strlen((const char*) str.f_str);
+			ret = rbt_delete(result_doc.tokens, &str);
+
+			/* The word must exist in the doc we found */
+			if (!ret) {
+				fprintf(stderr, " InnoDB: Error: Did not "
+					"find word %s in doc %ld for query "
+					"expansion search.\n", str.f_str,
+					(ulint) ranking->doc_id);
+			}
+		}
+	}
+
+	/* Search the table the second time with expanded search list */
+	for (token_node = rbt_first(result_doc.tokens);
+	     token_node;
+	     token_node = rbt_next(result_doc.tokens, token_node)) {
+		fts_token_t*	mytoken;
+		mytoken = rbt_value(fts_token_t, token_node);
+
+		fts_query_add_word_freq(query, mytoken->text.f_str);
+		error = fts_query_union(query, &mytoken->text);
+
+		if (error != DB_SUCCESS) {
+			break;
+		}
+	}
+
+	fts_doc_free(&result_doc);
+
+	return(error);
+}
+/*************************************************************//**
+This function finds documents that contain all words in a
+phrase or proximity search. And if proximity search, verify
+the words are close to each other enough, as in specified distance.
+This function is called for phrase and proximity search.
+@return TRUE if documents are found, FALSE if otherwise */
+static
+ibool
+fts_check_phrase_proximity(
+/*=======================*/
+	fts_query_t*	query,		/*!< in:  query instance */
+	ib_vector_t*	tokens)		/*!< in: Tokens contain words */
+{
+	ulint		n_matched;
+	ulint		i;
+	ibool		matched = FALSE;
+	ulint		num_token = ib_vector_size(tokens);
+	fts_match_t*	match[MAX_PROXIMITY_ITEM];
+	ibool		end_list = FALSE;
+
+	/* Number of matched documents for the first token */
+	n_matched = ib_vector_size(query->match_array[0]);
+
+	/* We have a set of match list for each word, we shall
+	walk through the list and find common documents that
+	contain all the matching words. */
+	for (i = 0; i < n_matched; i++) {
+		ulint	j;
+		ulint	k = 0;
+
+		match[0] = static_cast<fts_match_t*>(
+			ib_vector_get(query->match_array[0], i));
+
+		/* For remaining match list for the token(word), we
+		try to see if there is a document with the same
+		doc id */
+		for (j = 1; j < num_token; j++) {
+			match[j] = static_cast<fts_match_t*>(
+				ib_vector_get(query->match_array[j], k));
+
+			while (match[j]->doc_id < match[0]->doc_id
+			       && k < ib_vector_size(query->match_array[j])) {
+				 match[j] = static_cast<fts_match_t*>(
+					ib_vector_get(
+						query->match_array[j], k));
+				k++;
+			}
+
+			if (match[j]->doc_id > match[0]->doc_id) {
+				/* no match */
+				if (query->flags & FTS_PHRASE) {
+					match[0]->doc_id = 0;
+				}
+				break;
+			}
+
+			if (k == ib_vector_size(query->match_array[j])) {
+				end_list = TRUE;
+
+				if (match[j]->doc_id != match[0]->doc_id) {
+					/* no match */
+					if (query->flags & FTS_PHRASE) {
+						ulint	s;
+
+						match[0]->doc_id = 0;
+
+						for (s = i + 1; s < n_matched;
+						     s++) {
+							match[0] = static_cast<
+							fts_match_t*>(
+							ib_vector_get(
+							query->match_array[0],
+							s));
+							match[0]->doc_id = 0;
+						}
+					}
+
+					goto func_exit;
+				}
+			}
+
+			/* FIXME: A better solution will be a counter array
+			remember each run's last position. So we don't
+			reset it here very time */
+			k = 0;
+		}
+
+		if (j != num_token) {
+			continue;
+		}
+
+		/* For this matching doc, we need to further
+		verify whether the words in the doc are close
+		to each other, and with in distance specified
+		in the proximity search */
+		if (query->flags & FTS_PHRASE) {
+			matched = TRUE;
+		} else if (fts_proximity_check_position(
+			match, num_token, query->distance)) {
+			ulint	z;
+			/* If so, mark we find a matching doc */
+			fts_query_process_doc_id(query, match[0]->doc_id, 0);
+
+			matched = TRUE;
+			for (z = 0; z < num_token; z++) {
+				fts_string_t*	token;
+				token = static_cast<fts_string_t*>(
+					ib_vector_get(tokens, z));
+				fts_query_add_word_to_document(
+					query, match[0]->doc_id,
+					token->f_str);
+			}
+		}
+
+		if (end_list) {
+			break;
+		}
+	}
+
+func_exit:
+	return(matched);
+}
+
+/*************************************************************//**
+This function check the words in result document are close to each
+other (within proximity range). This is used for proximity search.
+@return TRUE if words are close to each other, FALSE if otherwise */
+static
+ulint
+fts_proximity_check_position(
+/*=========================*/
+	fts_match_t**	match,		/*!< in: query instance */
+	ulint		num_match,	/*!< in: number of matching
+					items */
+	ulint		distance)	/*!< in: distance value
+					for proximity search */
+{
+	ulint	i;
+	ulint	idx[MAX_PROXIMITY_ITEM];
+	ulint	num_pos[MAX_PROXIMITY_ITEM];
+	ulint	min_idx;
+
+	ut_a(num_match < MAX_PROXIMITY_ITEM);
+
+	/* Each word could appear multiple times in a doc. So
+	we need to walk through each word's position list, and find
+	closest distance between different words to see if
+	they are in the proximity distance. */
+
+	/* Assume each word's position list is sorted, we
+	will just do a walk through to all words' lists
+	similar to a the merge phase of a merge sort */
+	for (i = 0; i < num_match; i++) {
+		/* idx is the current position we are checking
+		for a particular word */
+		idx[i] = 0;
+
+		/* Number of positions for this word */
+		num_pos[i] = ib_vector_size(match[i]->positions);
+	}
+
+	/* Start with the first word */
+	min_idx = 0;
+
+	while (idx[min_idx] < num_pos[min_idx]) {
+		ulint	position[MAX_PROXIMITY_ITEM];
+		ulint	min_pos = ULINT_MAX;
+		ulint	max_pos = 0;
+
+		/* Check positions in each word position list, and
+		record the max/min position */
+		for (i = 0; i < num_match; i++) {
+			position[i] = *(ulint*) ib_vector_get_const(
+				match[i]->positions, idx[i]);
+
+			if (position[i] == ULINT_UNDEFINED) {
+				break;
+			}
+
+			if (position[i] < min_pos) {
+				min_pos = position[i];
+				min_idx = i;
+			}
+
+			if (position[i] > max_pos) {
+				max_pos = position[i];
+			}
+		}
+
+		/* If max and min position are within range, we
+		find a good match */
+		if (max_pos - min_pos <= distance
+		    && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
+			return(TRUE);
+		} else {
+			/* Otherwise, move to the next position is the
+			list for the word with the smallest position */
+			idx[min_idx]++;
+		}
+	}
+
+	/* Failed to find all words within the range for the doc */
+	return(FALSE);
+}
diff --git a/storage/innobase/fts/fts0sql.cc b/storage/innobase/fts/fts0sql.cc
new file mode 100644
index 00000000000..8e60a5f1132
--- /dev/null
+++ b/storage/innobase/fts/fts0sql.cc
@@ -0,0 +1,355 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0sql.cc
+Full Text Search functionality.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#include "que0que.h"
+#include "trx0roll.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "fts0types.h"
+#include "fts0priv.h"
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+/** SQL statements for creating the ancillary FTS tables. %s must be replaced
+with the indexed table's id. */
+
+/** Preamble to all SQL statements. */
+static const char* fts_sql_begin=
+	"PROCEDURE P() IS\n";
+
+/** Postamble to non-committing SQL statements. */
+static const char* fts_sql_end=
+	"\n"
+	"END;\n";
+
+/******************************************************************//**
+Get the table id.
+@return number of bytes written */
+UNIV_INTERN
+int
+fts_get_table_id(
+/*=============*/
+	const fts_table_t*
+			fts_table,	/*!< in: FTS Auxiliary table */
+	char*		table_id)	/*!< out: table id, must be at least
+					FTS_AUX_MIN_TABLE_ID_LENGTH bytes
+					long */
+{
+	int		len;
+
+	switch (fts_table->type) {
+	case FTS_COMMON_TABLE:
+		len = fts_write_object_id(fts_table->table_id, table_id);
+		break;
+
+	case FTS_INDEX_TABLE:
+
+		len = fts_write_object_id(fts_table->table_id, table_id);
+
+		table_id[len] = '_';
+		++len;
+		table_id += len;
+
+		len += fts_write_object_id(fts_table->index_id, table_id);
+		break;
+
+	default:
+		ut_error;
+	}
+
+	ut_a(len >= 16);
+	ut_a(len < FTS_AUX_MIN_TABLE_ID_LENGTH);
+
+	return(len);
+}
+
+/******************************************************************//**
+Construct the prefix name of an FTS table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name_prefix(
+/*======================*/
+	const fts_table_t*
+			fts_table)	/*!< in: Auxiliary table type */
+{
+	int		len;
+	const char*	slash;
+	char*		prefix_name;
+	int		dbname_len = 0;
+	int		prefix_name_len;
+	char		table_id[FTS_AUX_MIN_TABLE_ID_LENGTH];
+
+	slash = static_cast<const char*>(
+		memchr(fts_table->parent, '/', strlen(fts_table->parent)));
+
+	if (slash) {
+		/* Print up to and including the separator. */
+		dbname_len = (slash - fts_table->parent) + 1;
+	}
+
+	len = fts_get_table_id(fts_table, table_id);
+
+	prefix_name_len = dbname_len + 4 + len + 1;
+
+	prefix_name = static_cast<char*>(mem_alloc(prefix_name_len));
+
+	len = sprintf(prefix_name, "%.*sFTS_%s",
+		      dbname_len, fts_table->parent, table_id);
+
+	ut_a(len > 0);
+	ut_a(len == prefix_name_len - 1);
+
+	return(prefix_name);
+}
+
+/******************************************************************//**
+Construct the name of an ancillary FTS table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name(
+/*===============*/
+	const fts_table_t*	fts_table)
+					/*!< in: Auxiliary table type */
+{
+	int		len;
+	char*		name;
+	int		name_len;
+	char*		prefix_name;
+
+	prefix_name = fts_get_table_name_prefix(fts_table);
+
+	name_len = strlen(prefix_name) + 1 + strlen(fts_table->suffix) + 1;
+
+	name = static_cast<char*>(mem_alloc(name_len));
+
+	len = sprintf(name, "%s_%s", prefix_name, fts_table->suffix);
+
+	ut_a(len > 0);
+	ut_a(len == name_len - 1);
+
+	mem_free(prefix_name);
+
+	return(name);
+}
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id.
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql(
+/*==========*/
+	fts_table_t*	fts_table,	/*!< in: FTS auxiliarry table info */
+	pars_info_t*	info,		/*!< in: info struct, or NULL */
+	const char*	sql)		/*!< in: SQL string to evaluate */
+{
+	char*		str;
+	que_t*		graph;
+	char*		str_tmp;
+	ibool		dict_locked;
+
+	if (fts_table != NULL) {
+		char*	table_name;
+
+		table_name = fts_get_table_name(fts_table);
+		str_tmp = ut_strreplace(sql, "%s", table_name);
+		mem_free(table_name);
+	} else {
+		ulint	sql_len = strlen(sql) + 1;
+
+		str_tmp = static_cast<char*>(mem_alloc(sql_len));
+		strcpy(str_tmp, sql);
+	}
+
+	str = ut_str3cat(fts_sql_begin, str_tmp, fts_sql_end);
+	mem_free(str_tmp);
+
+	dict_locked = (fts_table && fts_table->table
+		       && (fts_table->table->fts->fts_status
+			   & TABLE_DICT_LOCKED));
+
+	if (!dict_locked) {
+		ut_ad(!mutex_own(&(dict_sys->mutex)));
+
+		/* The InnoDB SQL parser is not re-entrant. */
+		mutex_enter(&dict_sys->mutex);
+	}
+
+	graph = pars_sql(info, str);
+	ut_a(graph);
+
+	if (!dict_locked) {
+		mutex_exit(&dict_sys->mutex);
+	}
+
+	mem_free(str);
+
+	return(graph);
+}
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id.
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql_no_dict_lock(
+/*=======================*/
+	fts_table_t*	fts_table,	/*!< in: FTS aux table info */
+	pars_info_t*	info,		/*!< in: info struct, or NULL */
+	const char*	sql)		/*!< in: SQL string to evaluate */
+{
+	char*		str;
+	que_t*		graph;
+	char*		str_tmp = NULL;
+
+#ifdef UNIV_DEBUG
+	ut_ad(mutex_own(&dict_sys->mutex));
+#endif
+
+	if (fts_table != NULL) {
+		char*		table_name;
+
+		table_name = fts_get_table_name(fts_table);
+		str_tmp = ut_strreplace(sql, "%s", table_name);
+		mem_free(table_name);
+	}
+
+	if (str_tmp != NULL) {
+		str = ut_str3cat(fts_sql_begin, str_tmp, fts_sql_end);
+		mem_free(str_tmp);
+	} else {
+		str = ut_str3cat(fts_sql_begin, sql, fts_sql_end);
+	}
+
+	//fprintf(stderr, "%s\n", str);
+
+	graph = pars_sql(info, str);
+	ut_a(graph);
+
+	mem_free(str);
+
+	return(graph);
+}
+
+/******************************************************************//**
+Evaluate an SQL query graph.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_eval_sql(
+/*=========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t*		graph)		/*!< in: Query graph to evaluate */
+{
+	que_thr_t*	thr;
+
+	graph->trx = trx;
+	graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+	ut_a(thr = que_fork_start_command(graph));
+
+	que_run_threads(thr);
+
+	return(trx->error_state);
+}
+
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated WHERE string */
+UNIV_INTERN
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+	dict_index_t*   index,		/*!< in: index */
+	pars_info_t*    info,		/*!< in/out: parser info */
+	mem_heap_t*     heap)		/*!< in: memory heap */
+{
+	ulint		i;
+	const char*	str = "";
+
+	for (i = 0; i < index->n_user_defined_cols; i++) {
+		char*           sel_str;
+
+		dict_field_t*   field = dict_index_get_nth_field(index, i);
+
+		sel_str = mem_heap_printf(heap, "sel%lu", (ulong) i);
+
+		/* Set copy_name to TRUE since it's dynamic. */
+		pars_info_bind_id(info, TRUE, sel_str, field->name);
+
+		str = mem_heap_printf(
+			heap, "%s%s$%s", str, (*str) ? ", " : "", sel_str);
+	}
+
+	return(str);
+}
+
+/******************************************************************//**
+Commit a transaction.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_sql_commit(
+/*===========*/
+	trx_t*		trx)		/*!< in: transaction */
+{
+	ulint	error;
+
+	error = trx_commit_for_mysql(trx);
+
+	/* Commit above returns 0 on success, it should always succeed */
+	ut_a(error == DB_SUCCESS);
+
+	return(DB_SUCCESS);
+}
+
+/******************************************************************//**
+Rollback a transaction.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_sql_rollback(
+/*=============*/
+	trx_t*		trx)		/*!< in: transaction */
+{
+	return(trx_rollback_to_savepoint(trx, NULL));
+}
diff --git a/storage/innobase/fts/fts0tlex.cc b/storage/innobase/fts/fts0tlex.cc
new file mode 100644
index 00000000000..69b859716d5
--- /dev/null
+++ b/storage/innobase/fts/fts0tlex.cc
@@ -0,0 +1,1946 @@
+#include "univ.i"
+#line 2 "fts0tlex.cc"
+
+#line 4 "fts0tlex.cc"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif	/* defined (__STDC__) */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an unsigned
+ * integer for use as an array index.  If the signed char is negative,
+ * we want to instead treat it as an 8-bit unsigned char, hence the
+ * double cast.
+ */
+#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Enter a start condition.  This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN yyg->yy_start = 1 + 2 *
+
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state.  The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START ((yyg->yy_start - 1) / 2)
+#define YYSTATE YY_START
+
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE fts0trestart(yyin ,yyscanner )
+
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE   ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+    #define YY_LESS_LINENO(n)
+
+/* Return all but the first "n" matched characters back to the input stream. */
+#define yyless(n) \
+	do \
+		{ \
+		/* Undo effects of setting up yytext. */ \
+        int yyless_macro_arg = (n); \
+        YY_LESS_LINENO(yyless_macro_arg);\
+		*yy_cp = yyg->yy_hold_char; \
+		YY_RESTORE_YY_MORE_OFFSET \
+		yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+		YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+		} \
+	while ( 0 )
+
+#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner )
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	int yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+	/* When an EOF's been seen but there's still some text to process
+	 * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+	 * shouldn't try reading from the input source any more.  We might
+	 * still have a bunch of tokens to match, though, because of
+	 * possible backing-up.
+	 *
+	 * When we actually see the EOF, we change the status to "new"
+	 * (via fts0trestart()), so that the user can continue scanning by
+	 * just pointing yyin at a new input file.
+	 */
+#define YY_BUFFER_EOF_PENDING 2
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \
+                          ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \
+                          : NULL)
+
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top]
+
+void fts0trestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0t_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0t_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0tpop_buffer_state (yyscan_t yyscanner );
+
+static void fts0tensure_buffer_stack (yyscan_t yyscanner );
+static void fts0t_load_buffer_state (yyscan_t yyscanner );
+static void fts0t_init_buffer (YY_BUFFER_STATE b,FILE *file ,yyscan_t yyscanner );
+
+#define YY_FLUSH_BUFFER fts0t_flush_buffer(YY_CURRENT_BUFFER ,yyscanner)
+
+YY_BUFFER_STATE fts0t_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0talloc (yy_size_t , yyscan_t yyscanner __attribute__((unused)) );
+void *fts0trealloc (void *,yy_size_t , yyscan_t yyscanner __attribute__((unused)) );
+void fts0tfree (void * , yyscan_t yyscanner __attribute__((unused)) );
+
+#define yy_new_buffer fts0t_create_buffer
+
+#define yy_set_interactive(is_interactive) \
+	{ \
+	if ( ! YY_CURRENT_BUFFER ){ \
+        fts0tensure_buffer_stack (yyscanner); \
+		YY_CURRENT_BUFFER_LVALUE =    \
+            fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+	} \
+	YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+	}
+
+#define yy_set_bol(at_bol) \
+	{ \
+	if ( ! YY_CURRENT_BUFFER ){\
+        fts0tensure_buffer_stack (yyscanner); \
+		YY_CURRENT_BUFFER_LVALUE =    \
+            fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+	} \
+	YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+	}
+
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define fts0twrap(n) 1
+#define YY_SKIP_YYWRAP
+
+typedef unsigned char YY_CHAR;
+
+typedef int yy_state_type;
+
+#define yytext_ptr yytext_r
+
+static yy_state_type yy_get_previous_state (yyscan_t yyscanner );
+static yy_state_type yy_try_NUL_trans (yy_state_type current_state  ,yyscan_t yyscanner);
+static int yy_get_next_buffer (yyscan_t yyscanner );
+static void yy_fatal_error (yyconst char msg[] , yyscan_t yyscanner __attribute__((unused)) );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ */
+#define YY_DO_BEFORE_ACTION \
+	yyg->yytext_ptr = yy_bp; \
+	yyleng = (size_t) (yy_cp - yy_bp); \
+	yyg->yy_hold_char = *yy_cp; \
+	*yy_cp = '\0'; \
+	yyg->yy_c_buf_p = yy_cp;
+
+#define YY_NUM_RULES 6
+#define YY_END_OF_BUFFER 7
+/* This struct is not used in this scanner,
+   but its presence is necessary. */
+struct yy_trans_info
+	{
+	flex_int32_t yy_verify;
+	flex_int32_t yy_nxt;
+	};
+static yyconst flex_int16_t yy_accept[16] =
+    {   0,
+        4,    4,    7,    4,    1,    5,    1,    6,    2,    4,
+        1,    1,    0,    3,    0
+    } ;
+
+static yyconst flex_int32_t yy_ec[256] =
+    {   0,
+        1,    1,    1,    1,    1,    1,    1,    1,    2,    3,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    4,    1,    5,    1,    1,    1,    1,    1,    1,
+        1,    6,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1
+    } ;
+
+static yyconst flex_int32_t yy_meta[7] =
+    {   0,
+        1,    2,    3,    4,    5,    1
+    } ;
+
+static yyconst flex_int16_t yy_base[19] =
+    {   0,
+        0,    0,   17,    0,    5,   20,    0,    8,    0,    0,
+        0,    0,    3,   20,   20,    9,   10,   14
+    } ;
+
+static yyconst flex_int16_t yy_def[19] =
+    {   0,
+       15,    1,   15,   16,   16,   15,   17,   18,   16,   16,
+        5,   17,   18,   15,    0,   15,   15,   15
+    } ;
+
+static yyconst flex_int16_t yy_nxt[27] =
+    {   0,
+        4,    5,    6,    7,    8,    9,   11,   14,   12,   10,
+       10,   12,   14,   12,   13,   13,   15,   13,   13,    3,
+       15,   15,   15,   15,   15,   15
+    } ;
+
+static yyconst flex_int16_t yy_chk[27] =
+    {   0,
+        1,    1,    1,    1,    1,    1,    5,   13,    5,   16,
+       16,   17,    8,   17,   18,   18,    3,   18,   18,   15,
+       15,   15,   15,   15,   15,   15
+    } ;
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0
+#define YY_RESTORE_YY_MORE_OFFSET
+#line 1 "fts0tlex.l"
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**
+ * @file fts/fts0tlex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+#line 27 "fts0tlex.l"
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner)
+
+#define YY_NO_INPUT 1
+#line 480 "fts0tlex.cc"
+
+#define INITIAL 0
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+/* Holds the entire state of the reentrant scanner. */
+struct yyguts_t
+    {
+
+    /* User-defined. Not touched by flex. */
+    YY_EXTRA_TYPE yyextra_r;
+
+    /* The rest are the same as the globals declared in the non-reentrant scanner. */
+    FILE *yyin_r, *yyout_r;
+    size_t yy_buffer_stack_top; /**< index of top of stack. */
+    size_t yy_buffer_stack_max; /**< capacity of stack. */
+    YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */
+    char yy_hold_char;
+    int yy_n_chars;
+    int yyleng_r;
+    char *yy_c_buf_p;
+    int yy_init;
+    int yy_start;
+    int yy_did_buffer_switch_on_eof;
+    int yy_start_stack_ptr;
+    int yy_start_stack_depth;
+    int *yy_start_stack;
+    yy_state_type yy_last_accepting_state;
+    char* yy_last_accepting_cpos;
+
+    int yylineno_r;
+    int yy_flex_debug_r;
+
+    char *yytext_r;
+    int yy_more_flag;
+    int yy_more_len;
+
+    }; /* end struct yyguts_t */
+
+static int yy_init_globals (yyscan_t yyscanner );
+
+int fts0tlex_init (yyscan_t* scanner);
+
+int fts0tlex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int fts0tlex_destroy (yyscan_t yyscanner );
+
+int fts0tget_debug (yyscan_t yyscanner );
+
+void fts0tset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner );
+
+void fts0tset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0tget_in (yyscan_t yyscanner );
+
+void fts0tset_in  (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0tget_out (yyscan_t yyscanner );
+
+void fts0tset_out  (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0tget_leng (yyscan_t yyscanner );
+
+char *fts0tget_text (yyscan_t yyscanner );
+
+int fts0tget_lineno (yyscan_t yyscanner );
+
+void fts0tset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int fts0twrap (yyscan_t yyscanner );
+#else
+extern int fts0twrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int , yyscan_t yyscanner __attribute__((unused)));
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * , yyscan_t yyscanner __attribute__((unused)));
+#endif
+
+#ifndef YY_NO_INPUT
+
+#ifdef __cplusplus
+static int yyinput (yyscan_t yyscanner );
+#else
+static int input (yyscan_t yyscanner );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0)
+#endif
+
+/* Gets input and stuffs it into "buf".  number of characters read, or YY_NULL,
+ * is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+		{ \
+		int c = '*'; \
+		size_t n; \
+		for ( n = 0; n < max_size && \
+			     (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+			buf[n] = (char) c; \
+		if ( c == '\n' ) \
+			buf[n++] = (char) c; \
+		if ( c == EOF && ferror( yyin ) ) \
+			YY_FATAL_ERROR( "input in flex scanner failed" ); \
+		result = n; \
+		} \
+	else \
+		{ \
+		errno=0; \
+		while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \
+			{ \
+			if( errno != EINTR) \
+				{ \
+				YY_FATAL_ERROR( "input in flex scanner failed" ); \
+				break; \
+				} \
+			errno=0; \
+			clearerr(yyin); \
+			} \
+		}\
+\
+
+#endif
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner)
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0tlex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0tlex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK break;
+#endif
+
+#define YY_RULE_SETUP \
+	YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ */
+YY_DECL
+{
+	register yy_state_type yy_current_state;
+	register char *yy_cp, *yy_bp;
+	register int yy_act;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+#line 44 "fts0tlex.l"
+
+
+#line 707 "fts0tlex.cc"
+
+	if ( !yyg->yy_init )
+		{
+		yyg->yy_init = 1;
+
+#ifdef YY_USER_INIT
+		YY_USER_INIT;
+#endif
+
+		if ( ! yyg->yy_start )
+			yyg->yy_start = 1;	/* first start state */
+
+		if ( ! yyin )
+			yyin = stdin;
+
+		if ( ! yyout )
+			yyout = stdout;
+
+		if ( ! YY_CURRENT_BUFFER ) {
+			fts0tensure_buffer_stack (yyscanner);
+			YY_CURRENT_BUFFER_LVALUE =
+				fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+		}
+
+		fts0t_load_buffer_state(yyscanner );
+		}
+
+	while ( 1 )		/* loops until end-of-file is reached */
+		{
+		yy_cp = yyg->yy_c_buf_p;
+
+		/* Support of yytext. */
+		*yy_cp = yyg->yy_hold_char;
+
+		/* yy_bp points to the position in yy_ch_buf of the start of
+		 * the current run.
+		 */
+		yy_bp = yy_cp;
+
+		yy_current_state = yyg->yy_start;
+yy_match:
+		do
+			{
+			register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)];
+			if ( yy_accept[yy_current_state] )
+				{
+				yyg->yy_last_accepting_state = yy_current_state;
+				yyg->yy_last_accepting_cpos = yy_cp;
+				}
+			while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+				{
+				yy_current_state = (int) yy_def[yy_current_state];
+				if ( yy_current_state >= 16 )
+					yy_c = yy_meta[(unsigned int) yy_c];
+				}
+			yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+			++yy_cp;
+			}
+		while ( yy_current_state != 15 );
+		yy_cp = yyg->yy_last_accepting_cpos;
+		yy_current_state = yyg->yy_last_accepting_state;
+
+yy_find_action:
+		yy_act = yy_accept[yy_current_state];
+
+		YY_DO_BEFORE_ACTION;
+
+do_action:	/* This label is used only to access EOF actions. */
+
+		switch ( yy_act )
+	{ /* beginning of action switch */
+			case 0: /* must back up */
+			/* undo the effects of YY_DO_BEFORE_ACTION */
+			*yy_cp = yyg->yy_hold_char;
+			yy_cp = yyg->yy_last_accepting_cpos;
+			yy_current_state = yyg->yy_last_accepting_state;
+			goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 46 "fts0tlex.l"
+/* Ignore whitespace */ ;
+	YY_BREAK
+case 2:
+YY_RULE_SETUP
+#line 48 "fts0tlex.l"
+{
+	val->oper = fts0tget_text(yyscanner)[0];
+
+	return(val->oper);
+}
+	YY_BREAK
+case 3:
+YY_RULE_SETUP
+#line 54 "fts0tlex.l"
+{
+	val->token = strdup(fts0tget_text(yyscanner));
+
+	return(FTS_TEXT);
+}
+	YY_BREAK
+case 4:
+YY_RULE_SETUP
+#line 60 "fts0tlex.l"
+{
+	val->token = strdup(fts0tget_text(yyscanner));
+
+	return(FTS_TERM);
+}
+	YY_BREAK
+case 5:
+/* rule 5 can match eol */
+YY_RULE_SETUP
+#line 66 "fts0tlex.l"
+
+	YY_BREAK
+case 6:
+YY_RULE_SETUP
+#line 68 "fts0tlex.l"
+ECHO;
+	YY_BREAK
+#line 829 "fts0tlex.cc"
+case YY_STATE_EOF(INITIAL):
+	yyterminate();
+
+	case YY_END_OF_BUFFER:
+		{
+		/* Amount of text matched not including the EOB char. */
+		int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1;
+
+		/* Undo the effects of YY_DO_BEFORE_ACTION. */
+		*yy_cp = yyg->yy_hold_char;
+		YY_RESTORE_YY_MORE_OFFSET
+
+		if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+			{
+			/* We're scanning a new file or input source.  It's
+			 * possible that this happened because the user
+			 * just pointed yyin at a new source and called
+			 * fts0tlex().  If so, then we have to assure
+			 * consistency between YY_CURRENT_BUFFER and our
+			 * globals.  Here is the right place to do so, because
+			 * this is the first action (other than possibly a
+			 * back-up) that will match for the new input source.
+			 */
+			yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+			YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+			YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+			}
+
+		/* Note that here we test for yy_c_buf_p "<=" to the position
+		 * of the first EOB in the buffer, since yy_c_buf_p will
+		 * already have been incremented past the NUL character
+		 * (since all states make transitions on EOB to the
+		 * end-of-buffer state).  Contrast this with the test
+		 * in input().
+		 */
+		if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+			{ /* This was really a NUL. */
+			yy_state_type yy_next_state;
+
+			yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text;
+
+			yy_current_state = yy_get_previous_state( yyscanner );
+
+			/* Okay, we're now positioned to make the NUL
+			 * transition.  We couldn't have
+			 * yy_get_previous_state() go ahead and do it
+			 * for us because it doesn't know how to deal
+			 * with the possibility of jamming (and we don't
+			 * want to build jamming into it because then it
+			 * will run more slowly).
+			 */
+
+			yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner);
+
+			yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+
+			if ( yy_next_state )
+				{
+				/* Consume the NUL. */
+				yy_cp = ++yyg->yy_c_buf_p;
+				yy_current_state = yy_next_state;
+				goto yy_match;
+				}
+
+			else
+				{
+				yy_cp = yyg->yy_last_accepting_cpos;
+				yy_current_state = yyg->yy_last_accepting_state;
+				goto yy_find_action;
+				}
+			}
+
+		else switch ( yy_get_next_buffer( yyscanner ) )
+			{
+			case EOB_ACT_END_OF_FILE:
+				{
+				yyg->yy_did_buffer_switch_on_eof = 0;
+
+				if ( fts0twrap(yyscanner ) )
+					{
+					/* Note: because we've taken care in
+					 * yy_get_next_buffer() to have set up
+					 * yytext, we can now set up
+					 * yy_c_buf_p so that if some total
+					 * hoser (like flex itself) wants to
+					 * call the scanner after we return the
+					 * YY_NULL, it'll still work - another
+					 * YY_NULL will get returned.
+					 */
+					yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ;
+
+					yy_act = YY_STATE_EOF(YY_START);
+					goto do_action;
+					}
+
+				else
+					{
+					if ( ! yyg->yy_did_buffer_switch_on_eof )
+						YY_NEW_FILE;
+					}
+				break;
+				}
+
+			case EOB_ACT_CONTINUE_SCAN:
+				yyg->yy_c_buf_p =
+					yyg->yytext_ptr + yy_amount_of_matched_text;
+
+				yy_current_state = yy_get_previous_state( yyscanner );
+
+				yy_cp = yyg->yy_c_buf_p;
+				yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+				goto yy_match;
+
+			case EOB_ACT_LAST_MATCH:
+				yyg->yy_c_buf_p =
+				&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars];
+
+				yy_current_state = yy_get_previous_state( yyscanner );
+
+				yy_cp = yyg->yy_c_buf_p;
+				yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+				goto yy_find_action;
+			}
+		break;
+		}
+
+	default:
+		YY_FATAL_ERROR(
+			"fatal flex scanner internal error--no action found" );
+	} /* end of action switch */
+		} /* end of scanning one token */
+} /* end of fts0tlex */
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Returns a code representing an action:
+ *	EOB_ACT_LAST_MATCH -
+ *	EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ *	EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+	register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+	register char *source = yyg->yytext_ptr;
+	register int number_to_move, i;
+	int ret_val;
+
+	if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] )
+		YY_FATAL_ERROR(
+		"fatal flex scanner internal error--end of buffer missed" );
+
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+		{ /* Don't try to fill the buffer, so this is an EOF. */
+		if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 )
+			{
+			/* We matched a single character, the EOB, so
+			 * treat this as a final EOF.
+			 */
+			return EOB_ACT_END_OF_FILE;
+			}
+
+		else
+			{
+			/* We matched some text prior to the EOB, first
+			 * process it.
+			 */
+			return EOB_ACT_LAST_MATCH;
+			}
+		}
+
+	/* Try to read more data. */
+
+	/* First move last chars to start of buffer. */
+	number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr) - 1;
+
+	for ( i = 0; i < number_to_move; ++i )
+		*(dest++) = *(source++);
+
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+		/* don't do the read, it's not guaranteed to return an EOF,
+		 * just force an EOF
+		 */
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0;
+
+	else
+		{
+			int num_to_read =
+			YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
+
+		while ( num_to_read <= 0 )
+			{ /* Not enough room in the buffer - grow it. */
+
+			/* just a shorter name for the current buffer */
+			YY_BUFFER_STATE b = YY_CURRENT_BUFFER;
+
+			int yy_c_buf_p_offset =
+				(int) (yyg->yy_c_buf_p - b->yy_ch_buf);
+
+			if ( b->yy_is_our_buffer )
+				{
+				int new_size = b->yy_buf_size * 2;
+
+				if ( new_size <= 0 )
+					b->yy_buf_size += b->yy_buf_size / 8;
+				else
+					b->yy_buf_size *= 2;
+
+				b->yy_ch_buf = (char*)
+					/* Include room in for 2 EOB chars. */
+					fts0trealloc((void*) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner );
+				}
+			else
+				/* Can't grow it, we don't own it. */
+				b->yy_ch_buf = 0;
+
+			if ( ! b->yy_ch_buf )
+				YY_FATAL_ERROR(
+				"fatal error - scanner input buffer overflow" );
+
+			yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset];
+
+			num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size -
+						number_to_move - 1;
+
+			}
+
+		if ( num_to_read > YY_READ_BUF_SIZE )
+			num_to_read = YY_READ_BUF_SIZE;
+
+		/* Read in more data. */
+		YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+			yyg->yy_n_chars, (size_t) num_to_read );
+
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	if ( yyg->yy_n_chars == 0 )
+		{
+		if ( number_to_move == YY_MORE_ADJ )
+			{
+			ret_val = EOB_ACT_END_OF_FILE;
+			fts0trestart(yyin  ,yyscanner);
+			}
+
+		else
+			{
+			ret_val = EOB_ACT_LAST_MATCH;
+			YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+				YY_BUFFER_EOF_PENDING;
+			}
+		}
+
+	else
+		ret_val = EOB_ACT_CONTINUE_SCAN;
+
+	if ((yy_size_t) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+		/* Extend the array by 50%, plus the number we really need. */
+		yy_size_t new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1);
+		YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char*) fts0trealloc((void*) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner );
+		if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+			YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+	}
+
+	yyg->yy_n_chars += number_to_move;
+	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR;
+	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR;
+
+	yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+	return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+    static yy_state_type yy_get_previous_state (yyscan_t yyscanner)
+{
+	register yy_state_type yy_current_state;
+	register char *yy_cp;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	yy_current_state = yyg->yy_start;
+
+	for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp )
+		{
+		register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1);
+		if ( yy_accept[yy_current_state] )
+			{
+			yyg->yy_last_accepting_state = yy_current_state;
+			yyg->yy_last_accepting_cpos = yy_cp;
+			}
+		while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+			{
+			yy_current_state = (int) yy_def[yy_current_state];
+			if ( yy_current_state >= 16 )
+				yy_c = yy_meta[(unsigned int) yy_c];
+			}
+		yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+		}
+
+	return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ *
+ * synopsis
+ *	next_state = yy_try_NUL_trans( current_state );
+ */
+    static yy_state_type yy_try_NUL_trans  (yy_state_type yy_current_state , yyscan_t yyscanner)
+{
+	register int yy_is_jam;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner; /* This var may be unused depending upon options. */
+	register char *yy_cp = yyg->yy_c_buf_p;
+
+	register YY_CHAR yy_c = 1;
+	if ( yy_accept[yy_current_state] )
+		{
+		yyg->yy_last_accepting_state = yy_current_state;
+		yyg->yy_last_accepting_cpos = yy_cp;
+		}
+	while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+		{
+		yy_current_state = (int) yy_def[yy_current_state];
+		if ( yy_current_state >= 16 )
+			yy_c = yy_meta[(unsigned int) yy_c];
+		}
+	yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+	yy_is_jam = (yy_current_state == 15);
+
+	return yy_is_jam ? 0 : yy_current_state;
+}
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+    static int yyinput (yyscan_t yyscanner)
+#else
+    static int input  (yyscan_t yyscanner)
+#endif
+
+{
+	int c;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	*yyg->yy_c_buf_p = yyg->yy_hold_char;
+
+	if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR )
+		{
+		/* yy_c_buf_p now points to the character we want to return.
+		 * If this occurs *before* the EOB characters, then it's a
+		 * valid NUL; if not, then we've hit the end of the buffer.
+		 */
+		if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+			/* This was really a NUL. */
+			*yyg->yy_c_buf_p = '\0';
+
+		else
+			{ /* need more input */
+			int offset = yyg->yy_c_buf_p - yyg->yytext_ptr;
+			++yyg->yy_c_buf_p;
+
+			switch ( yy_get_next_buffer( yyscanner ) )
+				{
+				case EOB_ACT_LAST_MATCH:
+					/* This happens because yy_g_n_b()
+					 * sees that we've accumulated a
+					 * token and flags that we need to
+					 * try matching the token before
+					 * proceeding.  But for input(),
+					 * there's no matching to consider.
+					 * So convert the EOB_ACT_LAST_MATCH
+					 * to EOB_ACT_END_OF_FILE.
+					 */
+
+					/* Reset buffer status. */
+					fts0trestart(yyin ,yyscanner);
+
+					/*FALLTHROUGH*/
+
+				case EOB_ACT_END_OF_FILE:
+					{
+					if ( fts0twrap(yyscanner ) )
+						return EOF;
+
+					if ( ! yyg->yy_did_buffer_switch_on_eof )
+						YY_NEW_FILE;
+#ifdef __cplusplus
+					return yyinput(yyscanner);
+#else
+					return input(yyscanner);
+#endif
+					}
+
+				case EOB_ACT_CONTINUE_SCAN:
+					yyg->yy_c_buf_p = yyg->yytext_ptr + offset;
+					break;
+				}
+			}
+		}
+
+	c = *(unsigned char*) yyg->yy_c_buf_p;	/* cast for 8-bit char's */
+	*yyg->yy_c_buf_p = '\0';	/* preserve yytext */
+	yyg->yy_hold_char = *++yyg->yy_c_buf_p;
+
+	return c;
+}
+#endif	/* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * @param input_file A readable stream.
+ * @param yyscanner The scanner object.
+ * @note This function does not reset the start condition to @c INITIAL .
+ */
+    void fts0trestart  (FILE * input_file , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	if ( ! YY_CURRENT_BUFFER ){
+        fts0tensure_buffer_stack (yyscanner);
+		YY_CURRENT_BUFFER_LVALUE =
+            fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+	}
+
+	fts0t_init_buffer(YY_CURRENT_BUFFER,input_file ,yyscanner);
+	fts0t_load_buffer_state(yyscanner );
+}
+
+/** Switch to a different input buffer.
+ * @param new_buffer The new input buffer.
+ * @param yyscanner The scanner object.
+ */
+    void fts0t_switch_to_buffer  (YY_BUFFER_STATE  new_buffer , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	/* TODO. We should be able to replace this entire function body
+	 * with
+	 *		fts0tpop_buffer_state();
+	 *		fts0tpush_buffer_state(new_buffer);
+     */
+	fts0tensure_buffer_stack (yyscanner);
+	if ( YY_CURRENT_BUFFER == new_buffer )
+		return;
+
+	if ( YY_CURRENT_BUFFER )
+		{
+		/* Flush out information for old buffer. */
+		*yyg->yy_c_buf_p = yyg->yy_hold_char;
+		YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	YY_CURRENT_BUFFER_LVALUE = new_buffer;
+	fts0t_load_buffer_state(yyscanner );
+
+	/* We don't actually know whether we did this switch during
+	 * EOF (fts0twrap()) processing, but the only time this flag
+	 * is looked at is after fts0twrap() is called, so it's safe
+	 * to go ahead and always set it.
+	 */
+	yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+static void fts0t_load_buffer_state  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+	yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+	yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+	yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+	yyg->yy_hold_char = *yyg->yy_c_buf_p;
+}
+
+/** Allocate and initialize an input buffer state.
+ * @param file A readable stream.
+ * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
+ * @param yyscanner The scanner object.
+ * @return the allocated buffer state.
+ */
+    YY_BUFFER_STATE fts0t_create_buffer  (FILE * file, int  size , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE b;
+
+	b = (YY_BUFFER_STATE) fts0talloc(sizeof( struct yy_buffer_state ) ,yyscanner );
+	if ( ! b )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0t_create_buffer()" );
+
+	b->yy_buf_size = size;
+
+	/* yy_ch_buf has to be 2 characters longer than the size given because
+	 * we need to put in 2 end-of-buffer characters.
+	 */
+	b->yy_ch_buf = (char*) fts0talloc(b->yy_buf_size + 2 ,yyscanner );
+	if ( ! b->yy_ch_buf )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0t_create_buffer()" );
+
+	b->yy_is_our_buffer = 1;
+
+	fts0t_init_buffer(b,file ,yyscanner);
+
+	return b;
+}
+
+/** Destroy the buffer.
+ * @param b a buffer created with fts0t_create_buffer()
+ * @param yyscanner The scanner object.
+ */
+    void fts0t_delete_buffer (YY_BUFFER_STATE  b , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	if ( ! b )
+		return;
+
+	if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+		YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+	if ( b->yy_is_our_buffer )
+		fts0tfree((void*) b->yy_ch_buf ,yyscanner );
+
+	fts0tfree((void*) b ,yyscanner );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a fts0trestart() or at EOF.
+ */
+    static void fts0t_init_buffer  (YY_BUFFER_STATE  b, FILE * file , yyscan_t yyscanner)
+
+{
+	int oerrno = errno;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	fts0t_flush_buffer(b ,yyscanner);
+
+	b->yy_input_file = file;
+	b->yy_fill_buffer = 1;
+
+    /* If b is the current buffer, then fts0t_init_buffer was _probably_
+     * called from fts0trestart() or through yy_get_next_buffer.
+     * In that case, we don't want to reset the lineno or column.
+     */
+    if (b != YY_CURRENT_BUFFER){
+        b->yy_bs_lineno = 1;
+        b->yy_bs_column = 0;
+    }
+
+        b->yy_is_interactive = 0;
+
+	errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ * @param yyscanner The scanner object.
+ */
+    void fts0t_flush_buffer (YY_BUFFER_STATE  b , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+	if ( ! b )
+		return;
+
+	b->yy_n_chars = 0;
+
+	/* We always need two end-of-buffer characters.  The first causes
+	 * a transition to the end-of-buffer state.  The second causes
+	 * a jam in that state.
+	 */
+	b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+	b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+	b->yy_buf_pos = &b->yy_ch_buf[0];
+
+	b->yy_at_bol = 1;
+	b->yy_buffer_status = YY_BUFFER_NEW;
+
+	if ( b == YY_CURRENT_BUFFER )
+		fts0t_load_buffer_state(yyscanner );
+}
+
+/** Pushes the new state onto the stack. The new state becomes
+ *  the current state. This function will allocate the stack
+ *  if necessary.
+ *  @param new_buffer The new state.
+ *  @param yyscanner The scanner object.
+ */
+void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+	if (new_buffer == NULL)
+		return;
+
+	fts0tensure_buffer_stack(yyscanner);
+
+	/* This block is copied from fts0t_switch_to_buffer. */
+	if ( YY_CURRENT_BUFFER )
+		{
+		/* Flush out information for old buffer. */
+		*yyg->yy_c_buf_p = yyg->yy_hold_char;
+		YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+		}
+
+	/* Only push if top exists. Otherwise, replace top. */
+	if (YY_CURRENT_BUFFER)
+		yyg->yy_buffer_stack_top++;
+	YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+	/* copied from fts0t_switch_to_buffer. */
+	fts0t_load_buffer_state(yyscanner );
+	yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ *  The next element becomes the new top.
+ *  @param yyscanner The scanner object.
+ */
+void fts0tpop_buffer_state (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+	if (!YY_CURRENT_BUFFER)
+		return;
+
+	fts0t_delete_buffer(YY_CURRENT_BUFFER ,yyscanner);
+	YY_CURRENT_BUFFER_LVALUE = NULL;
+	if (yyg->yy_buffer_stack_top > 0)
+		--yyg->yy_buffer_stack_top;
+
+	if (YY_CURRENT_BUFFER) {
+		fts0t_load_buffer_state(yyscanner );
+		yyg->yy_did_buffer_switch_on_eof = 1;
+	}
+}
+
+/* Allocates the stack if it does not exist.
+ *  Guarantees space for at least one push.
+ */
+static void fts0tensure_buffer_stack (yyscan_t yyscanner)
+{
+	int num_to_alloc;
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+	if (!yyg->yy_buffer_stack) {
+
+		/* First allocation is just for 2 elements, since we don't know if this
+		 * scanner will even need a stack. We use 2 instead of 1 to avoid an
+		 * immediate realloc on the next call.
+         */
+		num_to_alloc = 1;
+		yyg->yy_buffer_stack = (struct yy_buffer_state**) fts0talloc
+								(num_to_alloc * sizeof(struct yy_buffer_state*)
+								, yyscanner);
+		if ( ! yyg->yy_buffer_stack )
+			YY_FATAL_ERROR( "out of dynamic memory in fts0tensure_buffer_stack()" );
+
+		memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+
+		yyg->yy_buffer_stack_max = num_to_alloc;
+		yyg->yy_buffer_stack_top = 0;
+		return;
+	}
+
+	if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){
+
+		/* Increase the buffer to prepare for a possible push. */
+		int grow_size = 8 /* arbitrary grow size */;
+
+		num_to_alloc = yyg->yy_buffer_stack_max + grow_size;
+		yyg->yy_buffer_stack = (struct yy_buffer_state**) fts0trealloc
+								(yyg->yy_buffer_stack,
+								num_to_alloc * sizeof(struct yy_buffer_state*)
+								, yyscanner);
+		if ( ! yyg->yy_buffer_stack )
+			YY_FATAL_ERROR( "out of dynamic memory in fts0tensure_buffer_stack()" );
+
+		/* zero only the new slots.*/
+		memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*));
+		yyg->yy_buffer_stack_max = num_to_alloc;
+	}
+}
+
+/** Setup the input buffer state to scan directly from a user-specified character buffer.
+ * @param base the character buffer
+ * @param size the size in bytes of the character buffer
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE fts0t_scan_buffer  (char * base, yy_size_t  size , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE b;
+
+	if ( size < 2 ||
+	     base[size-2] != YY_END_OF_BUFFER_CHAR ||
+	     base[size-1] != YY_END_OF_BUFFER_CHAR )
+		/* They forgot to leave room for the EOB's. */
+		return 0;
+
+	b = (YY_BUFFER_STATE) fts0talloc(sizeof( struct yy_buffer_state ) ,yyscanner );
+	if ( ! b )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0t_scan_buffer()" );
+
+	b->yy_buf_size = size - 2;	/* "- 2" to take care of EOB's */
+	b->yy_buf_pos = b->yy_ch_buf = base;
+	b->yy_is_our_buffer = 0;
+	b->yy_input_file = 0;
+	b->yy_n_chars = b->yy_buf_size;
+	b->yy_is_interactive = 0;
+	b->yy_at_bol = 1;
+	b->yy_fill_buffer = 0;
+	b->yy_buffer_status = YY_BUFFER_NEW;
+
+	fts0t_switch_to_buffer(b ,yyscanner );
+
+	return b;
+}
+
+/** Setup the input buffer state to scan a string. The next call to fts0tlex() will
+ * scan from a @e copy of @a str.
+ * @param yystr a NUL-terminated string to scan
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ * @note If you want to scan bytes that may contain NUL values, then use
+ *       fts0t_scan_bytes() instead.
+ */
+YY_BUFFER_STATE fts0t_scan_string (yyconst char * yystr , yyscan_t yyscanner)
+{
+
+	return fts0t_scan_bytes(yystr,strlen(yystr) ,yyscanner);
+}
+
+/** Setup the input buffer state to scan the given bytes. The next call to fts0tlex() will
+ * scan from a @e copy of @a bytes.
+ * @param yybytes the byte buffer to scan
+ * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes.
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE fts0t_scan_bytes  (yyconst char * yybytes, int  _yybytes_len , yyscan_t yyscanner)
+{
+	YY_BUFFER_STATE b;
+	char *buf;
+	yy_size_t n;
+	int i;
+
+	/* Get memory for full buffer, including space for trailing EOB's. */
+	n = _yybytes_len + 2;
+	buf = (char*) fts0talloc(n ,yyscanner );
+	if ( ! buf )
+		YY_FATAL_ERROR( "out of dynamic memory in fts0t_scan_bytes()" );
+
+	for ( i = 0; i < _yybytes_len; ++i )
+		buf[i] = yybytes[i];
+
+	buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR;
+
+	b = fts0t_scan_buffer(buf,n ,yyscanner);
+	if ( ! b )
+		YY_FATAL_ERROR( "bad buffer in fts0t_scan_bytes()" );
+
+	/* It's okay to grow etc. this buffer, and we should throw it
+	 * away when we're done.
+	 */
+	b->yy_is_our_buffer = 1;
+
+	return b;
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yy_fatal_error (yyconst char* msg ,  yyscan_t yyscanner __attribute__((unused)))
+{
+    	(void) fprintf( stderr, "%s\n", msg );
+	exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+	do \
+		{ \
+		/* Undo effects of setting up yytext. */ \
+        int yyless_macro_arg = (n); \
+        YY_LESS_LINENO(yyless_macro_arg);\
+		yytext[yyleng] = yyg->yy_hold_char; \
+		yyg->yy_c_buf_p = yytext + yyless_macro_arg; \
+		yyg->yy_hold_char = *yyg->yy_c_buf_p; \
+		*yyg->yy_c_buf_p = '\0'; \
+		yyleng = yyless_macro_arg; \
+		} \
+	while ( 0 )
+
+/* Accessor  methods (get/set functions) to struct members. */
+
+/** Get the user-defined data for this scanner.
+ * @param yyscanner The scanner object.
+ */
+YY_EXTRA_TYPE fts0tget_extra  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yyextra;
+}
+
+/** Get the current line number.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_lineno  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+        if (! YY_CURRENT_BUFFER)
+            return 0;
+
+    return yylineno;
+}
+
+/** Get the current column number.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_column  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+        if (! YY_CURRENT_BUFFER)
+            return 0;
+
+    return yycolumn;
+}
+
+/** Get the input stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0tget_in  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yyin;
+}
+
+/** Get the output stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0tget_out  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yyout;
+}
+
+/** Get the length of the current token.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_leng  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yyleng;
+}
+
+/** Get the current token.
+ * @param yyscanner The scanner object.
+ */
+
+char *fts0tget_text  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yytext;
+}
+
+/** Set the user-defined data. This data is never touched by the scanner.
+ * @param user_defined The data to be associated with this scanner.
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_extra (YY_EXTRA_TYPE  user_defined , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    yyextra = user_defined ;
+}
+
+/** Set the current line number.
+ * @param line_number
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_lineno (int  line_number , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+        /* lineno is only valid if an input buffer exists. */
+        if (! YY_CURRENT_BUFFER )
+           yy_fatal_error( "fts0tset_lineno called with no buffer" , yyscanner);
+
+    yylineno = line_number;
+}
+
+/** Set the current column.
+ * @param line_number
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_column (int  column_no , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+        /* column is only valid if an input buffer exists. */
+        if (! YY_CURRENT_BUFFER )
+           yy_fatal_error( "fts0tset_column called with no buffer" , yyscanner);
+
+    yycolumn = column_no;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param in_str A readable stream.
+ * @param yyscanner The scanner object.
+ * @see fts0t_switch_to_buffer
+ */
+void fts0tset_in (FILE *  in_str , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    yyin = in_str ;
+}
+
+void fts0tset_out (FILE *  out_str , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    yyout = out_str ;
+}
+
+int fts0tget_debug  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    return yy_flex_debug;
+}
+
+void fts0tset_debug (int  bdebug , yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    yy_flex_debug = bdebug ;
+}
+
+/* Accessor methods for yylval and yylloc */
+
+/* User-visible API */
+
+/* fts0tlex_init is special because it creates the scanner itself, so it is
+ * the ONLY reentrant function that doesn't take the scanner as the last argument.
+ * That's why we explicitly handle the declaration, instead of using our macros.
+ */
+
+int fts0tlex_init(yyscan_t* ptr_yy_globals)
+
+{
+    if (ptr_yy_globals == NULL){
+        errno = EINVAL;
+        return 1;
+    }
+
+    *ptr_yy_globals = (yyscan_t) fts0talloc ( sizeof( struct yyguts_t ), NULL );
+
+    if (*ptr_yy_globals == NULL){
+        errno = ENOMEM;
+        return 1;
+    }
+
+    /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */
+    memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+    return yy_init_globals ( *ptr_yy_globals );
+}
+
+/* fts0tlex_init_extra has the same functionality as fts0tlex_init, but follows the
+ * convention of taking the scanner as the last argument. Note however, that
+ * this is a *pointer* to a scanner, as it will be allocated by this call (and
+ * is the reason, too, why this function also must handle its own declaration).
+ * The user defined value in the first argument will be available to fts0talloc in
+ * the yyextra field.
+ */
+
+int fts0tlex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals )
+
+{
+    struct yyguts_t dummy_yyguts;
+
+    fts0tset_extra (yy_user_defined, &dummy_yyguts);
+
+    if (ptr_yy_globals == NULL){
+        errno = EINVAL;
+        return 1;
+    }
+
+    *ptr_yy_globals = (yyscan_t) fts0talloc ( sizeof( struct yyguts_t ), &dummy_yyguts );
+
+    if (*ptr_yy_globals == NULL){
+        errno = ENOMEM;
+        return 1;
+    }
+
+    /* By setting to 0xAA, we expose bugs in
+    yy_init_globals. Leave at 0x00 for releases. */
+    memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+    fts0tset_extra (yy_user_defined, *ptr_yy_globals);
+
+    return yy_init_globals ( *ptr_yy_globals );
+}
+
+static int yy_init_globals (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+    /* Initialization is the same as for the non-reentrant scanner.
+     * This function is called from fts0tlex_destroy(), so don't allocate here.
+     */
+
+    yyg->yy_buffer_stack = 0;
+    yyg->yy_buffer_stack_top = 0;
+    yyg->yy_buffer_stack_max = 0;
+    yyg->yy_c_buf_p = (char*) 0;
+    yyg->yy_init = 0;
+    yyg->yy_start = 0;
+
+    yyg->yy_start_stack_ptr = 0;
+    yyg->yy_start_stack_depth = 0;
+    yyg->yy_start_stack =  NULL;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+    yyin = stdin;
+    yyout = stdout;
+#else
+    yyin = (FILE*) 0;
+    yyout = (FILE*) 0;
+#endif
+
+    /* For future reference: Set errno on error, since we are called by
+     * fts0tlex_init()
+     */
+    return 0;
+}
+
+/* fts0tlex_destroy is for both reentrant and non-reentrant scanners. */
+int fts0tlex_destroy  (yyscan_t yyscanner)
+{
+    struct yyguts_t * yyg = (struct yyguts_t*) yyscanner;
+
+    /* Pop the buffer stack, destroying each element. */
+	while(YY_CURRENT_BUFFER){
+		fts0t_delete_buffer(YY_CURRENT_BUFFER ,yyscanner );
+		YY_CURRENT_BUFFER_LVALUE = NULL;
+		fts0tpop_buffer_state(yyscanner);
+	}
+
+	/* Destroy the stack itself. */
+	fts0tfree(yyg->yy_buffer_stack ,yyscanner);
+	yyg->yy_buffer_stack = NULL;
+
+    /* Destroy the start condition stack. */
+        fts0tfree(yyg->yy_start_stack ,yyscanner );
+        yyg->yy_start_stack = NULL;
+
+    /* Reset the globals. This is important in a non-reentrant scanner so the next time
+     * fts0tlex() is called, initialization will occur. */
+    yy_init_globals( yyscanner);
+
+    /* Destroy the main struct (reentrant only). */
+    fts0tfree ( yyscanner , yyscanner );
+    yyscanner = NULL;
+    return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n ,  yyscan_t yyscanner __attribute__((unused)))
+{
+	register int i;
+	for ( i = 0; i < n; ++i )
+		s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * s ,  yyscan_t yyscanner __attribute__((unused)))
+{
+	register int n;
+	for ( n = 0; s[n]; ++n )
+		;
+
+	return n;
+}
+#endif
+
+void *fts0talloc (yy_size_t  size ,  yyscan_t yyscanner __attribute__((unused)))
+{
+	return (void*) malloc( size );
+}
+
+void *fts0trealloc  (void * ptr, yy_size_t  size ,  yyscan_t yyscanner __attribute__((unused)))
+{
+	/* The cast to (char*) in the following accommodates both
+	 * implementations that use char* generic pointers, and those
+	 * that use void* generic pointers.  It works with the latter
+	 * because both ANSI C and C++ allow castless assignment from
+	 * any pointer type to void*, and deal with argument conversions
+	 * as though doing an assignment.
+	 */
+	return (void*) realloc( (char*) ptr, size );
+}
+
+void fts0tfree (void * ptr ,  yyscan_t yyscanner __attribute__((unused)))
+{
+	free( (char*) ptr );	/* see fts0trealloc() for (char*) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 68 "fts0tlex.l"
+
+
+
diff --git a/storage/innobase/fts/fts0tlex.l b/storage/innobase/fts/fts0tlex.l
new file mode 100644
index 00000000000..8b04a9fecf1
--- /dev/null
+++ b/storage/innobase/fts/fts0tlex.l
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0tlex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner)
+
+%}
+
+%option noinput
+%option nounput
+%option noyywrap
+%option nostdinit
+%option reentrant
+%option never-interactive
+
+
+%%
+
+[\t ]+	/* Ignore whitespace */ ;
+
+[*]	{
+	val->oper = fts0tget_text(yyscanner)[0];
+
+	return(val->oper);
+}
+
+\"[^\"\n]*\"	{
+	val->token = strdup(fts0tget_text(yyscanner));
+
+	return(FTS_TEXT);
+}
+
+[^" \n]*	{
+	val->token = strdup(fts0tget_text(yyscanner));
+
+	return(FTS_TERM);
+}
+
+\n
+
+%%
diff --git a/storage/innobase/fts/make_parser.sh b/storage/innobase/fts/make_parser.sh
new file mode 100755
index 00000000000..2c072914c8b
--- /dev/null
+++ b/storage/innobase/fts/make_parser.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+#
+# Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+
+TMPF=t.$$
+
+make -f Makefile.query
+
+echo '#include "univ.i"' > $TMPF
+
+# This is to avoid compiler warning about unused parameters.
+# FIXME: gcc extension "__attribute__" causing compilation errors on windows
+# platform. Quote them out for now.
+sed -e '
+s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+' < fts0blex.cc >> $TMPF
+
+mv $TMPF fts0blex.cc
+
+echo '#include "univ.i"' > $TMPF
+
+sed -e '
+s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+' < fts0tlex.cc >> $TMPF
+
+mv $TMPF fts0tlex.cc
diff --git a/storage/innobase/fut/fut0fut.c b/storage/innobase/fut/fut0fut.cc
index 20b45a575e6..9bb1c512182 100644
--- a/storage/innobase/fut/fut0fut.c
+++ b/storage/innobase/fut/fut0fut.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /******************************************************************//**
-@file fut/fut0fut.c
+@file fut/fut0fut.cc
 File-based utilities
 
 Created 12/13/1995 Heikki Tuuri
diff --git a/storage/innobase/fut/fut0lst.c b/storage/innobase/fut/fut0lst.cc
index a1e21c22725..8f96a6426d2 100644
--- a/storage/innobase/fut/fut0lst.c
+++ b/storage/innobase/fut/fut0lst.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /******************************************************************//**
-@file fut/fut0lst.c
+@file fut/fut0lst.cc
 File-based list utilities
 
 Created 11/28/1995 Heikki Tuuri
diff --git a/storage/innobase/ha/ha0ha.c b/storage/innobase/ha/ha0ha.cc
index 594a10dc431..dd99e3afae5 100644
--- a/storage/innobase/ha/ha0ha.c
+++ b/storage/innobase/ha/ha0ha.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file ha/ha0ha.c
+@file ha/ha0ha.cc
 The hash table with external chains
 
 Created 8/22/1994 Heikki Tuuri
@@ -31,7 +31,9 @@ Created 8/22/1994 Heikki Tuuri
 #ifdef UNIV_DEBUG
 # include "buf0buf.h"
 #endif /* UNIV_DEBUG */
-#include "btr0sea.h"
+#ifndef UNIV_HOTBACKUP
+# include "btr0sea.h"
+#endif /* !UNIV_HOTBACKUP */
 #include "page0page.h"
 
 /*************************************************************//**
@@ -44,43 +46,56 @@ ha_create_func(
 /*===========*/
 	ulint	n,		/*!< in: number of array cells */
 #ifdef UNIV_SYNC_DEBUG
-	ulint	mutex_level,	/*!< in: level of the mutexes in the latching
-				order: this is used in the debug version */
+	ulint	sync_level,	/*!< in: level of the mutexes or rw_locks
+				in the latching order: this is used in the
+				 debug version */
 #endif /* UNIV_SYNC_DEBUG */
-	ulint	n_mutexes)	/*!< in: number of mutexes to protect the
-				hash table: must be a power of 2, or 0 */
+	ulint	n_sync_obj,	/*!< in: number of mutexes or rw_locks
+				to protect the hash table: must be a
+				power of 2, or 0 */
+	ulint	type)		/*!< in: type of datastructure for which
+				the memory heap is going to be used e.g.:
+				MEM_HEAP_FOR_BTR_SEARCH or
+				MEM_HEAP_FOR_PAGE_HASH */
 {
 	hash_table_t*	table;
 #ifndef UNIV_HOTBACKUP
 	ulint		i;
 #endif /* !UNIV_HOTBACKUP */
 
-	ut_ad(ut_is_2pow(n_mutexes));
+	ut_a(type == MEM_HEAP_FOR_BTR_SEARCH
+	     || type == MEM_HEAP_FOR_PAGE_HASH);
+
+	ut_ad(ut_is_2pow(n_sync_obj));
 	table = hash_create(n);
 
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-# ifndef UNIV_HOTBACKUP
-	table->adaptive = TRUE;
-# endif /* !UNIV_HOTBACKUP */
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 	/* Creating MEM_HEAP_BTR_SEARCH type heaps can potentially fail,
 	but in practise it never should in this case, hence the asserts. */
 
-	if (n_mutexes == 0) {
-		table->heap = mem_heap_create_in_btr_search(
-			ut_min(4096, MEM_MAX_ALLOC_IN_BUF));
+	if (n_sync_obj == 0) {
+		table->heap = mem_heap_create_typed(
+			ut_min(4096, MEM_MAX_ALLOC_IN_BUF), type);
 		ut_a(table->heap);
 
 		return(table);
 	}
 
 #ifndef UNIV_HOTBACKUP
-	hash_create_mutexes(table, n_mutexes, mutex_level);
+	if (type == MEM_HEAP_FOR_PAGE_HASH) {
+		/* We create a hash table protected by rw_locks for
+		buf_pool->page_hash. */
+		hash_create_sync_obj(table, HASH_TABLE_SYNC_RW_LOCK,
+				     n_sync_obj, sync_level);
+	} else {
+		hash_create_sync_obj(table, HASH_TABLE_SYNC_MUTEX,
+				     n_sync_obj, sync_level);
+	}
 
-	table->heaps = mem_alloc(n_mutexes * sizeof(void*));
+	table->heaps = static_cast<mem_heap_t**>(
+		mem_alloc(n_sync_obj * sizeof(void*)));
 
-	for (i = 0; i < n_mutexes; i++) {
-		table->heaps[i] = mem_heap_create_in_btr_search(4096);
+	for (i = 0; i < n_sync_obj; i++) {
+		table->heaps[i] = mem_heap_create_typed(4096, type);
 		ut_a(table->heaps[i]);
 	}
 #endif /* !UNIV_HOTBACKUP */
@@ -89,6 +104,65 @@ ha_create_func(
 }
 
 /*************************************************************//**
+Empties a hash table and frees the memory heaps. */
+UNIV_INTERN
+void
+ha_clear(
+/*=====*/
+	hash_table_t*	table)	/*!< in, own: hash table */
+{
+	ulint	i;
+	ulint	n;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!table->adaptive
+	       || rw_lock_own(&btr_search_latch, RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+	/* Free the memory heaps. */
+	n = table->n_sync_obj;
+
+	for (i = 0; i < n; i++) {
+		mem_heap_free(table->heaps[i]);
+	}
+
+	if (table->heaps) {
+		mem_free(table->heaps);
+	}
+
+	switch (table->type) {
+	case HASH_TABLE_SYNC_MUTEX:
+		mem_free(table->sync_obj.mutexes);
+		table->sync_obj.mutexes = NULL;
+		break;
+
+	case HASH_TABLE_SYNC_RW_LOCK:
+		mem_free(table->sync_obj.rw_locks);
+		table->sync_obj.rw_locks = NULL;
+		break;
+
+	case HASH_TABLE_SYNC_NONE:
+		/* do nothing */
+		break;
+	}
+
+	table->n_sync_obj = 0;
+	table->type = HASH_TABLE_SYNC_NONE;
+
+#endif /* !UNIV_HOTBACKUP */
+
+	/* Clear the hash table. */
+	n = hash_get_n_cells(table);
+
+	for (i = 0; i < n; i++) {
+		hash_get_nth_cell(table, i)->node = NULL;
+	}
+}
+
+/*************************************************************//**
 Inserts an entry into a hash table. If an entry with the same fold number
 is found, its node is updated to point to the new data, and no new node
 is inserted. If btr_search_enabled is set to FALSE, we will only allow
@@ -106,7 +180,7 @@ ha_insert_for_fold_func(
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 	buf_block_t*	block,	/*!< in: buffer block containing the data */
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	rec_t*		data)	/*!< in: data, must not be NULL */
+	const rec_t*	data)	/*!< in: data, must not be NULL */
 {
 	hash_cell_t*	cell;
 	ha_node_t*	node;
@@ -119,17 +193,14 @@ ha_insert_for_fold_func(
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 	ut_a(block->frame == page_align(data));
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
-#endif /* UNIV_SYNC_DEBUG */
-	ASSERT_HASH_MUTEX_OWN(table, fold);
+	hash_assert_can_modify(table, fold);
 	ut_ad(btr_search_enabled);
 
 	hash = hash_calc_hash(fold, table);
 
 	cell = hash_get_nth_cell(table, hash);
 
-	prev_node = cell->node;
+	prev_node = static_cast<ha_node_t*>(cell->node);
 
 	while (prev_node != NULL) {
 		if (prev_node->fold == fold) {
@@ -147,7 +218,7 @@ ha_insert_for_fold_func(
 
 			prev_node->block = block;
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-			prev_node->data = data;
+			prev_node->data = (rec_t*) data;
 
 			return(TRUE);
 		}
@@ -157,7 +228,8 @@ ha_insert_for_fold_func(
 
 	/* We have to allocate a new chain node */
 
-	node = mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t));
+	node = static_cast<ha_node_t*>(
+		mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t)));
 
 	if (node == NULL) {
 		/* It was a btr search type memory heap and at the moment
@@ -168,7 +240,7 @@ ha_insert_for_fold_func(
 		return(FALSE);
 	}
 
-	ha_node_set_data(node, block, data);
+	ha_node_set_data(node, block, (rec_t*) data);
 
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 # ifndef UNIV_HOTBACKUP
@@ -182,7 +254,7 @@ ha_insert_for_fold_func(
 
 	node->next = NULL;
 
-	prev_node = cell->node;
+	prev_node = static_cast<ha_node_t*>(cell->node);
 
 	if (prev_node == NULL) {
 
@@ -231,9 +303,10 @@ ha_delete_hash_node(
 
 /*********************************************************//**
 Looks for an element when we know the pointer to the data, and updates
-the pointer to data, if found. */
+the pointer to data, if found.
+@return TRUE if found */
 UNIV_INTERN
-void
+ibool
 ha_search_and_update_if_found_func(
 /*===============================*/
 	hash_table_t*	table,	/*!< in/out: hash table */
@@ -248,7 +321,7 @@ ha_search_and_update_if_found_func(
 
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ASSERT_HASH_MUTEX_OWN(table, fold);
+	hash_assert_can_modify(table, fold);
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 	ut_a(new_block->frame == page_align(new_data));
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
@@ -257,7 +330,7 @@ ha_search_and_update_if_found_func(
 #endif /* UNIV_SYNC_DEBUG */
 
 	if (!btr_search_enabled) {
-		return;
+		return(FALSE);
 	}
 
 	node = ha_search_with_data(table, fold, data);
@@ -275,7 +348,11 @@ ha_search_and_update_if_found_func(
 		node->block = new_block;
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 		node->data = new_data;
+
+		return(TRUE);
 	}
+
+	return(FALSE);
 }
 
 #ifndef UNIV_HOTBACKUP
@@ -294,10 +371,7 @@ ha_remove_all_nodes_to_page(
 
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ASSERT_HASH_MUTEX_OWN(table, fold);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
-#endif /* UNIV_SYNC_DEBUG */
+	hash_assert_can_modify(table, fold);
 	ut_ad(btr_search_enabled);
 
 	node = ha_chain_get_first(table, fold);
@@ -343,8 +417,6 @@ ha_validate(
 	ulint		start_index,	/*!< in: start index */
 	ulint		end_index)	/*!< in: end index */
 {
-	hash_cell_t*	cell;
-	ha_node_t*	node;
 	ibool		ok	= TRUE;
 	ulint		i;
 
@@ -355,12 +427,15 @@ ha_validate(
 	ut_a(end_index < hash_get_n_cells(table));
 
 	for (i = start_index; i <= end_index; i++) {
+		ha_node_t*	node;
+		hash_cell_t*	cell;
 
 		cell = hash_get_nth_cell(table, i);
 
-		node = cell->node;
+		for (node = static_cast<ha_node_t*>(cell->node);
+		     node != 0;
+		     node = node->next) {
 
-		while (node) {
 			if (hash_calc_hash(node->fold, table) != i) {
 				ut_print_timestamp(stderr);
 				fprintf(stderr,
@@ -371,8 +446,6 @@ ha_validate(
 
 				ok = FALSE;
 			}
-
-			node = node->next;
 		}
 	}
 
diff --git a/storage/innobase/ha/ha0storage.c b/storage/innobase/ha/ha0storage.cc
index 698e34f1166..6820591f316 100644
--- a/storage/innobase/ha/ha0storage.c
+++ b/storage/innobase/ha/ha0storage.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file ha/ha0storage.c
+@file ha/ha0storage.cc
 Hash storage.
 Provides a data structure that stores chunks of data in
 its own storage, avoiding duplicates.
@@ -51,7 +51,7 @@ ha_storage_get(
 
 	/* avoid repetitive calls to ut_fold_binary() in the HASH_SEARCH
 	macro */
-	fold = ut_fold_binary(data, data_len);
+	fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
 
 #define IS_FOUND	\
 	node->data_len == data_len && memcmp(node->data, data, data_len) == 0
@@ -128,7 +128,7 @@ ha_storage_put_memlim(
 
 	/* avoid repetitive calls to ut_fold_binary() in the HASH_INSERT
 	macro */
-	fold = ut_fold_binary(data, data_len);
+	fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
 
 	HASH_INSERT(
 		ha_storage_node_t,	/* type used in the hash chain */
diff --git a/storage/innobase/ha/hash0hash.c b/storage/innobase/ha/hash0hash.c
deleted file mode 100644
index 9589da00454..00000000000
--- a/storage/innobase/ha/hash0hash.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file ha/hash0hash.c
-The simple hash table utility
-
-Created 5/20/1997 Heikki Tuuri
-*******************************************************/
-
-#include "hash0hash.h"
-#ifdef UNIV_NONINL
-#include "hash0hash.ic"
-#endif
-
-#include "mem0mem.h"
-
-#ifndef UNIV_HOTBACKUP
-
-# ifdef UNIV_PFS_MUTEX
-UNIV_INTERN mysql_pfs_key_t	hash_table_mutex_key;
-# endif /* UNIV_PFS_MUTEX */
-
-/************************************************************//**
-Reserves the mutex for a fold value in a hash table. */
-UNIV_INTERN
-void
-hash_mutex_enter(
-/*=============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold)	/*!< in: fold */
-{
-	mutex_enter(hash_get_mutex(table, fold));
-}
-
-/************************************************************//**
-Releases the mutex for a fold value in a hash table. */
-UNIV_INTERN
-void
-hash_mutex_exit(
-/*============*/
-	hash_table_t*	table,	/*!< in: hash table */
-	ulint		fold)	/*!< in: fold */
-{
-	mutex_exit(hash_get_mutex(table, fold));
-}
-
-/************************************************************//**
-Reserves all the mutexes of a hash table, in an ascending order. */
-UNIV_INTERN
-void
-hash_mutex_enter_all(
-/*=================*/
-	hash_table_t*	table)	/*!< in: hash table */
-{
-	ulint	i;
-
-	for (i = 0; i < table->n_mutexes; i++) {
-
-		mutex_enter(table->mutexes + i);
-	}
-}
-
-/************************************************************//**
-Releases all the mutexes of a hash table. */
-UNIV_INTERN
-void
-hash_mutex_exit_all(
-/*================*/
-	hash_table_t*	table)	/*!< in: hash table */
-{
-	ulint	i;
-
-	for (i = 0; i < table->n_mutexes; i++) {
-
-		mutex_exit(table->mutexes + i);
-	}
-}
-#endif /* !UNIV_HOTBACKUP */
-
-/*************************************************************//**
-Creates a hash table with >= n array cells. The actual number of cells is
-chosen to be a prime number slightly bigger than n.
-@return	own: created table */
-UNIV_INTERN
-hash_table_t*
-hash_create(
-/*========*/
-	ulint	n)	/*!< in: number of array cells */
-{
-	hash_cell_t*	array;
-	ulint		prime;
-	hash_table_t*	table;
-
-	prime = ut_find_prime(n);
-
-	table = mem_alloc(sizeof(hash_table_t));
-
-	array = ut_malloc(sizeof(hash_cell_t) * prime);
-
-	table->array = array;
-	table->n_cells = prime;
-#ifndef UNIV_HOTBACKUP
-# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
-	table->adaptive = FALSE;
-# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	table->n_mutexes = 0;
-	table->mutexes = NULL;
-	table->heaps = NULL;
-#endif /* !UNIV_HOTBACKUP */
-	table->heap = NULL;
-	ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
-
-	/* Initialize the cell array */
-	hash_table_clear(table);
-
-	return(table);
-}
-
-/*************************************************************//**
-Frees a hash table. */
-UNIV_INTERN
-void
-hash_table_free(
-/*============*/
-	hash_table_t*	table)	/*!< in, own: hash table */
-{
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-#ifndef UNIV_HOTBACKUP
-	ut_a(table->mutexes == NULL);
-#endif /* !UNIV_HOTBACKUP */
-
-	ut_free(table->array);
-	mem_free(table);
-}
-
-#ifndef UNIV_HOTBACKUP
-/*************************************************************//**
-Creates a mutex array to protect a hash table. */
-UNIV_INTERN
-void
-hash_create_mutexes_func(
-/*=====================*/
-	hash_table_t*	table,		/*!< in: hash table */
-#ifdef UNIV_SYNC_DEBUG
-	ulint		sync_level,	/*!< in: latching order level of the
-					mutexes: used in the debug version */
-#endif /* UNIV_SYNC_DEBUG */
-	ulint		n_mutexes)	/*!< in: number of mutexes, must be a
-					power of 2 */
-{
-	ulint	i;
-
-	ut_ad(table);
-	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_a(n_mutexes > 0);
-	ut_a(ut_is_2pow(n_mutexes));
-
-	table->mutexes = mem_alloc(n_mutexes * sizeof(mutex_t));
-
-	for (i = 0; i < n_mutexes; i++) {
-		mutex_create(hash_table_mutex_key,
-			     table->mutexes + i, sync_level);
-	}
-
-	table->n_mutexes = n_mutexes;
-}
-#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/ha/hash0hash.cc b/storage/innobase/ha/hash0hash.cc
new file mode 100644
index 00000000000..99128a676d5
--- /dev/null
+++ b/storage/innobase/ha/hash0hash.cc
@@ -0,0 +1,403 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ha/hash0hash.cc
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "hash0hash.h"
+#ifdef UNIV_NONINL
+#include "hash0hash.ic"
+#endif
+
+#include "mem0mem.h"
+
+#ifndef UNIV_HOTBACKUP
+
+# ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t	hash_table_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t	hash_table_rw_lock_key;
+# endif /* UNIV_PFS_RWLOCK */
+/************************************************************//**
+Reserves the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_enter(
+/*=============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	mutex_enter(hash_get_mutex(table, fold));
+}
+
+/************************************************************//**
+Releases the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit(
+/*============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	mutex_exit(hash_get_mutex(table, fold));
+}
+
+/************************************************************//**
+Reserves all the mutexes of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_mutex_enter_all(
+/*=================*/
+	hash_table_t*	table)	/*!< in: hash table */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		mutex_enter(table->sync_obj.mutexes + i);
+	}
+}
+
+/************************************************************//**
+Releases all the mutexes of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all(
+/*================*/
+	hash_table_t*	table)	/*!< in: hash table */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		mutex_exit(table->sync_obj.mutexes + i);
+	}
+}
+
+/************************************************************//**
+Releases all but the passed in mutex of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all_but(
+/*====================*/
+	hash_table_t*	table,		/*!< in: hash table */
+	mutex_t*	keep_mutex)	/*!< in: mutex to keep */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		mutex_t* mutex = table->sync_obj.mutexes + i;
+		if (UNIV_LIKELY(keep_mutex != mutex)) {
+			mutex_exit(mutex);
+		}
+	}
+
+	ut_ad(mutex_own(keep_mutex));
+}
+
+/************************************************************//**
+s-lock a lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_s(
+/*========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+
+	rw_lock_t* lock = hash_get_lock(table, fold);
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rw_lock_s_lock(lock);
+}
+
+/************************************************************//**
+x-lock a lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_x(
+/*========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+
+	rw_lock_t* lock = hash_get_lock(table, fold);
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rw_lock_x_lock(lock);
+}
+
+/************************************************************//**
+unlock an s-lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_s(
+/*==========*/
+
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+
+	rw_lock_t* lock = hash_get_lock(table, fold);
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rw_lock_s_unlock(lock);
+}
+
+/************************************************************//**
+unlock x-lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_x(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+	rw_lock_t* lock = hash_get_lock(table, fold);
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rw_lock_x_unlock(lock);
+}
+
+/************************************************************//**
+Reserves all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_lock_x_all(
+/*============*/
+	hash_table_t*	table)	/*!< in: hash table */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		rw_lock_t* lock = table->sync_obj.rw_locks + i;
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED));
+		ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+		rw_lock_x_lock(lock);
+	}
+}
+
+/************************************************************//**
+Releases all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_unlock_x_all(
+/*==============*/
+	hash_table_t*	table)	/*!< in: hash table */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		rw_lock_t* lock = table->sync_obj.rw_locks + i;
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+		rw_lock_x_unlock(lock);
+	}
+}
+
+/************************************************************//**
+Releases all but passed in lock of a hash table, */
+UNIV_INTERN
+void
+hash_unlock_x_all_but(
+/*==================*/
+	hash_table_t*	table,		/*!< in: hash table */
+	rw_lock_t*	keep_lock)	/*!< in: lock to keep */
+{
+	ulint	i;
+
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	for (i = 0; i < table->n_sync_obj; i++) {
+
+		rw_lock_t* lock = table->sync_obj.rw_locks + i;
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+		if (UNIV_LIKELY(keep_lock != lock)) {
+			rw_lock_x_unlock(lock);
+		}
+	}
+}
+
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Creates a hash table with >= n array cells. The actual number of cells is
+chosen to be a prime number slightly bigger than n.
+@return	own: created table */
+UNIV_INTERN
+hash_table_t*
+hash_create(
+/*========*/
+	ulint	n)	/*!< in: number of array cells */
+{
+	hash_cell_t*	array;
+	ulint		prime;
+	hash_table_t*	table;
+
+	prime = ut_find_prime(n);
+
+	table = static_cast<hash_table_t*>(mem_alloc(sizeof(hash_table_t)));
+
+	array = static_cast<hash_cell_t*>(
+		ut_malloc(sizeof(hash_cell_t) * prime));
+
+	/* The default type of hash_table is HASH_TABLE_SYNC_NONE i.e.:
+	the caller is responsible for access control to the table. */
+	table->type = HASH_TABLE_SYNC_NONE;
+	table->array = array;
+	table->n_cells = prime;
+#ifndef UNIV_HOTBACKUP
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	table->adaptive = FALSE;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	table->n_sync_obj = 0;
+	table->sync_obj.mutexes = NULL;
+	table->heaps = NULL;
+#endif /* !UNIV_HOTBACKUP */
+	table->heap = NULL;
+	ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
+
+	/* Initialize the cell array */
+	hash_table_clear(table);
+
+	return(table);
+}
+
+/*************************************************************//**
+Frees a hash table. */
+UNIV_INTERN
+void
+hash_table_free(
+/*============*/
+	hash_table_t*	table)	/*!< in, own: hash table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+	ut_free(table->array);
+	mem_free(table);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Creates a sync object array to protect a hash table.
+::sync_obj can be mutexes or rw_locks depening on the type of
+hash table. */
+UNIV_INTERN
+void
+hash_create_sync_obj_func(
+/*======================*/
+	hash_table_t*		table,	/*!< in: hash table */
+	enum hash_table_sync_t	type,	/*!< in: HASH_TABLE_SYNC_MUTEX
+					or HASH_TABLE_SYNC_RW_LOCK */
+#ifdef UNIV_SYNC_DEBUG
+	ulint			sync_level,/*!< in: latching order level
+					of the mutexes: used in the
+					debug version */
+#endif /* UNIV_SYNC_DEBUG */
+	ulint			n_sync_obj)/*!< in: number of sync objects,
+					must be a power of 2 */
+{
+	ulint	i;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ut_a(n_sync_obj > 0);
+	ut_a(ut_is_2pow(n_sync_obj));
+
+	table->type = type;
+
+	switch (type) {
+	case HASH_TABLE_SYNC_MUTEX:
+		table->sync_obj.mutexes = static_cast<mutex_t*>(
+			mem_alloc(n_sync_obj * sizeof(mutex_t)));
+
+		for (i = 0; i < n_sync_obj; i++) {
+			mutex_create(hash_table_mutex_key,
+			     table->sync_obj.mutexes + i, sync_level);
+		}
+
+		break;
+
+	case HASH_TABLE_SYNC_RW_LOCK:
+		table->sync_obj.rw_locks = static_cast<rw_lock_t*>(
+			mem_alloc(n_sync_obj * sizeof(rw_lock_t)));
+
+		for (i = 0; i < n_sync_obj; i++) {
+			rw_lock_create(hash_table_rw_lock_key,
+			     table->sync_obj.rw_locks + i, sync_level);
+		}
+
+		break;
+
+	case HASH_TABLE_SYNC_NONE:
+		ut_error;
+	}
+
+	table->n_sync_obj = n_sync_obj;
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index b97869e74ee..df655bc6f42 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
 Copyright (c) 2008, 2009 Google Inc.
 Copyright (c) 2009, Percona Inc.
 
@@ -31,37 +31,25 @@ this program; if not, write to the Free Software Foundation, Inc.,
 
 *****************************************************************************/
 
-/* TODO list for the InnoDB handler in 5.0:
-  - fix savepoint functions to use savepoint storage area
-  - Find out what kind of problems the OS X case-insensitivity causes to
-    table and database names; should we 'normalize' the names like we do
-    in Windows?
-*/
-
-#ifdef USE_PRAGMA_IMPLEMENTATION
-#pragma implementation				// gcc: Class implementation
-#endif
-
 #include <sql_table.h>	// explain_filename, nz2, EXPLAIN_PARTITIONS_AS_COMMENT,
 			// EXPLAIN_FILENAME_MAX_EXTRA_LENGTH
 
 #include <sql_acl.h>	// PROCESS_ACL
-#include <m_ctype.h>
+#include <debug_sync.h> // DEBUG_SYNC
 #include <mysys_err.h>
-#include <mysql/plugin.h>
 #include <innodb_priv.h>
-#include <mysql/psi/psi.h>
-#include <my_sys.h>
-
 #ifdef _WIN32
 #include <io.h>
 #endif
+
 /** @file ha_innodb.cc */
 
 /* Include necessary InnoDB headers */
-extern "C" {
 #include "univ.i"
+#include "buf0dump.h"
 #include "buf0lru.h"
+#include "buf0flu.h"
+#include "buf0dblwr.h"
 #include "btr0sea.h"
 #include "os0file.h"
 #include "os0thread.h"
@@ -86,10 +74,15 @@ extern "C" {
 #include "trx0xa.h"
 #include "row0merge.h"
 #include "dict0boot.h"
+#include "dict0stats.h"
 #include "ha_prototypes.h"
 #include "ut0mem.h"
 #include "ibuf0ibuf.h"
-}
+#include "dict0dict.h"
+#include "srv0mon.h"
+#include "pars0pars.h"
+#include "fts0fts.h"
+#include "fts0types.h"
 
 #include "ha_innodb.h"
 #include "i_s.h"
@@ -109,23 +102,22 @@ static bool innodb_inited = 0;
 
 #define INSIDE_HA_INNOBASE_CC
 
-/* In the Windows plugin, the return value of current_thd is
-undefined.  Map it to NULL. */
-
 #define EQ_CURRENT_THD(thd) ((thd) == current_thd)
 
-
 static struct handlerton* innodb_hton_ptr;
 
 static const long AUTOINC_OLD_STYLE_LOCKING = 0;
 static const long AUTOINC_NEW_STYLE_LOCKING = 1;
 static const long AUTOINC_NO_LOCKING = 2;
 
-static long innobase_mirrored_log_groups, innobase_log_files_in_group,
-	innobase_log_buffer_size,
-	innobase_additional_mem_pool_size, innobase_file_io_threads,
-	innobase_force_recovery, innobase_open_files,
-	innobase_autoinc_lock_mode;
+static long innobase_mirrored_log_groups;
+static long innobase_log_files_in_group;
+static long innobase_log_buffer_size;
+static long innobase_additional_mem_pool_size;
+static long innobase_file_io_threads;
+static long innobase_force_recovery;
+static long innobase_open_files;
+static long innobase_autoinc_lock_mode;
 static ulong innobase_commit_concurrency = 0;
 static ulong innobase_read_io_threads;
 static ulong innobase_write_io_threads;
@@ -137,6 +129,10 @@ static long long innobase_buffer_pool_size, innobase_log_file_size;
 Connected to buf_LRU_old_ratio. */
 static uint innobase_old_blocks_pct;
 
+/** Maximum on-disk size of change buffer in terms of percentage
+of the buffer pool. */
+static uint innobase_change_buffer_max_size = CHANGE_BUFFER_DEFAULT_SIZE;
+
 /* The default values for the following char* start-up parameters
 are determined in innobase_init below: */
 
@@ -145,6 +141,10 @@ static char*	innobase_data_file_path			= NULL;
 static char*	innobase_log_group_home_dir		= NULL;
 static char*	innobase_file_format_name		= NULL;
 static char*	innobase_change_buffering		= NULL;
+static char*	innobase_enable_monitor_counter		= NULL;
+static char*	innobase_disable_monitor_counter	= NULL;
+static char*	innobase_reset_monitor_counter		= NULL;
+static char*	innobase_reset_all_monitor_counter	= NULL;
 
 /* The highest file format being used in the database. The value can be
 set by user, however, it will be adjusted to the newer file format if
@@ -153,6 +153,10 @@ static char*	innobase_file_format_max		= NULL;
 
 static char*	innobase_file_flush_method		= NULL;
 
+/* This variable can be set in the server configure file, specifying
+stopword table to be used */
+static char*	innobase_server_stopword_table		= NULL;
+
 /* Below we have boolean-valued start-up parameters, and their default
 values */
 
@@ -169,12 +173,15 @@ static my_bool	innobase_rollback_on_timeout		= FALSE;
 static my_bool	innobase_create_status_file		= FALSE;
 static my_bool	innobase_stats_on_metadata		= TRUE;
 static my_bool	innobase_large_prefix			= FALSE;
+static my_bool	innodb_optimize_fulltext_only		= FALSE;
 
 
 static char*	internal_innobase_data_file_path	= NULL;
 
 static char*	innodb_version_str = (char*) INNODB_VERSION_STR;
 
+static char*	fts_server_stopword_table		= NULL;
+
 /** Possible values for system variable "innodb_stats_method". The values
 are defined the same as its corresponding MyISAM system variable
 "myisam_stats_method"(see "myisam_stats_method_names"), for better usability */
@@ -194,6 +201,26 @@ static TYPELIB innodb_stats_method_typelib = {
 	NULL
 };
 
+/** Possible values for system variable "innodb_checksum_algorithm". */
+static const char* innodb_checksum_algorithm_names[] = {
+	"crc32",
+	"strict_crc32",
+	"innodb",
+	"strict_innodb",
+	"none",
+	"strict_none",
+	NullS
+};
+
+/** Used to define an enumerate type of the system variable
+innodb_checksum_algorithm. */
+static TYPELIB innodb_checksum_algorithm_typelib = {
+	array_elements(innodb_checksum_algorithm_names) - 1,
+	"innodb_checksum_algorithm_typelib",
+	innodb_checksum_algorithm_names,
+	NULL
+};
+
 /* The following counter is used to convey information to InnoDB
 about server activity: in selects it is not sensible to call
 srv_active_wake_master_thread after each fetch or search, we only do
@@ -214,6 +241,14 @@ static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = {
 	"all"		/* IBUF_USE_ALL */
 };
 
+/* Call back function array defined by MySQL and used to
+retrieve FTS results. */
+const struct _ft_vft ft_vft_result = {NULL,
+				      innobase_fts_find_ranking,
+				      innobase_fts_close_ranking,
+				      innobase_fts_retrieve_ranking,
+				      NULL};
+
 #ifdef HAVE_PSI_INTERFACE
 /* Keys to register pthread mutexes/cond in the current file with
 performance schema */
@@ -223,9 +258,9 @@ static mysql_pfs_key_t	commit_cond_mutex_key;
 static mysql_pfs_key_t	commit_cond_key;
 
 static PSI_mutex_info	all_pthread_mutexes[] = {
-        {&commit_threads_m_key, "commit_threads_m", 0},
-        {&commit_cond_mutex_key, "commit_cond_mutex", 0},
-        {&innobase_share_mutex_key, "innobase_share_mutex", 0}
+	{&commit_threads_m_key, "commit_threads_m", 0},
+	{&commit_cond_mutex_key, "commit_cond_mutex", 0},
+	{&innobase_share_mutex_key, "innobase_share_mutex", 0}
 };
 
 static PSI_cond_info	all_innodb_conds[] = {
@@ -238,7 +273,6 @@ performance schema instrumented if "UNIV_PFS_MUTEX"
 is defined */
 static PSI_mutex_info all_innodb_mutexes[] = {
 	{&autoinc_mutex_key, "autoinc_mutex", 0},
-	{&btr_search_enabled_mutex_key, "btr_search_enabled_mutex", 0},
 #  ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
 	{&buffer_block_mutex_key, "buffer_block_mutex", 0},
 #  endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
@@ -250,13 +284,19 @@ static PSI_mutex_info all_innodb_mutexes[] = {
 	{&file_format_max_mutex_key, "file_format_max_mutex", 0},
 	{&fil_system_mutex_key, "fil_system_mutex", 0},
 	{&flush_list_mutex_key, "flush_list_mutex", 0},
+	{&fts_bg_threads_mutex_key, "fts_bg_threads_mutex", 0},
+	{&fts_delete_mutex_key, "fts_delete_mutex", 0},
+	{&fts_optimize_mutex_key, "fts_optimize_mutex", 0},
+	{&fts_doc_id_mutex_key, "fts_doc_id_mutex", 0},
 	{&log_flush_order_mutex_key, "log_flush_order_mutex", 0},
 	{&hash_table_mutex_key, "hash_table_mutex", 0},
 	{&ibuf_bitmap_mutex_key, "ibuf_bitmap_mutex", 0},
 	{&ibuf_mutex_key, "ibuf_mutex", 0},
 	{&ibuf_pessimistic_insert_mutex_key,
 		 "ibuf_pessimistic_insert_mutex", 0},
-	{&kernel_mutex_key, "kernel_mutex", 0},
+#  ifndef HAVE_ATOMIC_BUILTINS
+	{&server_mutex_key, "server_mutex", 0},
+#  endif /* !HAVE_ATOMIC_BUILTINS */
 	{&log_sys_mutex_key, "log_sys_mutex", 0},
 #  ifdef UNIV_MEM_DEBUG
 	{&mem_hash_mutex_key, "mem_hash_mutex", 0},
@@ -275,12 +315,26 @@ static PSI_mutex_info all_innodb_mutexes[] = {
 	{&srv_innodb_monitor_mutex_key, "srv_innodb_monitor_mutex", 0},
 	{&srv_misc_tmpfile_mutex_key, "srv_misc_tmpfile_mutex", 0},
 	{&srv_monitor_file_mutex_key, "srv_monitor_file_mutex", 0},
-	{&syn_arr_mutex_key, "syn_arr_mutex", 0},
 #  ifdef UNIV_SYNC_DEBUG
 	{&sync_thread_mutex_key, "sync_thread_mutex", 0},
 #  endif /* UNIV_SYNC_DEBUG */
-	{&trx_doublewrite_mutex_key, "trx_doublewrite_mutex", 0},
-	{&trx_undo_mutex_key, "trx_undo_mutex", 0}
+	{&buf_dblwr_mutex_key, "buf_dblwr_mutex", 0},
+	{&trx_undo_mutex_key, "trx_undo_mutex", 0},
+	{&srv_sys_mutex_key, "srv_sys_mutex", 0},
+	{&lock_sys_mutex_key, "lock_mutex", 0},
+	{&lock_sys_wait_mutex_key, "lock_wait_mutex", 0},
+	{&trx_mutex_key, "trx_mutex", 0},
+	{&srv_sys_tasks_mutex_key, "srv_threads_mutex", 0},
+	/* mutex with os_fast_mutex_ interfaces */
+#  ifndef PFS_SKIP_EVENT_MUTEX
+	{&event_os_mutex_key, "event_os_mutex", 0},
+#  endif /* PFS_SKIP_EVENT_MUTEX */
+	{&os_mutex_key, "os_mutex", 0},
+#ifndef HAVE_ATOMIC_BUILTINS
+	{&srv_conc_mutex_key, "srv_conc_mutex", 0},
+#endif /* !HAVE_ATOMIC_BUILTINS */
+	{&ut_list_mutex_key, "ut_list_mutex", 0},
+	{&trx_sys_mutex_key, "trx_sys_mutex", 0},
 };
 # endif /* UNIV_PFS_MUTEX */
 
@@ -302,10 +356,13 @@ static PSI_rwlock_info all_innodb_rwlocks[] = {
 	{&dict_operation_lock_key, "dict_operation_lock", 0},
 	{&fil_space_latch_key, "fil_space_latch", 0},
 	{&checkpoint_lock_key, "checkpoint_lock", 0},
+	{&fts_cache_rw_lock_key, "fts_cache_rw_lock", 0},
+	{&fts_cache_init_rw_lock_key, "fts_cache_init_rw_lock", 0},
 	{&trx_i_s_cache_lock_key, "trx_i_s_cache_lock", 0},
 	{&trx_purge_latch_key, "trx_purge_latch", 0},
 	{&index_tree_rw_lock_key, "index_tree_rw_lock", 0},
-	{&dict_table_stats_latch_key, "dict_table_stats", 0}
+	{&dict_table_stats_latch_key, "dict_table_stats", 0},
+	{&hash_table_rw_lock_key, "hash table locks", 0}
 };
 # endif /* UNIV_PFS_RWLOCK */
 
@@ -320,7 +377,8 @@ static PSI_thread_info	all_innodb_threads[] = {
 	{&srv_error_monitor_thread_key, "srv_error_monitor_thread", 0},
 	{&srv_monitor_thread_key, "srv_monitor_thread", 0},
 	{&srv_master_thread_key, "srv_master_thread", 0},
-	{&srv_purge_thread_key, "srv_purge_thread", 0}
+	{&srv_purge_thread_key, "srv_purge_thread", 0},
+	{&buf_page_cleaner_thread_key, "page_cleaner_thread", 0}
 };
 # endif /* UNIV_PFS_THREAD */
 
@@ -335,25 +393,301 @@ static PSI_file_info	all_innodb_files[] = {
 # endif /* UNIV_PFS_IO */
 #endif /* HAVE_PSI_INTERFACE */
 
-static INNOBASE_SHARE *get_share(const char *table_name);
-static void free_share(INNOBASE_SHARE *share);
-static int innobase_close_connection(handlerton *hton, THD* thd);
-static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
-static int innobase_commit(handlerton *hton, THD* thd, bool all);
-static int innobase_rollback(handlerton *hton, THD* thd, bool all);
-static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd,
-           void *savepoint);
-static int innobase_savepoint(handlerton *hton, THD* thd, void *savepoint);
-static int innobase_release_savepoint(handlerton *hton, THD* thd,
-           void *savepoint);
-static handler *innobase_create_handler(handlerton *hton,
-                                        TABLE_SHARE *table,
-                                        MEM_ROOT *mem_root);
-
-/* "GEN_CLUST_INDEX" is the name reserved for Innodb default
+/*************************************************************//**
+Check whether valid argument given to innodb_ft_*_stopword_table.
+This function is registered as a callback with MySQL.
+@return 0 for valid stopword table */
+static
+int
+innodb_stopword_table_validate(
+/*===========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value);	/*!< in: incoming string */
+/****************************************************************//**
+Update the session variable innodb_session_stopword_table
+with the "saved" stopword table name value. This function
+is registered as a callback with MySQL. */
+static
+void
+innodb_session_stopword_update(
+/*===========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save);	/*!< in: immediate result
+						from check function */
+/** "GEN_CLUST_INDEX" is the name reserved for Innodb default
 system primary index. */
 static const char innobase_index_reserve_name[]= "GEN_CLUST_INDEX";
 
+static const char innobase_hton_name[]= "InnoDB";
+
+static MYSQL_THDVAR_BOOL(support_xa, PLUGIN_VAR_OPCMDARG,
+  "Enable InnoDB support for the XA two-phase commit",
+  /* check_func */ NULL, /* update_func */ NULL,
+  /* default */ TRUE);
+
+static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG,
+  "Enable InnoDB locking in LOCK TABLES",
+  /* check_func */ NULL, /* update_func */ NULL,
+  /* default */ TRUE);
+
+static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG,
+  "Use strict mode when evaluating create options.",
+  NULL, NULL, FALSE);
+
+static MYSQL_THDVAR_BOOL(ft_enable_stopword, PLUGIN_VAR_OPCMDARG,
+  "Create FTS index with stopword.",
+  NULL, NULL,
+  /* default */ TRUE);
+
+static MYSQL_THDVAR_BOOL(analyze_is_persistent, PLUGIN_VAR_OPCMDARG,
+  "ANALYZE TABLE in InnoDB uses a more precise (and slow) sampling "
+  "algorithm and saves the results persistently.",
+  /* check_func */ NULL, /* update_func */ NULL,
+  /* default */ FALSE);
+
+static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
+  "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
+  NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
+
+static MYSQL_THDVAR_STR(ft_user_stopword_table, PLUGIN_VAR_OPCMDARG,
+  "User supplied stopword table name, effective in the session level.",
+  innodb_stopword_table_validate, innodb_session_stopword_update, NULL);
+
+static SHOW_VAR innodb_status_variables[]= {
+  {"buffer_pool_dump_status",
+  (char*) &export_vars.innodb_buffer_pool_dump_status,	  SHOW_CHAR},
+  {"buffer_pool_load_status",
+  (char*) &export_vars.innodb_buffer_pool_load_status,	  SHOW_CHAR},
+  {"buffer_pool_pages_data",
+  (char*) &export_vars.innodb_buffer_pool_pages_data,	  SHOW_LONG},
+  {"buffer_pool_pages_dirty",
+  (char*) &export_vars.innodb_buffer_pool_pages_dirty,	  SHOW_LONG},
+  {"buffer_pool_pages_flushed",
+  (char*) &export_vars.innodb_buffer_pool_pages_flushed,  SHOW_LONG},
+  {"buffer_pool_pages_free",
+  (char*) &export_vars.innodb_buffer_pool_pages_free,	  SHOW_LONG},
+#ifdef UNIV_DEBUG
+  {"buffer_pool_pages_latched",
+  (char*) &export_vars.innodb_buffer_pool_pages_latched,  SHOW_LONG},
+#endif /* UNIV_DEBUG */
+  {"buffer_pool_pages_misc",
+  (char*) &export_vars.innodb_buffer_pool_pages_misc,	  SHOW_LONG},
+  {"buffer_pool_pages_total",
+  (char*) &export_vars.innodb_buffer_pool_pages_total,	  SHOW_LONG},
+  {"buffer_pool_read_ahead_rnd",
+  (char*) &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_LONG},
+  {"buffer_pool_read_ahead",
+  (char*) &export_vars.innodb_buffer_pool_read_ahead,	  SHOW_LONG},
+  {"buffer_pool_read_ahead_evicted",
+  (char*) &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_LONG},
+  {"buffer_pool_read_requests",
+  (char*) &export_vars.innodb_buffer_pool_read_requests,  SHOW_LONG},
+  {"buffer_pool_reads",
+  (char*) &export_vars.innodb_buffer_pool_reads,	  SHOW_LONG},
+  {"buffer_pool_wait_free",
+  (char*) &export_vars.innodb_buffer_pool_wait_free,	  SHOW_LONG},
+  {"buffer_pool_write_requests",
+  (char*) &export_vars.innodb_buffer_pool_write_requests, SHOW_LONG},
+  {"data_fsyncs",
+  (char*) &export_vars.innodb_data_fsyncs,		  SHOW_LONG},
+  {"data_pending_fsyncs",
+  (char*) &export_vars.innodb_data_pending_fsyncs,	  SHOW_LONG},
+  {"data_pending_reads",
+  (char*) &export_vars.innodb_data_pending_reads,	  SHOW_LONG},
+  {"data_pending_writes",
+  (char*) &export_vars.innodb_data_pending_writes,	  SHOW_LONG},
+  {"data_read",
+  (char*) &export_vars.innodb_data_read,		  SHOW_LONG},
+  {"data_reads",
+  (char*) &export_vars.innodb_data_reads,		  SHOW_LONG},
+  {"data_writes",
+  (char*) &export_vars.innodb_data_writes,		  SHOW_LONG},
+  {"data_written",
+  (char*) &export_vars.innodb_data_written,		  SHOW_LONG},
+  {"dblwr_pages_written",
+  (char*) &export_vars.innodb_dblwr_pages_written,	  SHOW_LONG},
+  {"dblwr_writes",
+  (char*) &export_vars.innodb_dblwr_writes,		  SHOW_LONG},
+  {"have_atomic_builtins",
+  (char*) &export_vars.innodb_have_atomic_builtins,	  SHOW_BOOL},
+  {"log_waits",
+  (char*) &export_vars.innodb_log_waits,		  SHOW_LONG},
+  {"log_write_requests",
+  (char*) &export_vars.innodb_log_write_requests,	  SHOW_LONG},
+  {"log_writes",
+  (char*) &export_vars.innodb_log_writes,		  SHOW_LONG},
+  {"os_log_fsyncs",
+  (char*) &export_vars.innodb_os_log_fsyncs,		  SHOW_LONG},
+  {"os_log_pending_fsyncs",
+  (char*) &export_vars.innodb_os_log_pending_fsyncs,	  SHOW_LONG},
+  {"os_log_pending_writes",
+  (char*) &export_vars.innodb_os_log_pending_writes,	  SHOW_LONG},
+  {"os_log_written",
+  (char*) &export_vars.innodb_os_log_written,		  SHOW_LONGLONG},
+  {"page_size",
+  (char*) &export_vars.innodb_page_size,		  SHOW_LONG},
+  {"pages_created",
+  (char*) &export_vars.innodb_pages_created,		  SHOW_LONG},
+  {"pages_read",
+  (char*) &export_vars.innodb_pages_read,		  SHOW_LONG},
+  {"pages_written",
+  (char*) &export_vars.innodb_pages_written,		  SHOW_LONG},
+  {"row_lock_current_waits",
+  (char*) &export_vars.innodb_row_lock_current_waits,	  SHOW_LONG},
+  {"row_lock_time",
+  (char*) &export_vars.innodb_row_lock_time,		  SHOW_LONGLONG},
+  {"row_lock_time_avg",
+  (char*) &export_vars.innodb_row_lock_time_avg,	  SHOW_LONG},
+  {"row_lock_time_max",
+  (char*) &export_vars.innodb_row_lock_time_max,	  SHOW_LONG},
+  {"row_lock_waits",
+  (char*) &export_vars.innodb_row_lock_waits,		  SHOW_LONG},
+  {"rows_deleted",
+  (char*) &export_vars.innodb_rows_deleted,		  SHOW_LONG},
+  {"rows_inserted",
+  (char*) &export_vars.innodb_rows_inserted,		  SHOW_LONG},
+  {"rows_read",
+  (char*) &export_vars.innodb_rows_read,		  SHOW_LONG},
+  {"rows_updated",
+  (char*) &export_vars.innodb_rows_updated,		  SHOW_LONG},
+  {"num_open_files",
+  (char*) &export_vars.innodb_num_open_files,		  SHOW_LONG},
+  {"truncated_status_writes",
+  (char*) &export_vars.innodb_truncated_status_writes,	  SHOW_LONG},
+  {"available_undo_logs",
+  (char*) &export_vars.innodb_available_undo_logs,        SHOW_LONG},
+  {NullS, NullS, SHOW_LONG}
+};
+
+/************************************************************************//**
+Handling the shared INNOBASE_SHARE structure that is needed to provide table
+locking. Register the table name if it doesn't exist in the hash table. */
+static
+INNOBASE_SHARE*
+get_share(
+/*======*/
+	const char*	table_name);	/*!< in: table to lookup */
+
+/************************************************************************//**
+Free the shared object that was registered with get_share(). */
+static
+void
+free_share(
+/*=======*/
+	INNOBASE_SHARE*	share);		/*!< in/own: share to free */
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return	0 or error number */
+static
+int
+innobase_close_connection(
+/*======================*/
+	handlerton*	hton,		/*!< in/out: Innodb handlerton */
+	THD*		thd);		/*!< in: MySQL thread handle for
+					which to close the connection */
+
+static
+void
+innobase_commit_ordered(
+/*======================*/
+        handlerton *hton,               /*!< in/out: Innodb handlerton */
+        THD* thd,                       /*!< in: MySQL thread handle */
+        bool all);			/*!< in: TRUE - commit transaction
+                                             FALSE - the current SQL statement
+                                             ended */
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended.
+@return	0 */
+static
+int
+innobase_commit(
+/*============*/
+	handlerton*	hton,		/*!< in/out: Innodb handlerton */
+	THD*		thd,		/*!< in: MySQL thread handle of the
+					user for whom the transaction should
+					be committed */
+	bool		commit_trx);	/*!< in: true - commit transaction
+					false - the current SQL statement
+					ended */
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback(
+/*==============*/
+	handlerton*	hton,		/*!< in/out: Innodb handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread
+					of the user whose transaction should
+					be rolled back */
+	bool		rollback_trx);	/*!< in: TRUE - rollback entire
+					transaction FALSE - rollback the current
+					statement only */
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+	handlerton*	hton,		/*!< in/out: InnoDB handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread of
+					the user whose XA transaction should
+					be rolled back to savepoint */
+	void*		savepoint);	/*!< in: savepoint data */
+
+/*****************************************************************//**
+Sets a transaction savepoint.
+@return	always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+	handlerton*	hton,		/*!< in/out: InnoDB handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread of
+					the user's XA transaction for which
+					we need to take a savepoint */
+	void*		savepoint);	/*!< in: savepoint data */
+
+/*****************************************************************//**
+Release transaction savepoint name.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_release_savepoint(
+/*=======================*/
+	handlerton*	hton,		/*!< in/out: handlerton for Innodb */
+	THD*		thd,		/*!< in: handle to the MySQL thread
+					of the user whose transaction's
+					savepoint should be released */
+	void*		savepoint);	/*!< in: savepoint data */
+
+/************************************************************************//**
+Function for constructing an InnoDB table handler instance. */
+static
+handler*
+innobase_create_handler(
+/*====================*/
+	handlerton*	hton,		/*!< in/out: handlerton for Innodb */
+	TABLE_SHARE*	table,
+	MEM_ROOT*	mem_root);
+
 /** @brief Initialize the default value of innodb_commit_concurrency.
 
 Once InnoDB is running, the innodb_commit_concurrency must not change
@@ -365,8 +699,18 @@ to 0, even if it was initially set to nonzero at the command line
 or configuration file. */
 static
 void
-innobase_commit_concurrency_init_default(void);
-/*==========================================*/
+innobase_commit_concurrency_init_default();
+/*=======================================*/
+
+/** @brief Initialize the default and max value of innodb_undo_logs.
+
+Once InnoDB is running, the default value and the max value of
+innodb_undo_logs must be equal to the available undo logs,
+given by srv_available_undo_logs. */
+static
+void
+innobase_undo_logs_init_default_max();
+/*==================================*/
 
 /************************************************************//**
 Validate the file format name and return its corresponding id.
@@ -375,8 +719,8 @@ static
 uint
 innobase_file_format_name_lookup(
 /*=============================*/
-	const char*	format_name);		/*!< in: pointer to file format
-						name */
+	const char*	format_name);	/*!< in: pointer to file format
+					name */
 /************************************************************//**
 Validate the file format check config parameters, as a side effect it
 sets the srv_max_file_format_at_startup variable.
@@ -385,7 +729,7 @@ static
 int
 innobase_file_format_validate_and_set(
 /*==================================*/
-	const char*	format_max);		/*!< in: parameter value */
+	const char*	format_max);	/*!< in: parameter value */
 /****************************************************************//**
 Return alter table flags supported in an InnoDB database. */
 static
@@ -394,66 +738,6 @@ innobase_alter_table_flags(
 /*=======================*/
 	uint	flags);
 
-static const char innobase_hton_name[]= "InnoDB";
-
-/*************************************************************//**
-Check for a valid value of innobase_commit_concurrency.
-@return	0 for valid innodb_commit_concurrency */
-static
-int
-innobase_commit_concurrency_validate(
-/*=================================*/
-	THD*				thd,	/*!< in: thread handle */
-	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
-						variable */
-	void*				save,	/*!< out: immediate result
-						for update function */
-	struct st_mysql_value*		value)	/*!< in: incoming string */
-{
-	long long	intbuf;
-	ulong		commit_concurrency;
-
-	DBUG_ENTER("innobase_commit_concurrency_validate");
-
-	if (value->val_int(value, &intbuf)) {
-		/* The value is NULL. That is invalid. */
-		DBUG_RETURN(1);
-	}
-
-	*reinterpret_cast<ulong*>(save) = commit_concurrency
-		= static_cast<ulong>(intbuf);
-
-	/* Allow the value to be updated, as long as it remains zero
-	or nonzero. */
-	DBUG_RETURN(!(!commit_concurrency == !innobase_commit_concurrency));
-}
-
-static MYSQL_THDVAR_BOOL(support_xa, PLUGIN_VAR_OPCMDARG,
-  "Enable InnoDB support for the XA two-phase commit",
-  /* check_func */ NULL, /* update_func */ NULL,
-  /* default */ TRUE);
-
-static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG,
-  "Enable InnoDB locking in LOCK TABLES",
-  /* check_func */ NULL, /* update_func */ NULL,
-  /* default */ TRUE);
-
-static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG,
-  "Use strict mode when evaluating create options.",
-  NULL, NULL, FALSE);
-
-static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
-  "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
-  NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
-
-
-static handler *innobase_create_handler(handlerton *hton,
-                                        TABLE_SHARE *table,
-                                        MEM_ROOT *mem_root)
-{
-  return new (mem_root) ha_innobase(hton, table);
-}
-
 /*******************************************************************//**
 This function is used to prepare an X/Open XA distributed transaction.
 @return	0 or error number */
@@ -461,13 +745,13 @@ static
 int
 innobase_xa_prepare(
 /*================*/
-        handlerton*	hton,	/*!< in: InnoDB handlerton */
-	THD*		thd,	/*!< in: handle to the MySQL thread of
-				the user whose XA transaction should
-				be prepared */
-	bool		all);	/*!< in: TRUE - commit transaction
-				FALSE - the current SQL statement
-				ended */
+	handlerton*	hton,		/*!< in: InnoDB handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread of
+					the user whose XA transaction should
+					be prepared */
+	bool		all);		/*!< in: true - prepare transaction
+					false - the current SQL statement
+					ended */
 /*******************************************************************//**
 This function is used to recover X/Open XA distributed transactions.
 @return	number of prepared transactions stored in xid_list */
@@ -475,9 +759,9 @@ static
 int
 innobase_xa_recover(
 /*================*/
-	handlerton*	hton,	/*!< in: InnoDB handlerton */
-	XID*		xid_list,/*!< in/out: prepared transactions */
-	uint		len);	/*!< in: number of slots in xid_list */
+	handlerton*	hton,		/*!< in: InnoDB handlerton */
+	XID*		xid_list,	/*!< in/out: prepared transactions */
+	uint		len);		/*!< in: number of slots in xid_list */
 /*******************************************************************//**
 This function is used to commit one X/Open XA distributed transaction
 which is in the prepared state
@@ -486,8 +770,9 @@ static
 int
 innobase_commit_by_xid(
 /*===================*/
-	handlerton* hton,
-	XID*	xid);	/*!< in: X/Open XA transaction identification */
+	handlerton*	hton,		/*!< in: InnoDB handlerton */
+	XID*		xid);		/*!< in: X/Open XA transaction
+					identification */
 /*******************************************************************//**
 This function is used to rollback one X/Open XA distributed transaction
 which is in the prepared state
@@ -496,9 +781,9 @@ static
 int
 innobase_rollback_by_xid(
 /*=====================*/
-	handlerton*	hton,	/*!< in: InnoDB handlerton */
-	XID*		xid);	/*!< in: X/Open XA transaction
-				identification */
+	handlerton*	hton,		/*!< in: InnoDB handlerton */
+	XID*		xid);		/*!< in: X/Open XA transaction
+					identification */
 /*******************************************************************//**
 Create a consistent view for a cursor based on current transaction
 which is created if the corresponding MySQL thread still lacks one.
@@ -509,8 +794,8 @@ static
 void*
 innobase_create_cursor_view(
 /*========================*/
-	handlerton*	hton,	/*!< in: innobase hton */
-	THD*		thd);	/*!< in: user thread handle */
+	handlerton*	hton,		/*!< in: innobase hton */
+	THD*		thd);		/*!< in: user thread handle */
 /*******************************************************************//**
 Set the given consistent cursor view to a transaction which is created
 if the corresponding MySQL thread still lacks one. If the given
@@ -520,9 +805,10 @@ static
 void
 innobase_set_cursor_view(
 /*=====================*/
-	handlerton* hton,
-	THD*	thd,	/*!< in: user thread handle */
-	void*	curview);/*!< in: Consistent cursor view to be set */
+	handlerton*	hton,		/*!< in: handlerton of Innodb */
+	THD*		thd,		/*!< in: user thread handle */
+	void*		curview);	/*!< in: Consistent cursor view to
+					be set */
 /*******************************************************************//**
 Close the given consistent cursor view of a transaction and restore
 global read view to a transaction read view. Transaction is created if the
@@ -531,25 +817,30 @@ static
 void
 innobase_close_cursor_view(
 /*=======================*/
-	handlerton* hton,
-	THD*	thd,	/*!< in: user thread handle */
-	void*	curview);/*!< in: Consistent read view to be closed */
+	handlerton*	hton,		/*!< in: handlerton of Innodb */
+	THD*		thd,		/*!< in: user thread handle */
+	void*		curview);	/*!< in: Consistent read view to be
+					closed */
 /*****************************************************************//**
 Removes all tables in the named database inside InnoDB. */
 static
 void
 innobase_drop_database(
 /*===================*/
-	handlerton* hton, /*!< in: handlerton of Innodb */
-	char*	path);	/*!< in: database path; inside InnoDB the name
-			of the last directory in the path is used as
-			the database name: for example, in 'mysql/data/test'
-			the database name is 'test' */
+	handlerton*	hton,		/*!< in: handlerton of Innodb */
+	char*		path);		/*!< in: database path; inside InnoDB
+					the name of the last directory in
+					the path is used as the database name:
+					for example, in 'mysql/data/test' the
+					database name is 'test' */
 /*******************************************************************//**
 Closes an InnoDB database. */
 static
 int
-innobase_end(handlerton *hton, ha_panic_function type);
+innobase_end(
+/*=========*/
+	handlerton*		hton,	/* in: Innodb handlerton */
+	ha_panic_function	type);
 
 /*****************************************************************//**
 Creates an InnoDB transaction struct for the thd if it does not yet have one.
@@ -561,10 +852,10 @@ static
 int
 innobase_start_trx_and_assign_read_view(
 /*====================================*/
-			/* out: 0 */
-	handlerton* hton, /* in: Innodb handlerton */
-	THD*	thd);	/* in: MySQL thread handle of the user for whom
-			the transaction should be committed */
+	handlerton*	hton,		/* in: Innodb handlerton */
+	THD*		thd);		/* in: MySQL thread handle of the
+					user for whom the transaction should
+					be committed */
 /****************************************************************//**
 Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
 the logs, and the name of this function should be innobase_checkpoint.
@@ -573,22 +864,32 @@ static
 bool
 innobase_flush_logs(
 /*================*/
-	handlerton*	hton);	/*!< in: InnoDB handlerton */
+	handlerton*	hton);		/*!< in: InnoDB handlerton */
 
 /************************************************************************//**
-Implements the SHOW INNODB STATUS command. Sends the output of the InnoDB
-Monitor to the client. */
+Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the
+InnoDB Monitor to the client.
+@return 0 on success */
 static
-bool
+int
 innodb_show_status(
 /*===============*/
-	handlerton*	hton,	/*!< in: the innodb handlerton */
-	THD*	thd,	/*!< in: the MySQL query thread of the caller */
-	stat_print_fn *stat_print);
+	handlerton*	hton,		/*!< in: the innodb handlerton */
+	THD*		thd,		/*!< in: the MySQL query thread of
+					the caller */
+	stat_print_fn*	stat_print);
+/************************************************************************//**
+Return 0 on success and non-zero on failure. Note: the bool return type
+seems to be abused here, should be an int. */
 static
-bool innobase_show_status(handlerton *hton, THD* thd,
-                          stat_print_fn* stat_print,
-                          enum ha_stat_type stat_type);
+bool
+innobase_show_status(
+/*=================*/
+	handlerton*		hton,	/*!< in: the innodb handlerton */
+	THD*			thd,	/*!< in: the MySQL query thread of
+					the caller */
+	stat_print_fn*		stat_print,
+	enum ha_stat_type	stat_type);
 
 /*****************************************************************//**
 Commits a transaction in an InnoDB database. */
@@ -598,106 +899,89 @@ innobase_commit_low(
 /*================*/
 	trx_t*	trx);	/*!< in: transaction handle */
 
-static SHOW_VAR innodb_status_variables[]= {
-  {"buffer_pool_pages_data",
-  (char*) &export_vars.innodb_buffer_pool_pages_data,	  SHOW_LONG},
-  {"buffer_pool_pages_dirty",
-  (char*) &export_vars.innodb_buffer_pool_pages_dirty,	  SHOW_LONG},
-  {"buffer_pool_pages_flushed",
-  (char*) &export_vars.innodb_buffer_pool_pages_flushed,  SHOW_LONG},
-  {"buffer_pool_pages_free",
-  (char*) &export_vars.innodb_buffer_pool_pages_free,	  SHOW_LONG},
-#ifdef UNIV_DEBUG
-  {"buffer_pool_pages_latched",
-  (char*) &export_vars.innodb_buffer_pool_pages_latched,  SHOW_LONG},
-#endif /* UNIV_DEBUG */
-  {"buffer_pool_pages_misc",
-  (char*) &export_vars.innodb_buffer_pool_pages_misc,	  SHOW_LONG},
-  {"buffer_pool_pages_total",
-  (char*) &export_vars.innodb_buffer_pool_pages_total,	  SHOW_LONG},
-  {"buffer_pool_read_ahead_rnd",
-  (char*) &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_LONG},
-  {"buffer_pool_read_ahead",
-  (char*) &export_vars.innodb_buffer_pool_read_ahead,	  SHOW_LONG},
-  {"buffer_pool_read_ahead_evicted",
-  (char*) &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_LONG},
-  {"buffer_pool_read_requests",
-  (char*) &export_vars.innodb_buffer_pool_read_requests,  SHOW_LONG},
-  {"buffer_pool_reads",
-  (char*) &export_vars.innodb_buffer_pool_reads,	  SHOW_LONG},
-  {"buffer_pool_wait_free",
-  (char*) &export_vars.innodb_buffer_pool_wait_free,	  SHOW_LONG},
-  {"buffer_pool_write_requests",
-  (char*) &export_vars.innodb_buffer_pool_write_requests, SHOW_LONG},
-  {"data_fsyncs",
-  (char*) &export_vars.innodb_data_fsyncs,		  SHOW_LONG},
-  {"data_pending_fsyncs",
-  (char*) &export_vars.innodb_data_pending_fsyncs,	  SHOW_LONG},
-  {"data_pending_reads",
-  (char*) &export_vars.innodb_data_pending_reads,	  SHOW_LONG},
-  {"data_pending_writes",
-  (char*) &export_vars.innodb_data_pending_writes,	  SHOW_LONG},
-  {"data_read",
-  (char*) &export_vars.innodb_data_read,		  SHOW_LONG},
-  {"data_reads",
-  (char*) &export_vars.innodb_data_reads,		  SHOW_LONG},
-  {"data_writes",
-  (char*) &export_vars.innodb_data_writes,		  SHOW_LONG},
-  {"data_written",
-  (char*) &export_vars.innodb_data_written,		  SHOW_LONG},
-  {"dblwr_pages_written",
-  (char*) &export_vars.innodb_dblwr_pages_written,	  SHOW_LONG},
-  {"dblwr_writes",
-  (char*) &export_vars.innodb_dblwr_writes,		  SHOW_LONG},
-  {"have_atomic_builtins",
-  (char*) &export_vars.innodb_have_atomic_builtins,	  SHOW_BOOL},
-  {"log_waits",
-  (char*) &export_vars.innodb_log_waits,		  SHOW_LONG},
-  {"log_write_requests",
-  (char*) &export_vars.innodb_log_write_requests,	  SHOW_LONG},
-  {"log_writes",
-  (char*) &export_vars.innodb_log_writes,		  SHOW_LONG},
-  {"os_log_fsyncs",
-  (char*) &export_vars.innodb_os_log_fsyncs,		  SHOW_LONG},
-  {"os_log_pending_fsyncs",
-  (char*) &export_vars.innodb_os_log_pending_fsyncs,	  SHOW_LONG},
-  {"os_log_pending_writes",
-  (char*) &export_vars.innodb_os_log_pending_writes,	  SHOW_LONG},
-  {"os_log_written",
-  (char*) &export_vars.innodb_os_log_written,		  SHOW_LONG},
-  {"page_size",
-  (char*) &export_vars.innodb_page_size,		  SHOW_LONG},
-  {"pages_created",
-  (char*) &export_vars.innodb_pages_created,		  SHOW_LONG},
-  {"pages_read",
-  (char*) &export_vars.innodb_pages_read,		  SHOW_LONG},
-  {"pages_written",
-  (char*) &export_vars.innodb_pages_written,		  SHOW_LONG},
-  {"row_lock_current_waits",
-  (char*) &export_vars.innodb_row_lock_current_waits,	  SHOW_LONG},
-  {"row_lock_time",
-  (char*) &export_vars.innodb_row_lock_time,		  SHOW_LONGLONG},
-  {"row_lock_time_avg",
-  (char*) &export_vars.innodb_row_lock_time_avg,	  SHOW_LONG},
-  {"row_lock_time_max",
-  (char*) &export_vars.innodb_row_lock_time_max,	  SHOW_LONG},
-  {"row_lock_waits",
-  (char*) &export_vars.innodb_row_lock_waits,		  SHOW_LONG},
-  {"rows_deleted",
-  (char*) &export_vars.innodb_rows_deleted,		  SHOW_LONG},
-  {"rows_inserted",
-  (char*) &export_vars.innodb_rows_inserted,		  SHOW_LONG},
-  {"rows_read",
-  (char*) &export_vars.innodb_rows_read,		  SHOW_LONG},
-  {"rows_updated",
-  (char*) &export_vars.innodb_rows_updated,		  SHOW_LONG},
-  {"truncated_status_writes",
-  (char*) &export_vars.innodb_truncated_status_writes,	SHOW_LONG},
-  {NullS, NullS, SHOW_LONG}
-};
+/****************************************************************//**
+Parse and enable InnoDB monitor counters during server startup.
+User can enable monitor counters/groups by specifying
+"loose-innodb_monitor_enable = monitor_name1;monitor_name2..."
+in server configuration file or at the command line. */
+static
+void
+innodb_enable_monitor_at_startup(
+/*=============================*/
+	char*	str);	/*!< in: monitor counter enable list */
+
+/*************************************************************//**
+Check for a valid value of innobase_commit_concurrency.
+@return	0 for valid innodb_commit_concurrency */
+static
+int
+innobase_commit_concurrency_validate(
+/*=================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	long long	intbuf;
+	ulong		commit_concurrency;
+
+	DBUG_ENTER("innobase_commit_concurrency_validate");
+
+	if (value->val_int(value, &intbuf)) {
+		/* The value is NULL. That is invalid. */
+		DBUG_RETURN(1);
+	}
+
+	*reinterpret_cast<ulong*>(save) = commit_concurrency
+		= static_cast<ulong>(intbuf);
+
+	/* Allow the value to be updated, as long as it remains zero
+	or nonzero. */
+	DBUG_RETURN(!(!commit_concurrency == !innobase_commit_concurrency));
+}
+
+/*******************************************************************//**
+Function for constructing an InnoDB table handler instance. */
+static
+handler*
+innobase_create_handler(
+/*====================*/
+	handlerton*	hton,	/*!< in: InnoDB handlerton */
+	TABLE_SHARE*	table,
+	MEM_ROOT*	mem_root)
+{
+	return new (mem_root) ha_innobase(hton, table);
+}
 
 /* General functions */
 
+/*************************************************************//**
+Check that a page_size is correct for InnoDB.  If correct, set the
+associated page_size_shift which is the power of 2 for this page size.
+@return	an associated page_size_shift if valid, 0 if invalid. */
+inline
+int
+innodb_page_size_validate(
+/*======================*/
+	ulong	page_size)		/*!< in: Page Size to evaluate */
+{
+	ulong		n;
+
+	DBUG_ENTER("innodb_page_size_validate");
+
+	for (n = UNIV_PAGE_SIZE_SHIFT_MIN;
+	     n <= UNIV_PAGE_SIZE_SHIFT_MAX;
+	     n++) {
+		if (page_size == (ulong) (1 << n)) {
+			DBUG_RETURN(n);
+		}
+	}
+
+	DBUG_RETURN(0);
+}
+
 /******************************************************************//**
 Returns true if the thread is the replication thread on the slave
 server. Used in srv_conc_enter_innodb() to determine if the thread
@@ -705,7 +989,7 @@ should be allowed to enter InnoDB - the replication thread is treated
 differently than other threads. Also used in
 srv_conc_force_exit_innodb().
 @return	true if thd is the replication thread */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 thd_is_replication_slave_thread(
 /*============================*/
@@ -715,57 +999,101 @@ thd_is_replication_slave_thread(
 }
 
 /******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return	true if the thd is marked as read-only */
+UNIV_INTERN
+ibool
+thd_trx_is_read_only(
+/*=================*/
+	void*	thd)	/*!< in: thread handle (THD*) */
+{
+	/* Waiting on WL#6046 to complete. */
+	return(FALSE);
+}
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return	true if the transaction is an auto commit read-only transaction. */
+UNIV_INTERN
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+	void*	thd)	/*!< in: thread handle (THD*) can be NULL */
+{
+	return(thd != NULL
+	       && !thd_test_options(
+		       static_cast<THD*>(thd),
+		       OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)
+	       && thd_is_select(thd));
+}
+
+/******************************************************************//**
 Save some CPU by testing the value of srv_thread_concurrency in inline
 functions. */
 static inline
 void
-innodb_srv_conc_enter_innodb(
-/*=========================*/
+innobase_srv_conc_enter_innodb(
+/*===========================*/
 	trx_t*	trx)	/*!< in: transaction handle */
 {
-	if (UNIV_LIKELY(!srv_thread_concurrency)) {
+	if (srv_thread_concurrency) {
+		if (trx->n_tickets_to_enter_innodb > 0) {
 
-		return;
-	}
+			/* If trx has 'free tickets' to enter the engine left,
+			then use one such ticket */
 
-	srv_conc_enter_innodb(trx);
+			--trx->n_tickets_to_enter_innodb;
+
+		} else if (trx->mysql_thd != NULL
+			   && thd_is_replication_slave_thread(trx->mysql_thd)) {
+
+			UT_WAIT_FOR(
+				srv_conc_get_active_threads()
+				< srv_thread_concurrency,
+				srv_replication_delay * 1000);
+
+		}  else {
+			srv_conc_enter_innodb(trx);
+		}
+	}
 }
 
 /******************************************************************//**
-Save some CPU by testing the value of srv_thread_concurrency in inline
-functions. */
+Note that the thread wants to leave InnoDB only if it doesn't have
+any spare tickets. */
 static inline
 void
-innodb_srv_conc_exit_innodb(
-/*========================*/
+innobase_srv_conc_exit_innodb(
+/*==========================*/
 	trx_t*	trx)	/*!< in: transaction handle */
 {
-	if (UNIV_LIKELY(!trx->declared_to_be_inside_innodb)) {
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
 
-		return;
-	}
+	/* This is to avoid making an unnecessary function call. */
+	if (trx->declared_to_be_inside_innodb
+	    && trx->n_tickets_to_enter_innodb == 0) {
 
-	srv_conc_exit_innodb(trx);
+		srv_conc_force_exit_innodb(trx);
+	}
 }
 
 /******************************************************************//**
-Releases possible search latch and InnoDB thread FIFO ticket. These should
-be released at each SQL statement end, and also when mysqld passes the
-control to the client. It does no harm to release these also in the middle
-of an SQL statement. */
+Force a thread to leave InnoDB even if it has spare tickets. */
 static inline
 void
-innobase_release_stat_resources(
-/*============================*/
-	trx_t*	trx)	/*!< in: transaction object */
+innobase_srv_conc_force_exit_innodb(
+/*================================*/
+	trx_t*	trx)	/*!< in: transaction handle */
 {
-	if (trx->has_search_latch) {
-		trx_search_latch_release_if_reserved(trx);
-	}
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
 
+	/* This is to avoid making an unnecessary function call. */
 	if (trx->declared_to_be_inside_innodb) {
-		/* Release our possible ticket in the FIFO */
-
 		srv_conc_force_exit_innodb(trx);
 	}
 }
@@ -776,7 +1104,7 @@ non-transactional tables. Used by the deadlock detector when deciding
 which transaction to rollback in case of a deadlock - we try to avoid
 rolling back transactions that have edited non-transactional tables.
 @return	true if non-transactional tables have been edited */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 thd_has_edited_nontrans_tables(
 /*===========================*/
@@ -788,7 +1116,7 @@ thd_has_edited_nontrans_tables(
 /******************************************************************//**
 Returns true if the thread is executing a SELECT statement.
 @return	true if thd is executing SELECT */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 thd_is_select(
 /*==========*/
@@ -801,7 +1129,7 @@ thd_is_select(
 Returns true if the thread supports XA,
 global value of innodb_supports_xa if thd is NULL.
 @return	true if thd has XA support */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 thd_supports_xa(
 /*============*/
@@ -814,7 +1142,7 @@ thd_supports_xa(
 /******************************************************************//**
 Returns the lock wait timeout for the current connection.
 @return	the lock wait timeout, in seconds */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulong
 thd_lock_wait_timeout(
 /*==================*/
@@ -828,7 +1156,7 @@ thd_lock_wait_timeout(
 
 /******************************************************************//**
 Set the time waited for the lock for the current query. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 thd_set_lock_wait_time(
 /*===================*/
@@ -836,7 +1164,7 @@ thd_set_lock_wait_time(
 	ulint	value)	/*!< in: time waited for the lock */
 {
 	if (thd) {
-		thd_storage_lock_wait((THD*)thd, value);
+		thd_storage_lock_wait((THD*) thd, value);
 	}
 }
 
@@ -864,8 +1192,6 @@ innobase_release_temporary_latches(
 	handlerton*	hton,	/*!< in: handlerton */
 	THD*		thd)	/*!< in: MySQL thread */
 {
-	trx_t*	trx;
-
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
 	if (!innodb_inited) {
@@ -873,11 +1199,12 @@ innobase_release_temporary_latches(
 		return(0);
 	}
 
-	trx = thd_to_trx(thd);
+	trx_t*	trx = thd_to_trx(thd);
 
-	if (trx) {
-		innobase_release_stat_resources(trx);
+	if (trx != NULL) {
+		trx_search_latch_release_if_reserved(trx);
 	}
+
 	return(0);
 }
 
@@ -903,7 +1230,7 @@ Converts an InnoDB error code to a MySQL error code and also tells to MySQL
 about a possible transaction rollback inside InnoDB caused by a lock wait
 timeout or a deadlock.
 @return	MySQL error code */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 int
 convert_error_code_to_mysql(
 /*========================*/
@@ -920,7 +1247,8 @@ convert_error_code_to_mysql(
 		/* fall through */
 
 	case DB_FOREIGN_EXCEED_MAX_CASCADE:
-		push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+		ut_ad(thd);
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 				    HA_ERR_ROW_IS_REFERENCED,
 				    "InnoDB: Cannot delete/update "
 				    "rows with cascading foreign key "
@@ -944,6 +1272,9 @@ convert_error_code_to_mysql(
 		handling stage. */
 		return(HA_ERR_FOUND_DUPP_KEY);
 
+	case DB_READ_ONLY:
+		return(HA_ERR_READ_ONLY_TRANSACTION);
+
 	case DB_FOREIGN_DUPLICATE_KEY:
 		return(HA_ERR_FOREIGN_DUPLICATE_KEY);
 
@@ -971,7 +1302,7 @@ convert_error_code_to_mysql(
 
 		if (thd) {
 			thd_mark_transaction_to_rollback(
-				thd, (bool)row_rollback_on_timeout);
+				thd, (bool) row_rollback_on_timeout);
 		}
 
 		return(HA_ERR_LOCK_WAIT_TIMEOUT);
@@ -999,9 +1330,6 @@ convert_error_code_to_mysql(
 	case DB_OUT_OF_FILE_SPACE:
 		return(HA_ERR_RECORD_FILE_FULL);
 
-	case DB_TABLE_IN_FK_CHECK:
-		return(HA_ERR_TABLE_IN_FK_CHECK);
-
 	case DB_TABLE_IS_BEING_USED:
 		return(HA_ERR_WRONG_COMMAND);
 
@@ -1036,6 +1364,9 @@ convert_error_code_to_mysql(
 	case DB_PRIMARY_KEY_IS_NULL:
 		return(ER_PRIMARY_CANT_HAVE_NULL);
 
+	case DB_FTS_INVALID_DOCID:
+		return(HA_FTS_INVALID_DOCID);
+
 	case DB_TOO_MANY_CONCURRENT_TRXS:
 		/* New error code HA_ERR_TOO_MANY_CONCURRENT_TRXS is only
 		available in 5.1.38 and later, but the plugin should still
@@ -1056,14 +1387,14 @@ convert_error_code_to_mysql(
 
 /*************************************************************//**
 Prints info of a THD object (== user session thread) to the given file. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_mysql_print_thd(
 /*=====================*/
 	FILE*	f,		/*!< in: output stream */
 	void*	thd,		/*!< in: pointer to a MySQL THD object */
 	uint	max_query_len)	/*!< in: max query length to print, or 0 to
-				   use the default max length */
+				use the default max length */
 {
 	char	buffer[1024];
 
@@ -1074,7 +1405,7 @@ innobase_mysql_print_thd(
 
 /******************************************************************//**
 Get the variable length bounds of the given character set. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_get_cset_width(
 /*====================*/
@@ -1083,7 +1414,7 @@ innobase_get_cset_width(
 	ulint*	mbmaxlen)	/*!< out: maximum length of a char (in bytes) */
 {
 	CHARSET_INFO*	cs;
-	ut_ad(cset < 256);
+	ut_ad(cset <= MAX_CHAR_COLL_NUM);
 	ut_ad(mbminlen);
 	ut_ad(mbmaxlen);
 
@@ -1117,7 +1448,7 @@ innobase_get_cset_width(
 
 /******************************************************************//**
 Converts an identifier to a table name. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_convert_from_table_id(
 /*===========================*/
@@ -1133,7 +1464,7 @@ innobase_convert_from_table_id(
 
 /******************************************************************//**
 Converts an identifier to UTF-8. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_convert_from_id(
 /*=====================*/
@@ -1150,20 +1481,44 @@ innobase_convert_from_id(
 /******************************************************************//**
 Compares NUL-terminated UTF-8 strings case insensitively.
 @return	0 if a=b, <0 if a<b, >1 if a>b */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 int
 innobase_strcasecmp(
 /*================*/
 	const char*	a,	/*!< in: first string to compare */
 	const char*	b)	/*!< in: second string to compare */
 {
+	if (!a) {
+		if (!b) {
+			return(0);
+		} else {
+			return(-1);
+		}
+	} else if (!b) {
+		return(1);
+	}
+
 	return(my_strcasecmp(system_charset_info, a, b));
 }
 
 /******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively. The
+second string contains wildcards.
+@return 0 if a match is found, 1 if not */
+UNIV_INTERN
+int
+innobase_wildcasecmp(
+/*=================*/
+	const char*	a,	/*!< in: string to compare */
+	const char*	b)	/*!< in: wildcard string to compare */
+{
+	return(wild_case_compare(system_charset_info, a, b));
+}
+
+/******************************************************************//**
 Strip dir name from a full path name and return only the file name
 @return file name or "null" if no file name */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 const char*
 innobase_basename(
 /*==============*/
@@ -1176,7 +1531,7 @@ innobase_basename(
 
 /******************************************************************//**
 Makes all characters in a NUL-terminated UTF-8 string lower case. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_casedn_str(
 /*================*/
@@ -1188,7 +1543,7 @@ innobase_casedn_str(
 /**********************************************************************//**
 Determines the connection character set.
 @return	connection character set */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 struct charset_info_st*
 innobase_get_charset(
 /*=================*/
@@ -1200,7 +1555,7 @@ innobase_get_charset(
 /**********************************************************************//**
 Determines the current SQL statement.
 @return	SQL statement string */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 const char*
 innobase_get_stmt(
 /*==============*/
@@ -1215,12 +1570,25 @@ innobase_get_stmt(
 }
 
 /**********************************************************************//**
+Get the current setting of the table_def_size global parameter. We do
+a dirty read because for one there is no synchronization object and
+secondly there is little harm in doing so even if we get a torn read.
+@return	value of table_def_size */
+UNIV_INTERN
+ulint
+innobase_get_table_cache_size(void)
+/*===============================*/
+{
+	return(table_def_size);
+}
+
+/**********************************************************************//**
 Get the current setting of the lower_case_table_names global parameter from
 mysqld.cc. We do a dirty read because for one there is no synchronization
 object and secondly there is little harm in doing so even if we get a torn
 read.
 @return	value of lower_case_table_names */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulint
 innobase_get_lower_case_table_names(void)
 /*=====================================*/
@@ -1233,7 +1601,6 @@ extern MYSQL_PLUGIN_IMPORT MY_TMPDIR mysql_tmpdir_list;
 /*******************************************************************//**
 Map an OS error to an errno value. The OS error number is stored in
 _doserrno and the mapped value is stored in errno) */
-extern "C"
 void __cdecl
 _dosmaperr(
 	unsigned long);	/*!< in: OS error value */
@@ -1241,7 +1608,7 @@ _dosmaperr(
 /*********************************************************************//**
 Creates a temporary file.
 @return	temporary file descriptor, or < 0 on error */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 int
 innobase_mysql_tmpfile(void)
 /*========================*/
@@ -1325,7 +1692,7 @@ innobase_mysql_tmpfile(void)
 /*********************************************************************//**
 Creates a temporary file.
 @return	temporary file descriptor, or < 0 on error */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 int
 innobase_mysql_tmpfile(void)
 /*========================*/
@@ -1345,21 +1712,21 @@ innobase_mysql_tmpfile(void)
 #ifdef _WIN32
 		/* Note that on Windows, the integer returned by mysql_tmpfile
 		has no relation to C runtime file descriptor. Here, we need
-		to call my_get_osfhandle to get the HANDLE and then convert it 
+		to call my_get_osfhandle to get the HANDLE and then convert it
 		to C runtime filedescriptor. */
 		{
 			HANDLE hFile = my_get_osfhandle(fd);
 			HANDLE hDup;
-			BOOL bOK = 
-				DuplicateHandle(GetCurrentProcess(), hFile, GetCurrentProcess(),
-								&hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
-			if(bOK) {
-				fd2 = _open_osfhandle((intptr_t)hDup,0);
-			}
-			else {
+			BOOL bOK = DuplicateHandle(
+					GetCurrentProcess(),
+					hFile, GetCurrentProcess(),
+					&hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
+			if (bOK) {
+				fd2 = _open_osfhandle((intptr_t) hDup, 0);
+			} else {
 				my_osmaperr(GetLastError());
 				fd2 = -1;
-			}	
+			}
 		}
 #else
 		fd2 = dup(fd);
@@ -1380,7 +1747,7 @@ innobase_mysql_tmpfile(void)
 /*********************************************************************//**
 Wrapper around MySQL's copy_and_convert function.
 @return	number of bytes copied to 'to' */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulint
 innobase_convert_string(
 /*====================*/
@@ -1390,13 +1757,15 @@ innobase_convert_string(
 	CHARSET_INFO*	to_cs,		/*!< in: character set to convert to */
 	const void*	from,		/*!< in: string to convert */
 	ulint		from_length,	/*!< in: number of bytes to convert */
-	CHARSET_INFO*	from_cs,	/*!< in: character set to convert from */
+	CHARSET_INFO*	from_cs,	/*!< in: character set to convert
+					from */
 	uint*		errors)		/*!< out: number of errors encountered
 					during the conversion */
 {
-  return(copy_and_convert((char*)to, (uint32) to_length, to_cs,
-                          (const char*)from, (uint32) from_length, from_cs,
-                          errors));
+	return(copy_and_convert(
+			(char*) to, (uint32) to_length, to_cs,
+			(const char*) from, (uint32) from_length, from_cs,
+			errors));
 }
 
 /*******************************************************************//**
@@ -1408,7 +1777,7 @@ The result is always NUL-terminated (provided buf_size > 0) and the
 number of bytes that were written to "buf" is returned (including the
 terminating NUL).
 @return	number of bytes that were written */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulint
 innobase_raw_format(
 /*================*/
@@ -1451,87 +1820,70 @@ values we want to reserve for multi-value inserts e.g.,
 
 	INSERT INTO T VALUES(), (), ();
 
-innobase_next_autoinc() will be called with increment set to 3 where
-autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
-the multi-value INSERT above.
+innobase_next_autoinc() will be called with increment set to
+n * 3 where autoinc_lock_mode != TRADITIONAL because we want
+to reserve 3 values for the multi-value INSERT above.
 @return	the next value */
 static
 ulonglong
 innobase_next_autoinc(
 /*==================*/
 	ulonglong	current,	/*!< in: Current value */
-	ulonglong	need,		/*!< in: count of values needed */
-	ulonglong	step,		/*!< in: AUTOINC increment step */
+	ulonglong	increment,	/*!< in: increment current by */
 	ulonglong	offset,		/*!< in: AUTOINC offset */
 	ulonglong	max_value)	/*!< in: max value for type */
 {
 	ulonglong	next_value;
-	ulonglong	block = need * step;
 
 	/* Should never be 0. */
-	ut_a(need > 0);
-	ut_a(block > 0);
-	ut_a(max_value > 0);
-
-	/* Current value should never be greater than the maximum. */
-	ut_a(current <= max_value);
+	ut_a(increment > 0);
 
 	/* According to MySQL documentation, if the offset is greater than
-	the step then the offset is ignored. */
-	if (offset > block) {
+	the increment then the offset is ignored. */
+	if (offset > increment) {
 		offset = 0;
 	}
 
-	/* Check for overflow. */
-	if (block >= max_value
-	    || offset > max_value
-	    || current == max_value
-	    || max_value - offset <= offset) {
-
+	if (max_value <= current) {
 		next_value = max_value;
-	} else {
-		ut_a(max_value > current);
-
-		ulonglong	free = max_value - current;
-
-		if (free < offset || free - offset <= block) {
+	} else if (offset <= 1) {
+		/* Offset 0 and 1 are the same, because there must be at
+		least one node in the system. */
+		if (max_value - current <= increment) {
 			next_value = max_value;
 		} else {
-			next_value = 0;
+			next_value = current + increment;
 		}
-	}
-
-	if (next_value == 0) {
-		ulonglong	next;
-
+	} else if (max_value > current) {
 		if (current > offset) {
-			next = (current - offset) / step;
+			next_value = ((current - offset) / increment) + 1;
 		} else {
-			next = (offset - current) / step;
+			next_value = ((offset - current) / increment) + 1;
 		}
 
-		ut_a(max_value > next);
-		next_value = next * step;
+		ut_a(increment > 0);
+		ut_a(next_value > 0);
+
 		/* Check for multiplication overflow. */
-		ut_a(next_value >= next);
-		ut_a(max_value > next_value);
+		if (increment > (max_value / next_value)) {
 
-		/* Check for overflow */
-		if (max_value - next_value >= block) {
+			next_value = max_value;
+		} else {
+			next_value *= increment;
 
-			next_value += block;
+			ut_a(max_value >= next_value);
 
-			if (max_value - next_value >= offset) {
-				next_value += offset;
-			} else {
+			/* Check for overflow. */
+			if (max_value - next_value <= offset) {
 				next_value = max_value;
+			} else {
+				next_value += offset;
 			}
-		} else {
-			next_value = max_value;
 		}
+	} else {
+		next_value = max_value;
 	}
 
-	ut_a(next_value != 0);
 	ut_a(next_value <= max_value);
 
 	return(next_value);
@@ -1559,9 +1911,9 @@ innobase_trx_init(
 }
 
 /*********************************************************************//**
-Allocates an InnoDB transaction for a MySQL handler object.
+Allocates an InnoDB transaction for a MySQL handler object for DML.
 @return	InnoDB transaction handle */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 trx_t*
 innobase_trx_allocate(
 /*==================*/
@@ -1675,15 +2027,18 @@ trx_is_started(
 /*===========*/
 	trx_t*	trx)	/* in: transaction */
 {
-	return(trx->conc_state != TRX_NOT_STARTED);
+	return(trx->state != TRX_STATE_NOT_STARTED);
 }
 
 /*********************************************************************//**
 Construct ha_innobase handler. */
 UNIV_INTERN
-ha_innobase::ha_innobase(handlerton *hton, TABLE_SHARE *table_arg)
-  :handler(hton, table_arg),
-  int_table_flags(HA_REC_NOT_IN_SEQ |
+ha_innobase::ha_innobase(
+/*=====================*/
+	handlerton*	hton,
+	TABLE_SHARE*	table_arg)
+	:handler(hton, table_arg),
+	int_table_flags(HA_REC_NOT_IN_SEQ |
 		  HA_NULL_IN_KEY |
 		  HA_CAN_INDEX_BLOBS |
 		  HA_CAN_SQL_HANDLER |
@@ -1691,15 +2046,16 @@ ha_innobase::ha_innobase(handlerton *hton, TABLE_SHARE *table_arg)
 		  HA_PRIMARY_KEY_IN_READ_INDEX |
 		  HA_BINLOG_ROW_CAPABLE |
 		  HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ |
-		  HA_TABLE_SCAN_ON_INDEX),
-  start_of_scan(0),
-  num_write_row(0)
+		  HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT),
+	start_of_scan(0),
+	num_write_row(0)
 {}
 
 /*********************************************************************//**
 Destruct ha_innobase handler. */
 UNIV_INTERN
 ha_innobase::~ha_innobase()
+/*======================*/
 {
 }
 
@@ -1735,6 +2091,7 @@ ha_innobase::update_thd()
 /*=====================*/
 {
 	THD*	thd = ha_thd();
+
 	ut_ad(EQ_CURRENT_THD(thd));
 	update_thd(thd);
 }
@@ -1764,9 +2121,9 @@ innobase_register_trx(
 
 	trx_register_for_2pc(trx);
 }
-  
-/*   BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
-     ------------------------------------------------------------
+
+/*	BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
+	------------------------------------------------------------
 
 1) The use of the query cache for TBL is disabled when there is an
 uncommitted change to TBL.
@@ -1823,9 +2180,9 @@ read view to it if there is no read view yet.
 Why a deadlock of threads is not possible: the query cache calls this function
 at the start of a SELECT processing. Then the calling thread cannot be
 holding any InnoDB semaphores. The calling thread is holding the
-query cache mutex, and this function will reserver the InnoDB kernel mutex.
+query cache mutex, and this function will reserve the InnoDB trx_sys->mutex.
 Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
-the InnoDB kernel mutex.
+the InnoDB trx_sys->mutex.
 @return TRUE if permitted, FALSE if not; note that the value FALSE
 does not mean we should invalidate the query cache: invalidation is
 called explicitly */
@@ -1858,17 +2215,16 @@ innobase_query_caching_of_table_permitted(
 		return((my_bool)FALSE);
 	}
 
-	if (trx->has_search_latch) {
+	if (UNIV_UNLIKELY(trx->has_search_latch)) {
 		sql_print_error("The calling thread is holding the adaptive "
 				"search, latch though calling "
 				"innobase_query_caching_of_table_permitted.");
-
-		mutex_enter(&kernel_mutex);
 		trx_print(stderr, trx, 1024);
-		mutex_exit(&kernel_mutex);
 	}
 
-	innobase_release_stat_resources(trx);
+	trx_search_latch_release_if_reserved(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
 
@@ -1904,8 +2260,9 @@ innobase_query_caching_of_table_permitted(
 
 	memcpy(norm_name, full_name, full_name_len);
 
-	norm_name[strlen(norm_name)] = '/'; /* InnoDB uses '/' as the
-					    separator between db and table */
+	norm_name[strlen(norm_name)] = '/'; 	/* InnoDB uses '/' as the
+						separator between db and
+						table */
 	norm_name[full_name_len] = '\0';
 #ifdef __WIN__
 	innobase_casedn_str(norm_name);
@@ -1927,7 +2284,7 @@ innobase_query_caching_of_table_permitted(
 
 /*****************************************************************//**
 Invalidates the MySQL query cache for the table. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_invalidate_query_cache(
 /*============================*/
@@ -1942,8 +2299,8 @@ innobase_invalidate_query_cache(
 					also the null chars count */
 {
 	/* Note that the sync0sync.h rank of the query cache mutex is just
-	above the InnoDB kernel mutex. The caller of this function must not
-	have latches of a lower rank. */
+	above the InnoDB trx_sys_t->lock. The caller of this function must
+	not have latches of a lower rank. */
 
 	/* Argument TRUE below means we are using transactions */
 #ifdef HAVE_QUERY_CACHE
@@ -2046,7 +2403,7 @@ no_quote:
 Convert a table or index name to the MySQL system_charset_info (UTF-8)
 and quote it if needed.
 @return	pointer to the end of buf */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 char*
 innobase_convert_name(
 /*==================*/
@@ -2125,7 +2482,7 @@ innobase_format_name(
 /**********************************************************************//**
 Determines if the currently running transaction has been interrupted.
 @return	TRUE if interrupted */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 trx_is_interrupted(
 /*===============*/
@@ -2137,7 +2494,7 @@ trx_is_interrupted(
 /**********************************************************************//**
 Determines if the currently running transaction is in strict mode.
 @return	TRUE if strict */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ibool
 trx_is_strict(
 /*==========*/
@@ -2150,14 +2507,24 @@ trx_is_strict(
 /**************************************************************//**
 Resets some fields of a prebuilt struct. The template is used in fast
 retrieval of just those column values MySQL needs in its processing. */
-static
+inline
 void
-reset_template(
-/*===========*/
-	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct */
+ha_innobase::reset_template(void)
+/*=============================*/
 {
+	ut_ad(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+	ut_ad(prebuilt->magic_n2 == prebuilt->magic_n);
+
 	prebuilt->keep_other_fields_on_keyread = 0;
 	prebuilt->read_just_key = 0;
+	/* Reset index condition pushdown state. */
+	if (prebuilt->idx_cond) {
+		prebuilt->idx_cond = NULL;
+		prebuilt->idx_cond_n_cols = 0;
+		/* Invalidate prebuilt->mysql_template
+		in ha_innobase::write_row(). */
+		prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
+	}
 }
 
 /*****************************************************************//**
@@ -2181,11 +2548,13 @@ ha_innobase::init_table_handle_for_HANDLER(void)
 	/* Initialize the prebuilt struct much like it would be inited in
 	external_lock */
 
-	innobase_release_stat_resources(prebuilt->trx);
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+
+	innobase_srv_conc_force_exit_innodb(prebuilt->trx);
 
 	/* If the transaction is not started yet, start it */
 
-	trx_start_if_not_started(prebuilt->trx);
+	trx_start_if_not_started_xa(prebuilt->trx);
 
 	/* Assign a read view if the transaction does not have it yet */
 
@@ -2212,7 +2581,7 @@ ha_innobase::init_table_handle_for_HANDLER(void)
 	we???? */
 
 	prebuilt->used_in_HANDLER = TRUE;
-	reset_template(prebuilt);
+	reset_template();
 }
 
 /*********************************************************************//**
@@ -2229,37 +2598,44 @@ innobase_init(
 	bool		ret;
 	char		*default_path;
 	uint		format_id;
+	ulong		num_pll_degree;
 
 	DBUG_ENTER("innobase_init");
-        handlerton *innobase_hton= (handlerton *)p;
-        innodb_hton_ptr = innobase_hton;
-
-        innobase_hton->state = SHOW_OPTION_YES;
-        innobase_hton->db_type= DB_TYPE_INNODB;
-        innobase_hton->savepoint_offset=sizeof(trx_named_savept_t);
-        innobase_hton->close_connection=innobase_close_connection;
-        innobase_hton->savepoint_set=innobase_savepoint;
-        innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint;
-        innobase_hton->savepoint_release=innobase_release_savepoint;
-	innobase_hton->prepare_ordered=NULL;
-        innobase_hton->commit_ordered=innobase_commit_ordered;
-        innobase_hton->commit=innobase_commit;
-        innobase_hton->rollback=innobase_rollback;
-        innobase_hton->prepare=innobase_xa_prepare;
-        innobase_hton->recover=innobase_xa_recover;
-        innobase_hton->commit_by_xid=innobase_commit_by_xid;
-        innobase_hton->rollback_by_xid=innobase_rollback_by_xid;
-        innobase_hton->create_cursor_read_view=innobase_create_cursor_view;
-        innobase_hton->set_cursor_read_view=innobase_set_cursor_view;
-        innobase_hton->close_cursor_read_view=innobase_close_cursor_view;
-        innobase_hton->create=innobase_create_handler;
-        innobase_hton->drop_database=innobase_drop_database;
-        innobase_hton->panic=innobase_end;
-        innobase_hton->start_consistent_snapshot=innobase_start_trx_and_assign_read_view;
-        innobase_hton->flush_logs=innobase_flush_logs;
-        innobase_hton->show_status=innobase_show_status;
-        innobase_hton->flags=HTON_NO_FLAGS;
-        innobase_hton->release_temporary_latches=innobase_release_temporary_latches;
+	handlerton *innobase_hton= (handlerton*) p;
+	innodb_hton_ptr = innobase_hton;
+
+	innobase_hton->state = SHOW_OPTION_YES;
+	innobase_hton->db_type= DB_TYPE_INNODB;
+	innobase_hton->savepoint_offset = sizeof(trx_named_savept_t);
+	innobase_hton->close_connection = innobase_close_connection;
+	innobase_hton->savepoint_set = innobase_savepoint;
+	innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint;
+	innobase_hton->savepoint_release = innobase_release_savepoint;
+	innobase_hton->prepare_ordered= NULL;
+        innobase_hton->commit_ordered= innobase_commit_ordered;
+	innobase_hton->commit = innobase_commit;
+	innobase_hton->rollback = innobase_rollback;
+	innobase_hton->prepare = innobase_xa_prepare;
+	innobase_hton->recover = innobase_xa_recover;
+	innobase_hton->commit_by_xid = innobase_commit_by_xid;
+	innobase_hton->rollback_by_xid = innobase_rollback_by_xid;
+	innobase_hton->create_cursor_read_view = innobase_create_cursor_view;
+	innobase_hton->set_cursor_read_view = innobase_set_cursor_view;
+	innobase_hton->close_cursor_read_view = innobase_close_cursor_view;
+	innobase_hton->create = innobase_create_handler;
+	innobase_hton->drop_database = innobase_drop_database;
+	innobase_hton->panic = innobase_end;
+
+	innobase_hton->start_consistent_snapshot =
+		innobase_start_trx_and_assign_read_view;
+
+	innobase_hton->flush_logs = innobase_flush_logs;
+	innobase_hton->show_status = innobase_show_status;
+	innobase_hton->flags = HTON_NO_FLAGS;
+
+	innobase_hton->release_temporary_latches =
+		innobase_release_temporary_latches;
+
 	innobase_hton->alter_table_flags = innobase_alter_table_flags;
 
 	ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
@@ -2267,17 +2643,20 @@ innobase_init(
 #ifndef DBUG_OFF
 	static const char	test_filename[] = "-@";
 	char			test_tablename[sizeof test_filename
-				+ sizeof srv_mysql50_table_name_prefix];
-	if ((sizeof test_tablename) - 1
-			!= filename_to_tablename(test_filename, test_tablename,
-			sizeof test_tablename, true)
+				+ sizeof(srv_mysql50_table_name_prefix) - 1];
+	if ((sizeof(test_tablename)) - 1
+			!= filename_to_tablename(test_filename,
+						 test_tablename,
+						 sizeof(test_tablename), true)
 			|| strncmp(test_tablename,
-			srv_mysql50_table_name_prefix,
-			sizeof srv_mysql50_table_name_prefix)
+				   srv_mysql50_table_name_prefix,
+				   sizeof(srv_mysql50_table_name_prefix) - 1)
 			|| strcmp(test_tablename
-			+ sizeof srv_mysql50_table_name_prefix,
-			test_filename)) {
+				  + sizeof(srv_mysql50_table_name_prefix) - 1,
+				  test_filename)) {
+
 		sql_print_error("tablename encoding has been changed");
+
 		goto error;
 	}
 #endif /* DBUG_OFF */
@@ -2291,17 +2670,9 @@ innobase_init(
 
 			goto error;
 		}
-
-		if (innobase_log_file_size > UINT_MAX32) {
-			sql_print_error(
-				"innobase_log_file_size can't be over 4GB"
-				" on 32-bit systems");
-
-			goto error;
-		}
 	}
 
-	os_innodb_umask = (ulint)my_umask;
+	os_innodb_umask = (ulint) my_umask;
 
 	/* First calculate the default path for innodb_data_home_dir etc.,
 	in case the user has not given any value.
@@ -2357,7 +2728,7 @@ mem_free_and_error:
 		goto error;
 	}
 
-	/* -------------- Log files ---------------------------*/
+	/* -------------- All log files ---------------------------*/
 
 	/* The default dir for log files is the datadir of MySQL */
 
@@ -2391,7 +2762,7 @@ mem_free_and_error:
 		format_id = innobase_file_format_name_lookup(
 			innobase_file_format_name);
 
-		if (format_id > DICT_TF_FORMAT_MAX) {
+		if (format_id > UNIV_FORMAT_MAX) {
 
 			sql_print_error("InnoDB: wrong innodb_file_format.");
 
@@ -2417,12 +2788,12 @@ mem_free_and_error:
 	if (!innobase_file_format_check) {
 
 		/* Set the value to disable checking. */
-		srv_max_file_format_at_startup = DICT_TF_FORMAT_MAX + 1;
+		srv_max_file_format_at_startup = UNIV_FORMAT_MAX + 1;
 
 	} else {
 
 		/* Set the value to the lowest supported format. */
-		srv_max_file_format_at_startup = DICT_TF_FORMAT_MIN;
+		srv_max_file_format_at_startup = UNIV_FORMAT_MIN;
 	}
 
 	/* Did the user specify a format name that we support?
@@ -2436,11 +2807,17 @@ mem_free_and_error:
 				"should be any value up to %s or its "
 				"equivalent numeric id",
 				trx_sys_file_format_id_to_name(
-					DICT_TF_FORMAT_MAX));
+					UNIV_FORMAT_MAX));
 
 		goto mem_free_and_error;
 	}
 
+	/* Remember stopword table name supplied at startup */
+	if (innobase_server_stopword_table) {
+		fts_server_stopword_table =
+			my_strdup(innobase_server_stopword_table,  MYF(0));
+	}
+
 	if (innobase_change_buffering) {
 		ulint	use;
 
@@ -2472,11 +2849,29 @@ innobase_change_buffering_inited_ok:
 
 	srv_n_log_groups = (ulint) innobase_mirrored_log_groups;
 	srv_n_log_files = (ulint) innobase_log_files_in_group;
-	srv_log_file_size = (ulint) innobase_log_file_size;
+	srv_log_file_size = (ib_uint64_t) innobase_log_file_size;
 
 #ifdef UNIV_LOG_ARCHIVE
 	srv_log_archive_on = (ulint) innobase_log_archive;
 #endif /* UNIV_LOG_ARCHIVE */
+
+	/* Check that the value of system variable innodb_page_size was
+	set correctly.  Its value was put into srv_page_size. If valid,
+	return the associated srv_page_size_shift.*/
+	srv_page_size_shift = innodb_page_size_validate(srv_page_size);
+	if (!srv_page_size_shift) {
+		sql_print_error("InnoDB: Invalid page size=%lu.\n",
+				srv_page_size);
+		goto mem_free_and_error;
+	}
+	if (UNIV_PAGE_SIZE_DEF != srv_page_size) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: innodb-page-size has been changed"
+			" from the default value %d to %lu.\n",
+			UNIV_PAGE_SIZE_DEF, srv_page_size);
+	}
+
 	srv_log_buffer_size = (ulint) innobase_log_buffer_size;
 
 	srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
@@ -2484,6 +2879,29 @@ innobase_change_buffering_inited_ok:
 
 	srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
 
+	if (innobase_additional_mem_pool_size
+	    != 8*1024*1024L /* the default */ ) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Warning: Using "
+			"innodb_additional_mem_pool_size is DEPRECATED. "
+			"This option may be removed in future releases, "
+			"together with the option innodb_use_sys_malloc "
+			"and with the InnoDB's internal memory "
+			"allocator.\n");
+	}
+
+	if (!srv_use_sys_malloc ) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Warning: Setting "
+			"innodb_use_sys_malloc to FALSE is DEPRECATED. "
+			"This option may be removed in future releases, "
+			"together with the InnoDB's internal memory "
+			"allocator.\n");
+	}
+
 	srv_n_file_io_threads = (ulint) innobase_file_io_threads;
 	srv_n_read_io_threads = (ulint) innobase_read_io_threads;
 	srv_n_write_io_threads = (ulint) innobase_write_io_threads;
@@ -2491,26 +2909,55 @@ innobase_change_buffering_inited_ok:
 	srv_force_recovery = (ulint) innobase_force_recovery;
 
 	srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
-	srv_use_checksums = (ibool) innobase_use_checksums;
+	if (!innobase_use_checksums) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Warning: Setting "
+			"innodb_checksums to OFF is DEPRECATED. "
+			"This option may be removed in future releases. "
+			"You should set innodb_checksum_algorithm=NONE "
+			"instead.\n");
+		srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE;
+	}
 
 #ifdef HAVE_LARGE_PAGES
-        if ((os_use_large_pages = (ibool) my_use_large_pages))
+	if ((os_use_large_pages = (ibool) my_use_large_pages)) {
 		os_large_page_size = (ulint) opt_large_page_size;
+	}
 #endif
 
 	row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
 
 	srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog;
+	if (innobase_locks_unsafe_for_binlog) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Warning: Using "
+			"innodb_locks_unsafe_for_binlog is DEPRECATED. "
+			"This option may be removed in future releases. "
+			"Please use READ COMMITTED transaction isolation "
+			"level instead, see " REFMAN "set-transaction.html.\n");
+	}
 
 	srv_max_n_open_files = (ulint) innobase_open_files;
 	srv_innodb_status = (ibool) innobase_create_status_file;
 
 	srv_print_verbose_log = mysqld_embedded ? 0 : 1;
 
+	/* Round up fts_sort_pll_degree to nearest power of 2 number */
+	for (num_pll_degree = 1;
+	     num_pll_degree < fts_sort_pll_degree;
+	     num_pll_degree <<= 1) {
+
+		/* No op */
+	}
+
+	fts_sort_pll_degree = num_pll_degree;
+
 	/* Store the default charset-collation number of this MySQL
 	installation */
 
-	data_mysql_default_charset_coll = (ulint)default_charset_info->number;
+	data_mysql_default_charset_coll = (ulint) default_charset_info->number;
 
 	ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL ==
 					my_charset_latin1.number);
@@ -2528,41 +2975,33 @@ innobase_change_buffering_inited_ok:
 
 #ifdef HAVE_PSI_INTERFACE
 	/* Register keys with MySQL performance schema */
-	if (PSI_server) {
-		int	count;
+	int	count;
 
-                count = array_elements(all_pthread_mutexes);
-                PSI_server->register_mutex("innodb",
-                                           all_pthread_mutexes, count);
+	count = array_elements(all_pthread_mutexes);
+ 	mysql_mutex_register("innodb", all_pthread_mutexes, count);
 
 # ifdef UNIV_PFS_MUTEX
-		count = array_elements(all_innodb_mutexes);
-		PSI_server->register_mutex("innodb",
-					   all_innodb_mutexes, count);
+	count = array_elements(all_innodb_mutexes);
+	mysql_mutex_register("innodb", all_innodb_mutexes, count);
 # endif /* UNIV_PFS_MUTEX */
 
 # ifdef UNIV_PFS_RWLOCK
-		count = array_elements(all_innodb_rwlocks);
-		PSI_server->register_rwlock("innodb",
-					    all_innodb_rwlocks, count);
+	count = array_elements(all_innodb_rwlocks);
+	mysql_rwlock_register("innodb", all_innodb_rwlocks, count);
 # endif /* UNIV_PFS_MUTEX */
 
 # ifdef UNIV_PFS_THREAD
-		count = array_elements(all_innodb_threads);
-		PSI_server->register_thread("innodb",
-					    all_innodb_threads, count);
+	count = array_elements(all_innodb_threads);
+	mysql_thread_register("innodb", all_innodb_threads, count);
 # endif /* UNIV_PFS_THREAD */
 
 # ifdef UNIV_PFS_IO
-		count = array_elements(all_innodb_files);
-		PSI_server->register_file("innodb",
-					  all_innodb_files, count);
+	count = array_elements(all_innodb_files);
+	mysql_file_register("innodb", all_innodb_files, count);
 # endif /* UNIV_PFS_IO */
 
-		count = array_elements(all_innodb_conds);
-		PSI_server->register_cond("innodb",
-					  all_innodb_conds, count);
-	}
+	count = array_elements(all_innodb_conds);
+	mysql_cond_register("innodb", all_innodb_conds, count);
 #endif /* HAVE_PSI_INTERFACE */
 
 	/* Since we in this module access directly the fields of a trx
@@ -2577,9 +3016,14 @@ innobase_change_buffering_inited_ok:
 		goto mem_free_and_error;
 	}
 
+	/* Adjust the innodb_undo_logs config object */
+	innobase_undo_logs_init_default_max();
+
 	innobase_old_blocks_pct = buf_LRU_old_ratio_update(
 		innobase_old_blocks_pct, TRUE);
 
+	ibuf_max_size_update(innobase_change_buffer_max_size);
+
 	innobase_open_tables = hash_create(200);
 	mysql_mutex_init(innobase_share_mutex_key,
 			 &innobase_share_mutex,
@@ -2600,6 +3044,22 @@ innobase_change_buffering_inited_ok:
 	/* Get the current high water mark format. */
 	innobase_file_format_max = (char*) trx_sys_file_format_max_get();
 
+	/* Currently, monitor counter information are not persistent. */
+	memset(monitor_set_tbl, 0, sizeof monitor_set_tbl);
+
+	memset(innodb_counter_value, 0, sizeof innodb_counter_value);
+
+	/* Do this as late as possible so server is fully starts up,
+	since  we might get some initial stats if user choose to turn
+	on some counters from start up */
+	if (innobase_enable_monitor_counter) {
+		innodb_enable_monitor_at_startup(
+			innobase_enable_monitor_counter);
+	}
+
+	/* Turn on monitor counters that are default on */
+	srv_mon_default_on();
+
 	DBUG_RETURN(FALSE);
 error:
 	DBUG_RETURN(TRUE);
@@ -2624,6 +3084,7 @@ innobase_end(
 	if (innodb_inited) {
 
 		srv_fast_shutdown = (ulint) innobase_fast_shutdown;
+
 		innodb_inited = 0;
 		hash_table_free(innobase_open_tables);
 		innobase_open_tables = NULL;
@@ -2702,9 +3163,9 @@ static
 int
 innobase_start_trx_and_assign_read_view(
 /*====================================*/
-        handlerton *hton, /*!< in: Innodb handlerton */
-	THD*	thd)	/*!< in: MySQL thread handle of the user for whom
-			the transaction should be committed */
+	handlerton*	hton,	/*!< in: Innodb handlerton */
+	THD*		thd)	/*!< in: MySQL thread handle of the user for
+				whom the transaction should be committed */
 {
 	trx_t*	trx;
 
@@ -2716,14 +3177,17 @@ innobase_start_trx_and_assign_read_view(
 	trx = check_trx_exists(thd);
 
 	/* This is just to play safe: release a possible FIFO ticket and
-	search latch. Since we will reserve the kernel mutex, we have to
-	release the search system latch first to obey the latching order. */
+	search latch. Since we can potentially reserve the trx_sys->mutex,
+	we have to release the search system latch first to obey the latching
+	order. */
+
+	trx_search_latch_release_if_reserved(trx);
 
-	innobase_release_stat_resources(trx);
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* If the transaction is not started yet, start it */
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	/* Assign a read view if the transaction does not have it yet */
 
@@ -2847,11 +3311,13 @@ static
 int
 innobase_commit(
 /*============*/
-        handlerton *hton, /*!< in: Innodb handlerton */
-	THD* 	thd,	/*!< in: MySQL thread handle of the user for whom
-			the transaction should be committed */
-	bool	all)	/*!< in:	TRUE - commit transaction
-				FALSE - the current SQL statement ended */
+	handlerton*	hton,		/*!< in: Innodb handlerton */
+	THD*		thd,		/*!< in: MySQL thread handle of the
+					user for whom the transaction should
+					be committed */
+	bool		commit_trx)	/*!< in: true - commit transaction
+					false - the current SQL statement
+					ended */
 {
 	trx_t*		trx;
 
@@ -2861,7 +3327,7 @@ innobase_commit(
 
 	trx = check_trx_exists(thd);
 
-	/* Since we will reserve the kernel mutex, we have to release
+	/* Since we will reserve the trx_sys->mutex, we have to release
 	the search system latch first to obey the latching order. */
 
 	if (trx->has_search_latch && !trx_is_active_commit_ordered(trx)) {
@@ -2879,7 +3345,7 @@ innobase_commit(
 				"but transaction is active");
 	}
 
-	if (all
+	if (commit_trx
 	    || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
 
 		/* Run the fast part of commit if we did not already. */
@@ -2894,7 +3360,6 @@ innobase_commit(
 		Now finish by doing a write + flush of logs. */
 		trx_commit_complete_for_mysql(trx);
                 trx_deregister_from_2pc(trx);
-
 	} else {
 		/* We just mark the SQL statement ended and do not do a
 		transaction commit */
@@ -2902,7 +3367,7 @@ innobase_commit(
 		/* If we had reserved the auto-inc lock for some
 		table in this SQL statement we release it now */
 
-		row_unlock_table_autoinc_for_mysql(trx);
+		lock_unlock_table_autoinc(trx);
 
 		/* Store the current undo_no of the transaction so that we
 		know where to roll back if we have to roll back the next
@@ -2913,11 +3378,10 @@ innobase_commit(
 
 	trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
 
-	if (trx->declared_to_be_inside_innodb) {
-		/* Release our possible ticket in the FIFO */
+	/* This is a statement level variable. */
+	trx->fts_next_doc_id = 0;
 
-		srv_conc_force_exit_innodb(trx);
-	}
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* Tell the InnoDB server that there might be work for utility
 	threads: */
@@ -2933,11 +3397,13 @@ static
 int
 innobase_rollback(
 /*==============*/
-        handlerton *hton, /*!< in: Innodb handlerton */ 
-	THD*	thd,	/*!< in: handle to the MySQL thread of the user
-			whose transaction should be rolled back */
-	bool	all)	/*!< in:	TRUE - commit transaction
-				FALSE - the current SQL statement ended */
+	handlerton*	hton,		/*!< in: Innodb handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread
+					of the user whose transaction should
+					be rolled back */
+	bool		rollback_trx)	/*!< in: TRUE - rollback entire
+					transaction FALSE - rollback the current
+					statement only */
 {
 	int	error = 0;
 	trx_t*	trx;
@@ -2949,10 +3415,12 @@ innobase_rollback(
 	trx = check_trx_exists(thd);
 
 	/* Release a possible FIFO ticket and search latch. Since we will
-	reserve the kernel mutex, we have to release the search system latch
-	first to obey the latching order. */
+	reserve the trx_sys->mutex, we have to release the search system
+	latch first to obey the latching order. */
+
+	trx_search_latch_release_if_reserved(trx);
 
-	innobase_release_stat_resources(trx);
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
 
@@ -2960,9 +3428,12 @@ innobase_rollback(
 	we come here to roll back the latest SQL statement) we
 	release it now before a possibly lengthy rollback */
 
-	row_unlock_table_autoinc_for_mysql(trx);
+	lock_unlock_table_autoinc(trx);
 
-	if (all
+	/* This is a statement level variable. */
+	trx->fts_next_doc_id = 0;
+
+	if (rollback_trx
 	    || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
 
 		error = trx_rollback_for_mysql(trx);
@@ -2989,18 +3460,22 @@ innobase_rollback_trx(
 	DBUG_PRINT("trans", ("aborting transaction"));
 
 	/* Release a possible FIFO ticket and search latch. Since we will
-	reserve the kernel mutex, we have to release the search system latch
-	first to obey the latching order. */
+	reserve the trx_sys->mutex, we have to release the search system
+	latch first to obey the latching order. */
+
+	trx_search_latch_release_if_reserved(trx);
 
-	innobase_release_stat_resources(trx);
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* If we had reserved the auto-inc lock for some table (if
 	we come here to roll back the latest SQL statement) we
 	release it now before a possibly lengthy rollback */
 
-	row_unlock_table_autoinc_for_mysql(trx);
+	lock_unlock_table_autoinc(trx);
 
-	error = trx_rollback_for_mysql(trx);
+	if (!trx->read_only) {
+		error = trx_rollback_for_mysql(trx);
+	}
 
 	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 }
@@ -3013,10 +3488,11 @@ static
 int
 innobase_rollback_to_savepoint(
 /*===========================*/
-        handlerton *hton,       /*!< in: Innodb handlerton */ 
-	THD*	thd,		/*!< in: handle to the MySQL thread of the user
-				whose transaction should be rolled back */
-	void*	savepoint)	/*!< in: savepoint data */
+	handlerton*	hton,		/*!< in: Innodb handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread
+					of the user whose transaction should
+					be rolled back to savepoint */
+	void*		savepoint)	/*!< in: savepoint data */
 {
 	ib_int64_t	mysql_binlog_cache_pos;
 	int		error = 0;
@@ -3029,17 +3505,24 @@ innobase_rollback_to_savepoint(
 	trx = check_trx_exists(thd);
 
 	/* Release a possible FIFO ticket and search latch. Since we will
-	reserve the kernel mutex, we have to release the search system latch
-	first to obey the latching order. */
+	reserve the trx_sys->mutex, we have to release the search system
+	latch first to obey the latching order. */
+
+	trx_search_latch_release_if_reserved(trx);
 
-	innobase_release_stat_resources(trx);
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* TODO: use provided savepoint data area to store savepoint data */
 
-	longlong2str((ulint)savepoint, name, 36);
+	longlong2str((ulint) savepoint, name, 36);
+
+	error = (int) trx_rollback_to_savepoint_for_mysql(
+		trx, name, &mysql_binlog_cache_pos);
+
+	if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+		fts_savepoint_rollback(trx, name);
+	}
 
-	error = (int) trx_rollback_to_savepoint_for_mysql(trx, name,
-						&mysql_binlog_cache_pos);
 	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 }
 
@@ -3051,10 +3534,11 @@ static
 int
 innobase_release_savepoint(
 /*=======================*/
-        handlerton*	hton,	/*!< in: handlerton for Innodb */
-	THD*	thd,		/*!< in: handle to the MySQL thread of the user
-				whose transaction should be rolled back */
-	void*	savepoint)	/*!< in: savepoint data */
+	handlerton*	hton,		/*!< in: handlerton for Innodb */
+	THD*		thd,		/*!< in: handle to the MySQL thread
+					of the user whose transaction's
+					savepoint should be released */
+	void*		savepoint)	/*!< in: savepoint data */
 {
 	int		error = 0;
 	trx_t*		trx;
@@ -3067,10 +3551,14 @@ innobase_release_savepoint(
 
 	/* TODO: use provided savepoint data area to store savepoint data */
 
-	longlong2str((ulint)savepoint, name, 36);
+	longlong2str((ulint) savepoint, name, 36);
 
 	error = (int) trx_release_savepoint_for_mysql(trx, name);
 
+	if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+		fts_savepoint_release(trx, name);
+	}
+
 	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 }
 
@@ -3081,7 +3569,7 @@ static
 int
 innobase_savepoint(
 /*===============*/
-	handlerton*	hton,   /*!< in: handle to the Innodb handlerton */
+	handlerton*	hton,	/*!< in: handle to the Innodb handlerton */
 	THD*	thd,		/*!< in: handle to the MySQL thread */
 	void*	savepoint)	/*!< in: savepoint data */
 {
@@ -3091,33 +3579,33 @@ innobase_savepoint(
 	DBUG_ENTER("innobase_savepoint");
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
-	/*
-	  In the autocommit mode there is no sense to set a savepoint
-	  (unless we are in sub-statement), so SQL layer ensures that
-	  this method is never called in such situation.
-	*/
-#ifdef MYSQL_SERVER /* plugins cannot access thd->in_sub_stmt */
-	DBUG_ASSERT(thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) ||
-		thd->in_sub_stmt);
-#endif /* MYSQL_SERVER */
+	/* In the autocommit mode there is no sense to set a savepoint
+	(unless we are in sub-statement), so SQL layer ensures that
+	this method is never called in such situation.  */
 
 	trx = check_trx_exists(thd);
 
 	/* Release a possible FIFO ticket and search latch. Since we will
-	reserve the kernel mutex, we have to release the search system latch
-	first to obey the latching order. */
+	reserve the trx_sys->mutex, we have to release the search system
+	latch first to obey the latching order. */
 
-	innobase_release_stat_resources(trx);
+	trx_search_latch_release_if_reserved(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* Cannot happen outside of transaction */
 	DBUG_ASSERT(trx_is_registered_for_2pc(trx));
 
 	/* TODO: use provided savepoint data area to store savepoint data */
 	char name[64];
-	longlong2str((ulint)savepoint,name,36);
+	longlong2str((ulint) savepoint,name,36);
 
 	error = (int) trx_savepoint_for_mysql(trx, name, (ib_int64_t)0);
 
+	if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+		fts_savepoint_take(trx, name);
+	}
+
 	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 }
 
@@ -3128,9 +3616,9 @@ static
 int
 innobase_close_connection(
 /*======================*/
-        handlerton*	hton,	/*!< in:  innobase handlerton */
-	THD*	thd)	/*!< in: handle to the MySQL thread of the user
-			whose resources should be free'd */
+	handlerton*	hton,	/*!< in: innobase handlerton */
+	THD*		thd)	/*!< in: handle to the MySQL thread of the user
+				whose resources should be free'd */
 {
 	trx_t*	trx;
 
@@ -3146,14 +3634,13 @@ innobase_close_connection(
 				"but transaction is active");
 	}
 
-
 	if (trx_is_started(trx) && global_system_variables.log_warnings) {
 
 		sql_print_warning(
 			"MySQL is closing a connection that has an active "
-			"InnoDB transaction.  %llu row modifications will "
-			"roll back.",
-			(ullint) trx->undo_no);
+			"InnoDB transaction.  "TRX_ID_FMT" row modifications "
+			"will roll back.",
+			trx->undo_no);
 	}
 
 	innobase_rollback_trx(trx);
@@ -3163,7 +3650,6 @@ innobase_close_connection(
 	DBUG_RETURN(0);
 }
 
-
 /*************************************************************************//**
 ** InnoDB database tables
 *****************************************************************************/
@@ -3180,24 +3666,15 @@ ha_innobase::get_row_type() const
 	if (prebuilt && prebuilt->table) {
 		const ulint	flags = prebuilt->table->flags;
 
-		if (UNIV_UNLIKELY(!flags)) {
+		switch (dict_tf_get_rec_format(flags)) {
+		case REC_FORMAT_REDUNDANT:
 			return(ROW_TYPE_REDUNDANT);
-		}
-
-		ut_ad(flags & DICT_TF_COMPACT);
-
-		switch (flags & DICT_TF_FORMAT_MASK) {
-		case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT:
+		case REC_FORMAT_COMPACT:
 			return(ROW_TYPE_COMPACT);
-		case DICT_TF_FORMAT_ZIP << DICT_TF_FORMAT_SHIFT:
-			if (flags & DICT_TF_ZSSIZE_MASK) {
-				return(ROW_TYPE_COMPRESSED);
-			} else {
-				return(ROW_TYPE_DYNAMIC);
-			}
-#if DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX
-# error "DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX"
-#endif
+		case REC_FORMAT_COMPRESSED:
+			return(ROW_TYPE_COMPRESSED);
+		case REC_FORMAT_DYNAMIC:
+			return(ROW_TYPE_DYNAMIC);
 		}
 	}
 	ut_ad(0);
@@ -3214,19 +3691,22 @@ handler::Table_flags
 ha_innobase::table_flags() const
 /*============================*/
 {
-       /* Need to use tx_isolation here since table flags is (also)
-          called before prebuilt is inited. */
-        ulong const tx_isolation = thd_tx_isolation(ha_thd());
-        if (tx_isolation <= ISO_READ_COMMITTED)
-                return int_table_flags;
-        return int_table_flags | HA_BINLOG_STMT_CAPABLE;
+	/* Need to use tx_isolation here since table flags is (also)
+	called before prebuilt is inited. */
+	ulong const tx_isolation = thd_tx_isolation(ha_thd());
+
+	if (tx_isolation <= ISO_READ_COMMITTED) {
+		return(int_table_flags);
+	}
+
+	return(int_table_flags | HA_BINLOG_STMT_CAPABLE);
 }
 
 /****************************************************************//**
 Gives the file extension of an InnoDB single-table tablespace. */
 static const char* ha_innobase_exts[] = {
-  ".ibd",
-  NullS
+	".ibd",
+	NullS
 };
 
 /****************************************************************//**
@@ -3241,15 +3721,21 @@ ha_innobase::table_type() const
 }
 
 /****************************************************************//**
-Returns the index type. */
+Returns the index type.
+@return index type */
 UNIV_INTERN
 const char*
 ha_innobase::index_type(
 /*====================*/
-	uint)
-				/*!< out: index type */
+	uint	keynr)		/*!< : index number */
 {
-	return("BTREE");
+	dict_index_t*	index = innobase_get_index(keynr);
+
+	if (index && index->type & DICT_FTS) {
+		return("FULLTEXT");
+	} else {
+		return("BTREE");
+	}
 }
 
 /****************************************************************//**
@@ -3270,16 +3756,18 @@ UNIV_INTERN
 ulong
 ha_innobase::index_flags(
 /*=====================*/
-	uint index,
+	uint	key,
 	uint,
-	bool)
-const
+	bool) const
 {
-       ulong extra_flag= 0;
-       if (table && index == table->s->primary_key)
-             extra_flag= HA_CLUSTERED_INDEX;
-	return(HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER | extra_flag
-	       | HA_READ_RANGE | HA_KEYREAD_ONLY);
+	ulong extra_flag= 0;
+	if (key == table_share->primary_key)
+	  extra_flag= HA_CLUSTERED_INDEX;
+	return((table_share->key_info[key].algorithm == HA_KEY_ALG_FULLTEXT)
+		 ? 0
+		 : (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER
+		  | HA_READ_RANGE | HA_KEYREAD_ONLY | extra_flag
+		  | HA_DO_INDEX_COND_PUSHDOWN));
 }
 
 /****************************************************************//**
@@ -3302,11 +3790,25 @@ ha_innobase::max_supported_key_length() const
 /*=========================================*/
 {
 	/* An InnoDB page must store >= 2 keys; a secondary key record
-	must also contain the primary key value: max key length is
-	therefore set to slightly less than 1 / 4 of page size which
-	is 16 kB; but currently MySQL does not work with keys whose
-	size is > MAX_KEY_LENGTH */
-	return(3500);
+	must also contain the primary key value.  Therefore, if both
+	the primary key and the secondary key are at this maximum length,
+	it must be less than 1/4th of the free space on a page including
+	record overhead.
+
+	MySQL imposes its own limit to this number; MAX_KEY_LENGTH = 3072.
+
+	For page sizes = 16k, InnoDB historically reported 3500 bytes here,
+	But the MySQL limit of 3072 was always used through the handler
+	interface. */
+
+	switch (UNIV_PAGE_SIZE) {
+	case 4096:
+		return(768);
+	case 8192:
+		return(1536);
+	default:
+		return(3500);
+	}
 }
 
 /****************************************************************//**
@@ -3315,6 +3817,7 @@ Returns the key map of keys that are usable for scanning.
 UNIV_INTERN
 const key_map*
 ha_innobase::keys_to_use_for_scanning()
+/*===================================*/
 {
 	return(&key_map_full);
 }
@@ -3325,6 +3828,7 @@ Determines if table caching is supported.
 UNIV_INTERN
 uint8
 ha_innobase::table_cache_type()
+/*===========================*/
 {
 	return(HA_CACHE_TBL_ASKTRANSACT);
 }
@@ -3335,6 +3839,7 @@ Determines if the primary key is clustered index.
 UNIV_INTERN
 bool
 ha_innobase::primary_key_is_clustered()
+/*===================================*/
 {
 	return(true);
 }
@@ -3360,8 +3865,8 @@ normalize_table_name_low(
 	char*		norm_name,	/*!< out: normalized name as a
 					null-terminated string */
 	const char*	name,		/*!< in: table name string */
-	ibool		set_lower_case) /*!< in: TRUE if we want to set
-					name to lower case */
+	ibool		set_lower_case)	/*!< in: TRUE if we want to set name
+					to lower case */
 {
 	char*	name_ptr;
 	char*	db_ptr;
@@ -3484,7 +3989,7 @@ innobase_get_int_col_max_value(
 {
 	ulonglong	max_value = 0;
 
-	switch(field->key_type()) {
+	switch (field->key_type()) {
 	/* TINY */
 	case HA_KEYTYPE_BINARY:
 		max_value = 0xFFULL;
@@ -3616,13 +4121,13 @@ static
 ibool
 innobase_build_index_translation(
 /*=============================*/
-	const TABLE*		table,	  /*!< in: table in MySQL data
-					  dictionary */
-	dict_table_t*		ib_table, /*!< in: table in Innodb data
-					  dictionary */
-	INNOBASE_SHARE*		share)	  /*!< in/out: share structure
-					  where index translation table
-					  will be constructed in. */
+	const TABLE*		table,	/*!< in: table in MySQL data
+					dictionary */
+	dict_table_t*		ib_table,/*!< in: table in Innodb data
+					dictionary */
+	INNOBASE_SHARE*		share)	/*!< in/out: share structure
+					where index translation table
+					will be constructed in. */
 {
 	ulint		mysql_num_index;
 	ulint		ib_num_index;
@@ -3796,7 +4301,7 @@ ha_innobase::innobase_initialize_autoinc()
 	} else {
 		dict_index_t*	index;
 		const char*	col_name;
-		ulonglong	read_auto_inc;
+		ib_uint64_t	read_auto_inc;
 		ulint		err;
 
 		update_thd(ha_thd());
@@ -3819,7 +4324,7 @@ ha_innobase::innobase_initialize_autoinc()
 			nor the offset, so use a default increment of 1. */
 
 			auto_inc = innobase_next_autoinc(
-				read_auto_inc, 1, 1, 0, col_max_value);
+				read_auto_inc, 1, 1, col_max_value);
 
 			break;
 		}
@@ -3917,7 +4422,7 @@ ha_innobase::open(
 
 retry:
 	/* Get pointer to a table object in InnoDB dictionary cache */
-	ib_table = dict_table_get(norm_name, TRUE);
+	ib_table = dict_table_open_on_name(norm_name, FALSE);
 
 	if (NULL == ib_table) {
 		if (is_part && retries < 10) {
@@ -3960,9 +4465,10 @@ retry:
 					par_case_name_set = TRUE;
 				}
 
-				ib_table = dict_table_get(
+				ib_table = dict_table_open_on_name(
 					par_case_name, FALSE);
 			}
+
 			if (!ib_table) {
 				++retries;
 				os_thread_sleep(100000);
@@ -4021,6 +4527,8 @@ retry:
 
 table_opened:
 
+	MONITOR_INC(MONITOR_TABLE_OPEN);
+
 	if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) {
 		sql_print_error("MySQL is trying to open a table handle but "
 				"the .ibd file for\ntable %s does not exist.\n"
@@ -4033,7 +4541,7 @@ table_opened:
 		free_share(share);
 		my_errno = ENOENT;
 
-		dict_table_decrement_handle_count(ib_table, FALSE);
+		dict_table_close(ib_table, FALSE);
 		DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
 	}
 
@@ -4043,7 +4551,6 @@ table_opened:
 	ut_ad(prebuilt->default_rec);
 
 	/* Looks like MySQL-3.23 sometimes has primary key number != 0 */
-
 	primary_key = table->s->primary_key;
 	key_used_on_scan = primary_key;
 
@@ -4071,7 +4578,7 @@ table_opened:
 			if not attended, bring this to the user's attention
 			by printing a warning in addition to log a message
 			in the errorlog */
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 					    ER_NO_SUCH_INDEX,
 					    "InnoDB: Table %s has a "
 					    "primary key in InnoDB data "
@@ -4094,9 +4601,14 @@ table_opened:
 			that user can adopt necessary measures for the
 			mismatch while still being accessible to the table
 			date. */
-			ref_length = table->key_info[0].key_length;
+			if (!table->key_info) {
+				ut_ad(!table->s->keys);
+				ref_length = 0;
+			} else {
+				ref_length = table->key_info[0].key_length;
+			}
 
-			/* Find correspoinding cluster index
+			/* Find corresponding cluster index
 			key length in MySQL's key_info[] array */
 			for (ulint i = 0; i < table->s->keys; i++) {
 				dict_index_t*	index;
@@ -4133,7 +4645,7 @@ table_opened:
 			if not attended, bring this to the user attention
 			by printing a warning in addition to log a message
 			in the errorlog */
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 					    ER_NO_SUCH_INDEX,
 					    "InnoDB: Table %s has no "
 					    "primary key in InnoDB data "
@@ -4162,7 +4674,7 @@ table_opened:
 	}
 
 	/* Index block size in InnoDB: used by MySQL in query optimization */
-	stats.block_size = 16 * 1024;
+	stats.block_size = UNIV_PAGE_SIZE;
 
 	/* Init table lock structure */
 	thr_lock_data_init(&share->lock,&lock,(void*) 0);
@@ -4198,33 +4710,9 @@ table_opened:
 }
 
 UNIV_INTERN
-handler*
-ha_innobase::clone(
-/*===============*/
-	const char*	name,		/*!< in: table name */
-	MEM_ROOT*	mem_root)	/*!< in: memory context */
-{
-	ha_innobase* new_handler;
-
-	DBUG_ENTER("ha_innobase::clone");
-
-	new_handler = static_cast<ha_innobase*>(handler::clone(name,
-							       mem_root));
-	if (new_handler) {
-		DBUG_ASSERT(new_handler->prebuilt != NULL);
-		DBUG_ASSERT(new_handler->user_thd == user_thd);
-		DBUG_ASSERT(new_handler->prebuilt->trx == prebuilt->trx);
-
-		new_handler->prebuilt->select_lock_type
-			= prebuilt->select_lock_type;
-	}
-
-	DBUG_RETURN(new_handler);
-}
-
-UNIV_INTERN
 uint
 ha_innobase::max_supported_key_part_length() const
+/*==============================================*/
 {
 	/* A table format specific index column length check will be performed
 	at ha_innobase::add_index() and row_create_index_for_mysql() */
@@ -4238,8 +4726,8 @@ Closes a handle to an InnoDB table.
 @return	0 */
 UNIV_INTERN
 int
-ha_innobase::close(void)
-/*====================*/
+ha_innobase::close()
+/*================*/
 {
 	THD*	thd;
 
@@ -4261,6 +4749,8 @@ ha_innobase::close(void)
 
 	free_share(share);
 
+	MONITOR_INC(MONITOR_TABLE_CLOSE);
+
 	/* Tell InnoDB server that there might be work for
 	utility threads: */
 
@@ -4278,8 +4768,8 @@ static inline
 uint
 get_field_offset(
 /*=============*/
-	TABLE*	table,	/*!< in: MySQL table object */
-	Field*	field)	/*!< in: MySQL field object */
+	const TABLE*	table,	/*!< in: MySQL table object */
+	const Field*	field)	/*!< in: MySQL field object */
 {
 	return((uint) (field->ptr - table->record[0]));
 }
@@ -4304,7 +4794,7 @@ field_in_record_is_null(
 	}
 
 	null_offset = (uint) ((char*) field->null_ptr
-					- (char*) table->record[0]);
+		    - (char*) table->record[0]);
 
 	if (record[null_offset] & field->null_bit) {
 
@@ -4317,10 +4807,10 @@ field_in_record_is_null(
 /*************************************************************//**
 InnoDB uses this function to compare two data fields for which the data type
 is such that we must use MySQL code to compare them. NOTE that the prototype
-of this function is in rem0cmp.c in InnoDB source code! If you change this
+of this function is in rem0cmp.cc in InnoDB source code! If you change this
 function, remember to update the prototype there!
 @return	1, 0, -1, if a is greater, equal, less than b, respectively */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 int
 innobase_mysql_cmp(
 /*===============*/
@@ -4378,9 +4868,9 @@ innobase_mysql_cmp(
 		changes then: 'b\0\0...' is ordered BEFORE 'b  ...'. Users
 		having indexes on such data need to rebuild their tables! */
 
-		ret = charset->coll->strnncollsp(charset,
-				  a, a_length,
-						 b, b_length, 0);
+		ret = charset->coll->strnncollsp(
+			charset, a, a_length, b, b_length, 0);
+
 		if (ret < 0) {
 			return(-1);
 		} else if (ret > 0) {
@@ -4395,12 +4885,308 @@ innobase_mysql_cmp(
 	return(0);
 }
 
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+UNIV_INTERN
+CHARSET_INFO*
+innobase_get_fts_charset(
+/*=====================*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number)	/*!< in: number of the charset */
+{
+	enum_field_types	mysql_tp;
+	CHARSET_INFO*		charset;
+
+	mysql_tp = (enum_field_types) mysql_type;
+
+	switch (mysql_tp) {
+
+	case MYSQL_TYPE_BIT:
+	case MYSQL_TYPE_STRING:
+	case MYSQL_TYPE_VAR_STRING:
+	case MYSQL_TYPE_TINY_BLOB:
+	case MYSQL_TYPE_MEDIUM_BLOB:
+	case MYSQL_TYPE_BLOB:
+	case MYSQL_TYPE_LONG_BLOB:
+	case MYSQL_TYPE_VARCHAR:
+		/* Use the charset number to pick the right charset struct for
+		the comparison. Since the MySQL function get_charset may be
+		slow before Bar removes the mutex operation there, we first
+		look at 2 common charsets directly. */
+
+		if (charset_number == default_charset_info->number) {
+			charset = default_charset_info;
+		} else if (charset_number == my_charset_latin1.number) {
+			charset = &my_charset_latin1;
+		} else {
+			charset = get_charset(charset_number, MYF(MY_WME));
+
+			if (charset == NULL) {
+			  sql_print_error("InnoDB needs charset %lu for doing "
+					  "a comparison, but MySQL cannot "
+					  "find that charset.",
+					  (ulong) charset_number);
+				ut_a(0);
+			}
+		}
+		break;
+	default:
+		ut_error;
+	}
+
+	return(charset);
+}
+
+/*************************************************************//**
+InnoDB uses this function to compare two data fields for which the data type
+is such that we must use MySQL code to compare them. NOTE that the prototype
+of this function is in rem0cmp.c in InnoDB source code! If you change this
+function, remember to update the prototype there!
+@return	1, 0, -1, if a is greater, equal, less than b, respectively */
+UNIV_INTERN
+int
+innobase_mysql_cmp_prefix(
+/*======================*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number,	/*!< in: number of the charset */
+	const unsigned char* a,		/*!< in: data field */
+	unsigned int	a_length,	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+	const unsigned char* b,		/*!< in: data field */
+	unsigned int	b_length)	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+{
+	CHARSET_INFO*		charset;
+	int			result;
+
+	charset = innobase_get_fts_charset(mysql_type, charset_number);
+
+	result = ha_compare_text(charset, (uchar*) a, a_length,
+				 (uchar*) b, b_length, 1, 0);
+
+	return(result);
+}
+/******************************************************************//**
+compare two character string according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_text_cmp(
+/*==================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*     p1,		/*!< in: key */
+	const void*     p2)		/*!< in: node */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+
+	return(ha_compare_text(charset, s1->f_str, s1->f_len,
+			       s2->f_str, s2->f_len, 0, 0));
+}
+/******************************************************************//**
+compare two character string case insensitively according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*     p1,		/*!< in: key */
+	const void*     p2)		/*!< in: node */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+	ulint			newlen;
+
+	my_casedn_str(charset, (char*) s2->f_str);
+
+	newlen = strlen((const char*) s2->f_str);
+
+	return(ha_compare_text(charset, s1->f_str, s1->f_len,
+			       s2->f_str, newlen, 0, 0));
+}
+/******************************************************************//**
+Get the first character's code position for FTS index partition. */
+UNIV_INTERN
+ulint
+innobase_strnxfrm(
+/*==============*/
+	const CHARSET_INFO*
+			cs,		/*!< in: Character set */
+	const uchar*	str,		/*!< in: string */
+	const ulint	len)		/*!< in: string length */
+{
+	uchar		mystr[2];
+	ulint		value;
+
+	if (!str || len == 0) {
+		return(0);
+	}
+
+	my_strnxfrm(cs, (uchar*) mystr, 2, str, len);
+
+	value = mach_read_from_2(mystr);
+
+	if (value > 255) {
+		value = value / 256;
+	}
+
+	return(value);
+}
+
+/******************************************************************//**
+compare two character string according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*	p1,		/*!< in: prefix key */
+	const void*	p2)		/*!< in: value to compare */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+	int			result;
+
+	result = ha_compare_text(charset, s2->f_str, s2->f_len,
+				 s1->f_str, s1->f_len, 1, 0);
+
+	/* We switched s1, s2 position in ha_compare_text. So we need
+	to negate the result */
+	return(-result);
+}
+/******************************************************************//**
+compare two character string according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_string_cmp(
+/*====================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*     p1,		/*!< in: key */
+	const void*     p2)		/*!< in: node */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	uchar*			s1 = (uchar*) p1;
+	uchar*			s2 = *(uchar**) p2;
+
+	return(ha_compare_text(charset, s1, strlen((const char*) s1),
+			       s2, strlen((const char*) s2), 0, 0));
+}
+/******************************************************************//**
+Makes all characters in a string lower case. */
+UNIV_INTERN
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+	CHARSET_INFO*	cs,	/*!< in: Character set */
+	char*		src,	/*!< in: string to put in lower case */
+	size_t		src_len,/*!< in: input string length */
+	char*		dst,	/*!< in: buffer for result string */
+	size_t		dst_len)/*!< in: buffer size */
+{
+	if (cs->casedn_multiply == 1) {
+		memcpy(dst, src, src_len);
+		dst[src_len] = 0;
+		my_casedn_str(cs, dst);
+
+		return(strlen(dst));
+	} else {
+		return(cs->cset->casedn(cs, src, src_len, dst, dst_len));
+	}
+}
+
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+#define misc_word_char(X)       0
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token.
+It is mostly copied from MyISAM's doc parsing function ft_simple_get_word()
+@return length of string processed */
+UNIV_INTERN
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+	CHARSET_INFO*	cs,		/*!< in: Character set */
+	byte*           start,		/*!< in: start of text */
+	byte*		end,		/*!< in: one character past end of
+					text */
+	fts_string_t*	token,		/*!< out: token's text */
+	ulint*		offset)		/*!< out: offset to token,
+					measured as characters from
+					'start' */
+{
+	int		mbl;
+	uchar*		doc = start;
+
+	ut_a(cs);
+
+	token->f_n_char = token->f_len = 0;
+
+	do {
+		for (;;) {
+
+			if (doc >= end) {
+				return(doc - start);
+			}
+
+			int	ctype;
+
+			mbl = cs->cset->ctype(
+				cs, &ctype, (uchar*) doc, (uchar*) end);
+
+			if (true_word_char(ctype, *doc)) {
+				break;
+			}
+
+			doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
+		}
+
+		ulint	mwc = 0;
+		ulint	length = 0;
+
+		token->f_str = doc;
+
+		while (doc < end) {
+
+			int	ctype;
+
+			mbl = cs->cset->ctype(
+				cs, &ctype, (uchar*) doc, (uchar*) end);
+
+			if (true_word_char(ctype, *doc)) {
+				mwc = 0;
+			} else if (!misc_word_char(*doc) || mwc) {
+				break;
+			} else {
+				++mwc;
+			}
+
+			++length;
+
+			doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
+		}
+
+		token->f_len = (uint) (doc - token->f_str) - mwc;
+		token->f_n_char = length;
+
+		return(doc - start);
+
+	} while (doc < end);
+
+	token->f_str[token->f_len] = 0;
+
+	return(doc - start);
+}
+
 /**************************************************************//**
 Converts a MySQL type to an InnoDB type. Note that this function returns
 the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
 VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
 @return	DATA_BINARY, DATA_VARCHAR, ... */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulint
 get_innobase_type_from_mysql_type(
 /*==============================*/
@@ -4477,7 +5263,6 @@ get_innobase_type_from_mysql_type(
 	case HA_KEYTYPE_END:
 		ut_error;
 	}
-
 	return(0);
 }
 
@@ -4558,7 +5343,7 @@ ha_innobase::store_key_val_for_row(
 	simple memcmp to compare two key values to determine if they are
 	equal. MySQL does this to compare contents of two 'ref' values. */
 
-	bzero(buff, buff_len);
+	memset(buff, 0, buff_len);
 
 	for (; key_part != end; key_part++) {
 		is_null = FALSE;
@@ -4584,7 +5369,7 @@ ha_innobase::store_key_val_for_row(
 			const byte*	data;
 			ulint		key_len;
 			ulint		true_len;
-			CHARSET_INFO*	cs;
+			const CHARSET_INFO* cs;
 			int		error=0;
 
 			key_len = key_part->length;
@@ -4597,11 +5382,11 @@ ha_innobase::store_key_val_for_row(
 			cs = field->charset();
 
 			lenlen = (ulint)
-				(((Field_varstring*)field)->length_bytes);
+				(((Field_varstring*) field)->length_bytes);
 
 			data = row_mysql_read_true_varchar(&len,
 				(byte*) (record
-				+ (ulint)get_field_offset(table, field)),
+				+ (ulint) get_field_offset(table, field)),
 				lenlen);
 
 			true_len = len;
@@ -4611,10 +5396,9 @@ ha_innobase::store_key_val_for_row(
 
 			if (len > 0 && cs->mbmaxlen > 1) {
 				true_len = (ulint) cs->cset->well_formed_len(cs,
-						(const char *) data,
-						(const char *) data + len,
-                                                (uint) (key_len /
-                                                        cs->mbmaxlen),
+						(const char*) data,
+						(const char*) data + len,
+						(uint) (key_len / cs->mbmaxlen),
 						&error);
 			}
 
@@ -4628,7 +5412,7 @@ ha_innobase::store_key_val_for_row(
 			/* The length in a key value is always stored in 2
 			bytes */
 
-			row_mysql_store_true_var_len((byte*)buff, true_len, 2);
+			row_mysql_store_true_var_len((byte*) buff, true_len, 2);
 			buff += 2;
 
 			memcpy(buff, data, true_len);
@@ -4637,7 +5421,7 @@ ha_innobase::store_key_val_for_row(
 			length of the true VARCHAR in the key value, though
 			only len first bytes after the 2 length bytes contain
 			actual data. The rest of the space was reset to zero
-			in the bzero() call above. */
+			in the memset() call above. */
 
 			buff += key_len;
 
@@ -4649,7 +5433,7 @@ ha_innobase::store_key_val_for_row(
 			as BLOB data in innodb. */
 			|| mysql_type == MYSQL_TYPE_GEOMETRY) {
 
-			CHARSET_INFO*	cs;
+			const CHARSET_INFO* cs;
 			ulint		key_len;
 			ulint		true_len;
 			int		error=0;
@@ -4670,7 +5454,7 @@ ha_innobase::store_key_val_for_row(
 
 			blob_data = row_mysql_read_blob_ref(&blob_len,
 				(byte*) (record
-				+ (ulint)get_field_offset(table, field)),
+				+ (ulint) get_field_offset(table, field)),
 					(ulint) field->pack_length());
 
 			true_len = blob_len;
@@ -4683,11 +5467,10 @@ ha_innobase::store_key_val_for_row(
 
 			if (blob_len > 0 && cs->mbmaxlen > 1) {
 				true_len = (ulint) cs->cset->well_formed_len(cs,
-						(const char *) blob_data,
-						(const char *) blob_data
+						(const char*) blob_data,
+						(const char*) blob_data
 							+ blob_len,
-                                                (uint) (key_len /
-                                                        cs->mbmaxlen),
+						(uint) (key_len / cs->mbmaxlen),
 						&error);
 			}
 
@@ -4703,7 +5486,7 @@ ha_innobase::store_key_val_for_row(
 			storage of the number is little-endian */
 
 			innobase_write_to_2_little_endian(
-					(byte*)buff, true_len);
+					(byte*) buff, true_len);
 			buff += 2;
 
 			memcpy(buff, blob_data, true_len);
@@ -4718,7 +5501,7 @@ ha_innobase::store_key_val_for_row(
 			value we store may be also in a column prefix
 			index. */
 
-			CHARSET_INFO*		cs;
+			const CHARSET_INFO*	cs = NULL;
 			ulint			true_len;
 			ulint			key_len;
 			const uchar*		src_start;
@@ -4756,11 +5539,11 @@ ha_innobase::store_key_val_for_row(
 
 					true_len = (ulint)
 						cs->cset->well_formed_len(cs,
-							(const char *)src_start,
-							(const char *)src_start
+							(const char*) src_start,
+							(const char*) src_start
 								+ key_len,
-                                                        (uint) (key_len /
-                                                                cs->mbmaxlen),
+							(uint) (key_len
+								/ cs->mbmaxlen),
 							&error);
 				}
 			}
@@ -4772,6 +5555,7 @@ ha_innobase::store_key_val_for_row(
 
 			if (true_len < key_len) {
 				ulint	pad_len = key_len - true_len;
+				ut_a(cs != NULL);
 				ut_a(!(pad_len % cs->mbminlen));
 
 				cs->cset->fill(cs, buff, pad_len,
@@ -4787,41 +5571,189 @@ ha_innobase::store_key_val_for_row(
 }
 
 /**************************************************************//**
+Determines if a field is needed in a prebuilt struct 'template'.
+@return field to use, or NULL if the field is not needed */
+static
+const Field*
+build_template_needs_field(
+/*=======================*/
+	ibool		index_contains,	/*!< in:
+					dict_index_contains_col_or_prefix(
+					index, i) */
+	ibool		read_just_key,	/*!< in: TRUE when MySQL calls
+					ha_innobase::extra with the
+					argument HA_EXTRA_KEYREAD; it is enough
+					to read just columns defined in
+					the index (i.e., no read of the
+					clustered index record necessary) */
+	ibool		fetch_all_in_key,
+					/*!< in: true=fetch all fields in
+					the index */
+	ibool		fetch_primary_key_cols,
+					/*!< in: true=fetch the
+					primary key columns */
+	dict_index_t*	index,		/*!< in: InnoDB index to use */
+	const TABLE*	table,		/*!< in: MySQL table object */
+	ulint		i)		/*!< in: field index in InnoDB table */
+{
+	const Field*	field	= table->field[i];
+
+	ut_ad(index_contains == dict_index_contains_col_or_prefix(index, i));
+
+	if (!index_contains) {
+		if (read_just_key) {
+			/* If this is a 'key read', we do not need
+			columns that are not in the key */
+
+			return(NULL);
+		}
+	} else if (fetch_all_in_key) {
+		/* This field is needed in the query */
+
+		return(field);
+	}
+
+	if (bitmap_is_set(table->read_set, i)
+	    || bitmap_is_set(table->write_set, i)) {
+		/* This field is needed in the query */
+
+		return(field);
+	}
+
+	if (fetch_primary_key_cols
+	    && dict_table_col_in_clustered_key(index->table, i)) {
+		/* This field is needed in the query */
+
+		return(field);
+	}
+
+	/* This field is not needed in the query, skip it */
+
+	return(NULL);
+}
+
+/**************************************************************//**
+Determines if a field is needed in a prebuilt struct 'template'.
+@return whether the field is needed for index condition pushdown */
+inline
+bool
+build_template_needs_field_in_icp(
+/*==============================*/
+	const dict_index_t*	index,	/*!< in: InnoDB index */
+	const row_prebuilt_t*	prebuilt,/*!< in: row fetch template */
+	bool			contains,/*!< in: whether the index contains
+					column i */
+	ulint			i)	/*!< in: column number */
+{
+	ut_ad(contains == dict_index_contains_col_or_prefix(index, i));
+
+	return(index == prebuilt->index
+	       ? contains
+	       : dict_index_contains_col_or_prefix(prebuilt->index, i));
+}
+
+/**************************************************************//**
+Adds a field to a prebuilt struct 'template'.
+@return the field template */
+static
+mysql_row_templ_t*
+build_template_field(
+/*=================*/
+	row_prebuilt_t*	prebuilt,	/*!< in/out: template */
+	dict_index_t*	clust_index,	/*!< in: InnoDB clustered index */
+	dict_index_t*	index,		/*!< in: InnoDB index to use */
+	TABLE*		table,		/*!< in: MySQL table object */
+	const Field*	field,		/*!< in: field in MySQL table */
+	ulint		i)		/*!< in: field index in InnoDB table */
+{
+	mysql_row_templ_t*	templ;
+	const dict_col_t*	col;
+
+	ut_ad(field == table->field[i]);
+	ut_ad(clust_index->table == index->table);
+
+	col = dict_table_get_nth_col(index->table, i);
+
+	templ = prebuilt->mysql_template + prebuilt->n_template++;
+	UNIV_MEM_INVALID(templ, sizeof *templ);
+	templ->col_no = i;
+	templ->clust_rec_field_no = dict_col_get_clust_pos(col, clust_index);
+	ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+
+	if (dict_index_is_clust(index)) {
+		templ->rec_field_no = templ->clust_rec_field_no;
+	} else {
+		templ->rec_field_no = dict_index_get_nth_col_pos(index, i);
+	}
+
+	if (field->null_ptr) {
+		templ->mysql_null_byte_offset =
+			(ulint) ((char*) field->null_ptr
+				 - (char*) table->record[0]);
+
+		templ->mysql_null_bit_mask = (ulint) field->null_bit;
+	} else {
+		templ->mysql_null_bit_mask = 0;
+	}
+
+	templ->mysql_col_offset = (ulint) get_field_offset(table, field);
+
+	templ->mysql_col_len = (ulint) field->pack_length();
+	templ->type = col->mtype;
+	templ->mysql_type = (ulint) field->type();
+
+	if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+		templ->mysql_length_bytes = (ulint)
+			(((Field_varstring*) field)->length_bytes);
+	}
+
+	templ->charset = dtype_get_charset_coll(col->prtype);
+	templ->mbminlen = dict_col_get_mbminlen(col);
+	templ->mbmaxlen = dict_col_get_mbmaxlen(col);
+	templ->is_unsigned = col->prtype & DATA_UNSIGNED;
+
+	if (!dict_index_is_clust(index)
+	    && templ->rec_field_no == ULINT_UNDEFINED) {
+		prebuilt->need_to_access_clustered = TRUE;
+	}
+
+	if (prebuilt->mysql_prefix_len < templ->mysql_col_offset
+	    + templ->mysql_col_len) {
+		prebuilt->mysql_prefix_len = templ->mysql_col_offset
+			+ templ->mysql_col_len;
+	}
+
+	if (templ->type == DATA_BLOB) {
+		prebuilt->templ_contains_blob = TRUE;
+	}
+
+	return(templ);
+}
+
+/**************************************************************//**
 Builds a 'template' to the prebuilt struct. The template is used in fast
 retrieval of just those column values MySQL needs in its processing. */
-static
+UNIV_INTERN
 void
-build_template(
-/*===========*/
-	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt struct */
-	THD*		thd,		/*!< in: current user thread, used
-					only if templ_type is
-					ROW_MYSQL_REC_FIELDS */
-	TABLE*		table,		/*!< in: MySQL table */
-	uint		templ_type)	/*!< in: ROW_MYSQL_WHOLE_ROW or
-					ROW_MYSQL_REC_FIELDS */
+ha_innobase::build_template(
+/*========================*/
+	bool		whole_row)	/*!< in: true=ROW_MYSQL_WHOLE_ROW,
+					false=ROW_MYSQL_REC_FIELDS */
 {
 	dict_index_t*	index;
 	dict_index_t*	clust_index;
-	mysql_row_templ_t* templ;
-	Field*		field;
 	ulint		n_fields;
-	ulint		n_requested_fields	= 0;
 	ibool		fetch_all_in_key	= FALSE;
 	ibool		fetch_primary_key_cols	= FALSE;
 	ulint		i;
-	/* byte offset of the end of last requested column */
-	ulint		mysql_prefix_len	= 0;
 
 	if (prebuilt->select_lock_type == LOCK_X) {
 		/* We always retrieve the whole clustered index record if we
 		use exclusive row level locks, for example, if the read is
 		done in an UPDATE statement. */
 
-		templ_type = ROW_MYSQL_WHOLE_ROW;
-	}
-
-	if (templ_type == ROW_MYSQL_REC_FIELDS) {
+		whole_row = true;
+	} else if (!whole_row) {
 		if (prebuilt->hint_need_to_fetch_extra_cols
 			== ROW_RETRIEVE_ALL_COLS) {
 
@@ -4838,16 +5770,16 @@ build_template(
 
 				fetch_all_in_key = TRUE;
 			} else {
-				templ_type = ROW_MYSQL_WHOLE_ROW;
+				whole_row = true;
 			}
 		} else if (prebuilt->hint_need_to_fetch_extra_cols
 			== ROW_RETRIEVE_PRIMARY_KEY) {
 			/* We must at least fetch all primary key cols. Note
-			   that if the clustered index was internally generated
-			   by InnoDB on the row id (no primary key was
-			   defined), then row_search_for_mysql() will always
-			   retrieve the row id to a special buffer in the
-			   prebuilt struct. */
+			that if the clustered index was internally generated
+			by InnoDB on the row id (no primary key was
+			defined), then row_search_for_mysql() will always
+			retrieve the row id to a special buffer in the
+			prebuilt struct. */
 
 			fetch_primary_key_cols = TRUE;
 		}
@@ -4855,141 +5787,212 @@ build_template(
 
 	clust_index = dict_table_get_first_index(prebuilt->table);
 
-	if (templ_type == ROW_MYSQL_REC_FIELDS) {
-		index = prebuilt->index;
-	} else {
-		index = clust_index;
-	}
+	index = whole_row ? clust_index : prebuilt->index;
 
-	if (index == clust_index) {
-		prebuilt->need_to_access_clustered = TRUE;
-	} else {
-		prebuilt->need_to_access_clustered = FALSE;
-		/* Below we check column by column if we need to access
-		the clustered index */
-	}
+	prebuilt->need_to_access_clustered = (index == clust_index);
 
-	n_fields = (ulint)table->s->fields; /* number of columns */
+	/* Below we check column by column if we need to access
+	the clustered index. */
+
+	n_fields = (ulint) table->s->fields; /* number of columns */
 
 	if (!prebuilt->mysql_template) {
 		prebuilt->mysql_template = (mysql_row_templ_t*)
 			mem_alloc(n_fields * sizeof(mysql_row_templ_t));
 	}
 
-	prebuilt->template_type = templ_type;
+	prebuilt->template_type = whole_row
+		? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS;
 	prebuilt->null_bitmap_len = table->s->null_bytes;
 
+	/* Prepare to build prebuilt->mysql_template[]. */
 	prebuilt->templ_contains_blob = FALSE;
+	prebuilt->mysql_prefix_len = 0;
+	prebuilt->n_template = 0;
+	prebuilt->idx_cond_n_cols = 0;
+
+	/* Note that in InnoDB, i is the column number in the table.
+	MySQL calls columns 'fields'. */
+
+	if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) {
+		/* Push down an index condition or an end_range check. */
+		for (i = 0; i < n_fields; i++) {
+			const ibool		index_contains
+				= dict_index_contains_col_or_prefix(index, i);
+
+			/* Test if an end_range or an index condition
+			refers to the field. Note that "index" and
+			"index_contains" may refer to the clustered index.
+			Index condition pushdown is relative to prebuilt->index
+			(the index that is being looked up first). */
+
+			/* When join_read_always_key() invokes this
+			code via handler::ha_index_init() and
+			ha_innobase::index_init(), end_range is not
+			yet initialized. Because of that, we must
+			always check for index_contains, instead of
+			the subset
+			field->part_of_key.is_set(active_index)
+			which would be acceptable if end_range==NULL. */
+			if (build_template_needs_field_in_icp(
+				    index, prebuilt, index_contains, i)) {
+				/* Needed in ICP */
+				const Field*		field;
+				mysql_row_templ_t*	templ;
+
+				if (whole_row) {
+					field = table->field[i];
+				} else {
+					field = build_template_needs_field(
+						index_contains,
+						prebuilt->read_just_key,
+						fetch_all_in_key,
+						fetch_primary_key_cols,
+						index, table, i);
+					if (!field) {
+						continue;
+					}
+				}
 
-	/* Note that in InnoDB, i is the column number. MySQL calls columns
-	'fields'. */
-	for (i = 0; i < n_fields; i++) {
-		const dict_col_t* col = &index->table->cols[i];
-		templ = prebuilt->mysql_template + n_requested_fields;
-		field = table->field[i];
-
-		if (UNIV_LIKELY(templ_type == ROW_MYSQL_REC_FIELDS)) {
-			/* Decide which columns we should fetch
-			and which we can skip. */
-			register const ibool	index_contains_field =
-				dict_index_contains_col_or_prefix(index, i);
-
-			if (!index_contains_field && prebuilt->read_just_key) {
-				/* If this is a 'key read', we do not need
-				columns that are not in the key */
-
-				goto skip_field;
-			}
-
-			if (index_contains_field && fetch_all_in_key) {
-				/* This field is needed in the query */
-
-				goto include_field;
-			}
-
-			if (bitmap_is_set(table->read_set, i) ||
-			    bitmap_is_set(table->write_set, i)) {
-				/* This field is needed in the query */
-
-				goto include_field;
-			}
+				templ = build_template_field(
+					prebuilt, clust_index, index,
+					table, field, i);
+				prebuilt->idx_cond_n_cols++;
+				ut_ad(prebuilt->idx_cond_n_cols
+				      == prebuilt->n_template);
+
+				if (index == prebuilt->index) {
+					templ->icp_rec_field_no
+						= templ->rec_field_no;
+				} else {
+					templ->icp_rec_field_no
+						= dict_index_get_nth_col_pos(
+							prebuilt->index, i);
+				}
 
-			if (fetch_primary_key_cols
-				&& dict_table_col_in_clustered_key(
-					index->table, i)) {
-				/* This field is needed in the query */
+				if (dict_index_is_clust(prebuilt->index)) {
+					ut_ad(templ->icp_rec_field_no
+					      != ULINT_UNDEFINED);
+					/* If the primary key includes
+					a column prefix, use it in
+					index condition pushdown,
+					because the condition is
+					evaluated before fetching any
+					off-page (externally stored)
+					columns. */
+					if (templ->icp_rec_field_no
+					    < prebuilt->index->n_uniq) {
+						/* This is a key column;
+						all set. */
+						continue;
+					}
+				} else if (templ->icp_rec_field_no
+					   != ULINT_UNDEFINED) {
+					continue;
+				}
 
-				goto include_field;
+				/* This is a column prefix index.
+				The column prefix can be used in
+				an end_range comparison. */
+
+				templ->icp_rec_field_no
+					= dict_index_get_nth_col_or_prefix_pos(
+						prebuilt->index, i, TRUE);
+				ut_ad(templ->icp_rec_field_no
+				      != ULINT_UNDEFINED);
+
+				/* Index condition pushdown can be used on
+				all columns of a secondary index, and on
+				the PRIMARY KEY columns. On the clustered
+				index, it must never be used on other than
+				PRIMARY KEY columns, because those columns
+				may be stored off-page, and we will not
+				fetch externally stored columns before
+				checking the index condition. */
+				/* TODO: test the above with an assertion
+				like this. Note that index conditions are
+				currently pushed down as part of the
+				"optimizer phase" while end_range is done
+				as part of the execution phase. Therefore,
+				we were unable to use an accurate condition
+				for end_range in the "if" condition above,
+				and the following assertion would fail.
+				ut_ad(!dict_index_is_clust(prebuilt->index)
+				      || templ->rec_field_no
+				      < prebuilt->index->n_uniq);
+				*/
 			}
-
-			/* This field is not needed in the query, skip it */
-
-			goto skip_field;
 		}
-include_field:
-		n_requested_fields++;
 
-		templ->col_no = i;
-		templ->clust_rec_field_no = dict_col_get_clust_pos(
-			col, clust_index);
-		ut_ad(templ->clust_rec_field_no != ULINT_UNDEFINED);
+		ut_ad(prebuilt->idx_cond_n_cols > 0);
+		ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template);
+
+		/* Include the fields that are not needed in index condition
+		pushdown. */
+		for (i = 0; i < n_fields; i++) {
+			const ibool		index_contains
+				= dict_index_contains_col_or_prefix(index, i);
+
+			if (!build_template_needs_field_in_icp(
+				    index, prebuilt, index_contains, i)) {
+				/* Not needed in ICP */
+				const Field*	field;
+
+				if (whole_row) {
+					field = table->field[i];
+				} else {
+					field = build_template_needs_field(
+						index_contains,
+						prebuilt->read_just_key,
+						fetch_all_in_key,
+						fetch_primary_key_cols,
+						index, table, i);
+					if (!field) {
+						continue;
+					}
+				}
 
-		if (index == clust_index) {
-			templ->rec_field_no = templ->clust_rec_field_no;
-		} else {
-			templ->rec_field_no = dict_index_get_nth_col_pos(
-								index, i);
-			if (templ->rec_field_no == ULINT_UNDEFINED) {
-				prebuilt->need_to_access_clustered = TRUE;
+				build_template_field(prebuilt,
+						     clust_index, index,
+						     table, field, i);
 			}
 		}
 
-		if (field->null_ptr) {
-			templ->mysql_null_byte_offset =
-				(ulint) ((char*) field->null_ptr
-					- (char*) table->record[0]);
-
-			templ->mysql_null_bit_mask = (ulint) field->null_bit;
-		} else {
-			templ->mysql_null_bit_mask = 0;
-		}
+		prebuilt->idx_cond = this;
+	} else {
+		/* No index condition pushdown */
+		prebuilt->idx_cond = NULL;
 
-		templ->mysql_col_offset = (ulint)
-					get_field_offset(table, field);
+		for (i = 0; i < n_fields; i++) {
+			const Field*	field;
 
-		templ->mysql_col_len = (ulint) field->pack_length();
-		if (mysql_prefix_len < templ->mysql_col_offset
-				+ templ->mysql_col_len) {
-			mysql_prefix_len = templ->mysql_col_offset
-				+ templ->mysql_col_len;
-		}
-		templ->type = col->mtype;
-		templ->mysql_type = (ulint)field->type();
-
-		if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
-			templ->mysql_length_bytes = (ulint)
-				(((Field_varstring*)field)->length_bytes);
-		}
+			if (whole_row) {
+				field = table->field[i];
+			} else {
+				field = build_template_needs_field(
+					dict_index_contains_col_or_prefix(
+						index, i),
+					prebuilt->read_just_key,
+					fetch_all_in_key,
+					fetch_primary_key_cols,
+					index, table, i);
+				if (!field) {
+					continue;
+				}
+			}
 
-		templ->charset = dtype_get_charset_coll(col->prtype);
-		templ->mbminlen = dict_col_get_mbminlen(col);
-		templ->mbmaxlen = dict_col_get_mbmaxlen(col);
-		templ->is_unsigned = col->prtype & DATA_UNSIGNED;
-		if (templ->type == DATA_BLOB) {
-			prebuilt->templ_contains_blob = TRUE;
+			build_template_field(prebuilt, clust_index, index,
+					     table, field, i);
 		}
-skip_field:
-		;
 	}
 
-	prebuilt->n_template = n_requested_fields;
-	prebuilt->mysql_prefix_len = mysql_prefix_len;
-
 	if (index != clust_index && prebuilt->need_to_access_clustered) {
 		/* Change rec_field_no's to correspond to the clustered index
 		record */
-		for (i = 0; i < n_requested_fields; i++) {
-			templ = prebuilt->mysql_template + i;
+		for (i = 0; i < prebuilt->n_template; i++) {
+
+			mysql_row_templ_t*	templ
+				= &prebuilt->mysql_template[i];
 
 			templ->rec_field_no = templ->clust_rec_field_no;
 		}
@@ -5023,16 +6026,16 @@ ha_innobase::innobase_lock_autoinc(void)
 		etc. type of statement. */
 		if (thd_sql_command(user_thd) == SQLCOM_INSERT
 		    || thd_sql_command(user_thd) == SQLCOM_REPLACE) {
-			dict_table_t*	table = prebuilt->table;
+			dict_table_t*	ib_table = prebuilt->table;
 
 			/* Acquire the AUTOINC mutex. */
-			dict_table_autoinc_lock(table);
+			dict_table_autoinc_lock(ib_table);
 
 			/* We need to check that another transaction isn't
 			already holding the AUTOINC lock on the table. */
-			if (table->n_waiting_or_granted_auto_inc_locks) {
+			if (ib_table->n_waiting_or_granted_auto_inc_locks) {
 				/* Release the mutex to avoid deadlocks. */
-				dict_table_autoinc_unlock(table);
+				dict_table_autoinc_unlock(ib_table);
 			} else {
 				break;
 			}
@@ -5114,7 +6117,7 @@ ha_innobase::write_row(
 	uchar*	record)	/*!< in: a row in MySQL format */
 {
 	ulint		error = 0;
-        int             error_result= 0;
+	int		error_result= 0;
 	ibool		auto_inc_used= FALSE;
 	ulint		sql_command;
 	trx_t*		trx = thd_to_trx(user_thd);
@@ -5122,25 +6125,25 @@ ha_innobase::write_row(
 	DBUG_ENTER("ha_innobase::write_row");
 
 	if (prebuilt->trx != trx) {
-	  sql_print_error("The transaction object for the table handle is at "
-			  "%p, but for the current thread it is at %p",
-			  (const void*) prebuilt->trx, (const void*) trx);
+		sql_print_error("The transaction object for the table handle "
+				"is at %p, but for the current thread it is at "
+				"%p",
+				(const void*) prebuilt->trx, (const void*) trx);
 
 		fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr);
-		ut_print_buf(stderr, ((const byte*)prebuilt) - 100, 200);
+		ut_print_buf(stderr, ((const byte*) prebuilt) - 100, 200);
 		fputs("\n"
 			"InnoDB: Dump of 200 bytes around ha_data: ",
 			stderr);
 		ut_print_buf(stderr, ((const byte*) trx) - 100, 200);
 		putc('\n', stderr);
 		ut_error;
+	} else if (!trx_is_started(trx)) {
+		++trx->will_lock;
 	}
 
 	ha_statistic_increment(&SSV::ha_write_count);
 
-	if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
-		table->timestamp_field->set_time();
-
 	sql_command = thd_sql_command(user_thd);
 
 	if ((sql_command == SQLCOM_ALTER_TABLE
@@ -5249,10 +6252,10 @@ no_commit:
 		/* Build the template used in converting quickly between
 		the two database formats */
 
-		build_template(prebuilt, NULL, table, ROW_MYSQL_WHOLE_ROW);
+		build_template(true);
 	}
 
-	innodb_srv_conc_enter_innodb(prebuilt->trx);
+	innobase_srv_conc_enter_innodb(prebuilt->trx);
 
 	error = row_insert_for_mysql((byte*) record, prebuilt);
 
@@ -5318,16 +6321,15 @@ set_max_autoinc:
 				if (auto_inc <= col_max_value) {
 					ut_a(prebuilt->autoinc_increment > 0);
 
+					ulonglong	need;
 					ulonglong	offset;
-					ulonglong	increment;
 
 					offset = prebuilt->autoinc_offset;
-					increment = prebuilt->autoinc_increment;
+					need = prebuilt->autoinc_increment;
 
 					auto_inc = innobase_next_autoinc(
 						auto_inc,
-						1, increment, offset,
-						col_max_value);
+						need, offset, col_max_value);
 
 					err = innobase_set_max_autoinc(
 						auto_inc);
@@ -5341,13 +6343,17 @@ set_max_autoinc:
 		}
 	}
 
-	innodb_srv_conc_exit_innodb(prebuilt->trx);
+	innobase_srv_conc_exit_innodb(prebuilt->trx);
 
 report_error:
 	error_result = convert_error_code_to_mysql((int) error,
 						   prebuilt->table->flags,
 						   user_thd);
 
+	if (error_result == HA_FTS_INVALID_DOCID) {
+		my_error(HA_FTS_INVALID_DOCID, MYF(0));
+	}
+
 func_exit:
 	innobase_active_small();
 
@@ -5389,6 +6395,11 @@ calc_row_difference(
 	dfield_t	dfield;
 	dict_index_t*	clust_index;
 	uint		i;
+	ulint		error = DB_SUCCESS;
+	ibool		changes_fts_column = FALSE;
+	ibool		changes_fts_doc_col = FALSE;
+	trx_t*          trx = thd_to_trx(thd);
+	doc_id_t	doc_id = FTS_NULL_DOC_ID;
 
 	n_fields = table->s->fields;
 	clust_index = dict_table_get_first_index(prebuilt->table);
@@ -5436,12 +6447,12 @@ calc_row_difference(
 				o_ptr = row_mysql_read_true_varchar(
 					&o_len, o_ptr,
 					(ulint)
-					(((Field_varstring*)field)->length_bytes));
+					(((Field_varstring*) field)->length_bytes));
 
 				n_ptr = row_mysql_read_true_varchar(
 					&n_len, n_ptr,
 					(ulint)
-					(((Field_varstring*)field)->length_bytes));
+					(((Field_varstring*) field)->length_bytes));
 			}
 
 			break;
@@ -5449,6 +6460,18 @@ calc_row_difference(
 			;
 		}
 
+		if (field_mysql_type == MYSQL_TYPE_LONGLONG
+		    && prebuilt->table->fts
+		    && innobase_strcasecmp(
+			field->field_name, FTS_DOC_ID_COL_NAME) == 0) {
+			doc_id = (doc_id_t) mach_read_from_n_little_endian(
+				n_ptr, 8);
+			if (doc_id == 0) {
+				return(DB_FTS_INVALID_DOCID);
+			}
+		}
+
+
 		if (field->null_ptr) {
 			if (field_in_record_is_null(table, field,
 							(char*) old_row)) {
@@ -5477,7 +6500,7 @@ calc_row_difference(
 
 				buf = row_mysql_store_col_in_innobase_format(
 					&dfield,
-					(byte*)buf,
+					(byte*) buf,
 					TRUE,
 					new_mysql_row_col,
 					col_pack_len,
@@ -5492,15 +6515,128 @@ calc_row_difference(
 			ufield->field_no = dict_col_get_clust_pos(
 				&prebuilt->table->cols[i], clust_index);
 			n_changed++;
+
+			/* If an FTS indexed column was changed by this
+			UPDATE then we need to inform the FTS sub-system.
+
+			NOTE: Currently we re-index all FTS indexed columns
+			even if only a subset of the FTS indexed columns
+			have been updated. That is the reason we are
+			checking only once here. Later we will need to
+			note which columns have been updated and do
+			selective processing. */
+			if (prebuilt->table->fts != NULL) {
+				ulint           offset;
+				dict_table_t*   innodb_table;
+
+				innodb_table = prebuilt->table;
+
+				if (!changes_fts_column) {
+					offset = row_upd_changes_fts_column(
+						innodb_table, ufield);
+
+					if (offset != ULINT_UNDEFINED) {
+						changes_fts_column = TRUE;
+					}
+				}
+
+				if (!changes_fts_doc_col) {
+					changes_fts_doc_col =
+					row_upd_changes_doc_id(
+						innodb_table, ufield);
+				}
+			}
+		}
+	}
+
+	/* If the update changes a column with an FTS index on it, we
+	then add an update column node with a new document id to the
+	other changes. We piggy back our changes on the normal UPDATE
+	to reduce processing and IO overhead. */
+	if (!prebuilt->table->fts) {
+			trx->fts_next_doc_id = 0;
+	} else if (changes_fts_column || changes_fts_doc_col) {
+		dict_table_t*   innodb_table = prebuilt->table;
+
+		ufield = uvect->fields + n_changed;
+
+		if (!DICT_TF2_FLAG_IS_SET(
+			innodb_table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+			/* If Doc ID is managed by user, and if any
+			FTS indexed column has been updated, its corresponding
+			Doc ID must also be updated. Otherwise, return
+			error */
+			if (changes_fts_column && !changes_fts_doc_col) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr, " InnoDB: A new Doc ID"
+					" must be supplied while updating"
+					" FTS indexed columns.\n");
+				return(DB_FTS_INVALID_DOCID);
+			}
+
+			/* Doc ID must monotonically increase */
+			ut_ad(innodb_table->fts->cache);
+			if (doc_id < prebuilt->table->fts->cache->next_doc_id) {
+				fprintf(stderr,
+					"InnoDB: FTS Doc ID must be larger than"
+					" "IB_ID_FMT" for table",
+					innodb_table->fts->cache->next_doc_id
+					- 1);
+				ut_print_name(stderr, trx,
+					      TRUE, innodb_table->name);
+				putc('\n', stderr);
+
+				return(DB_FTS_INVALID_DOCID);
+			} else if ((doc_id
+				    - prebuilt->table->fts->cache->next_doc_id)
+				   >= FTS_DOC_ID_MAX_STEP) {
+				fprintf(stderr,
+					"InnoDB: Doc ID "UINT64PF" is too"
+					" big. Its difference with largest"
+					" Doc ID used "UINT64PF" cannot"
+					" exceed or equal to %d\n",
+					doc_id,
+					prebuilt->table->fts->cache->next_doc_id - 1,
+					FTS_DOC_ID_MAX_STEP);
+			}
+
+
+			trx->fts_next_doc_id = doc_id;
+		} else {
+			/* If the Doc ID is a hidden column, it can't be
+			changed by user */
+			ut_ad(!changes_fts_doc_col);
+
+			/* Doc ID column is hidden, a new Doc ID will be
+			generated by following fts_update_doc_id() call */
+			trx->fts_next_doc_id = 0;
+		}
+
+		fts_update_doc_id(
+			innodb_table, ufield, &trx->fts_next_doc_id);
+
+		if (error == DB_SUCCESS) {
+			++n_changed;
+		} else {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: Error (%lu) while updating "
+				"doc id in calc_row_difference().\n", error);
 		}
+	} else {
+		/* We have a Doc ID column, but none of FTS indexed
+		columns are touched, nor the Doc ID column, so set
+		fts_next_doc_id to UINT64_UNDEFINED, which means do not
+		update the Doc ID column */
+		trx->fts_next_doc_id = UINT64_UNDEFINED;
 	}
 
 	uvect->n_fields = n_changed;
 	uvect->info_bits = 0;
 
-	ut_a(buf <= (byte*)original_upd_buff + buff_len);
+	ut_a(buf <= (byte*) original_upd_buff + buff_len);
 
-	return(0);
+	return(error);
 }
 
 /**********************************************************************//**
@@ -5526,6 +6662,10 @@ ha_innobase::update_row(
 
 	ut_a(prebuilt->trx == trx);
 
+	if (!trx_is_started(trx)) {
+		++trx->will_lock;
+	}
+
 	if (upd_buf == NULL) {
 		ut_ad(upd_buf_size == 0);
 
@@ -5545,9 +6685,6 @@ ha_innobase::update_row(
 
 	ha_statistic_increment(&SSV::ha_update_count);
 
-	if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
-		table->timestamp_field->set_time();
-
 	if (prebuilt->upd_node) {
 		uvect = prebuilt->upd_node->update;
 	} else {
@@ -5557,15 +6694,19 @@ ha_innobase::update_row(
 	/* Build an update vector from the modified fields in the rows
 	(uses upd_buf of the handle) */
 
-	calc_row_difference(uvect, (uchar*) old_row, new_row, table,
-			    upd_buf, upd_buf_size, prebuilt, user_thd);
+	error = calc_row_difference(uvect, (uchar*) old_row, new_row, table,
+				    upd_buf, upd_buf_size, prebuilt, user_thd);
+
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
 
 	/* This is not a delete */
 	prebuilt->upd_node->is_delete = FALSE;
 
 	ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
 
-	innodb_srv_conc_enter_innodb(trx);
+	innobase_srv_conc_enter_innodb(trx);
 
 	error = row_update_for_mysql((byte*) old_row, prebuilt);
 
@@ -5595,32 +6736,35 @@ ha_innobase::update_row(
 
 		if (auto_inc <= col_max_value && auto_inc != 0) {
 
+			ulonglong	need;
 			ulonglong	offset;
-			ulonglong	increment;
 
 			offset = prebuilt->autoinc_offset;
-			increment = prebuilt->autoinc_increment;
+			need = prebuilt->autoinc_increment;
 
 			auto_inc = innobase_next_autoinc(
-				auto_inc, 1, increment, offset, col_max_value);
+				auto_inc, need, offset, col_max_value);
 
 			error = innobase_set_max_autoinc(auto_inc);
 		}
 	}
 
-	innodb_srv_conc_exit_innodb(trx);
+	innobase_srv_conc_exit_innodb(trx);
 
+func_exit:
 	error = convert_error_code_to_mysql(error,
 					    prebuilt->table->flags, user_thd);
 
-	if (error == 0 /* success */
-	    && uvect->n_fields == 0 /* no columns were updated */) {
+	/* If success and no columns were updated. */
+	if (error == 0 && uvect->n_fields == 0) {
 
 		/* This is the same as success, but instructs
 		MySQL that the row is not really updated and it
 		should not increase the count of updated rows.
 		This is fix for http://bugs.mysql.com/29157 */
 		error = HA_ERR_RECORD_IS_THE_SAME;
+	} else if (error == HA_FTS_INVALID_DOCID) {
+		my_error(HA_FTS_INVALID_DOCID, MYF(0));
 	}
 
 	/* Tell InnoDB server that there might be work for
@@ -5647,6 +6791,10 @@ ha_innobase::delete_row(
 
 	ut_a(prebuilt->trx == trx);
 
+	if (!trx_is_started(trx)) {
+		++trx->will_lock;
+	}
+
 	ha_statistic_increment(&SSV::ha_delete_count);
 
 	if (!prebuilt->upd_node) {
@@ -5657,11 +6805,11 @@ ha_innobase::delete_row(
 
 	prebuilt->upd_node->is_delete = TRUE;
 
-	innodb_srv_conc_enter_innodb(trx);
+	innobase_srv_conc_enter_innodb(trx);
 
 	error = row_update_for_mysql((byte*) record, prebuilt);
 
-	innodb_srv_conc_exit_innodb(trx);
+	innobase_srv_conc_exit_innodb(trx);
 
 	error = convert_error_code_to_mysql(
 		error, prebuilt->table->flags, user_thd);
@@ -5767,7 +6915,9 @@ ha_innobase::index_end(void)
 {
 	int	error	= 0;
 	DBUG_ENTER("index_end");
-	active_index=MAX_KEY;
+	active_index = MAX_KEY;
+	in_range_check_pushed_down = FALSE;
+	ds_mrr.dsmrr_close();
 	DBUG_RETURN(error);
 }
 
@@ -5788,7 +6938,7 @@ convert_search_mode_to_innobase(
 		return(PAGE_CUR_GE);
 	case HA_READ_KEY_OR_PREV:
 		return(PAGE_CUR_LE);
-	case HA_READ_AFTER_KEY:	
+	case HA_READ_AFTER_KEY:
 		return(PAGE_CUR_G);
 	case HA_READ_BEFORE_KEY:
 		return(PAGE_CUR_L);
@@ -5907,7 +7057,6 @@ ha_innobase::index_read(
 	DBUG_ENTER("index_read");
 
 	ut_a(prebuilt->trx == thd_to_trx(user_thd));
-	ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT);
 
 	ha_statistic_increment(&SSV::ha_read_key_count);
 
@@ -5923,11 +7072,15 @@ ha_innobase::index_read(
 			    : HA_ERR_TABLE_DEF_CHANGED);
 	}
 
+	if (index->type & DICT_FTS) {
+		DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
+	}
+
 	/* Note that if the index for which the search template is built is not
 	necessarily prebuilt->index, but can also be the clustered index */
 
 	if (prebuilt->sql_stat_start) {
-		build_template(prebuilt, user_thd, table, ROW_MYSQL_REC_FIELDS);
+		build_template(false);
 	}
 
 	if (key_ptr) {
@@ -5967,12 +7120,12 @@ ha_innobase::index_read(
 
 	if (mode != PAGE_CUR_UNSUPP) {
 
-		innodb_srv_conc_enter_innodb(prebuilt->trx);
+		innobase_srv_conc_enter_innodb(prebuilt->trx);
 
 		ret = row_search_for_mysql((byte*) buf, mode, prebuilt,
 					   match_mode, 0);
 
-		innodb_srv_conc_exit_innodb(prebuilt->trx);
+		innobase_srv_conc_exit_innodb(prebuilt->trx);
 	} else {
 
 		ret = DB_UNSUPPORTED;
@@ -6106,8 +7259,8 @@ ha_innobase::change_active_index(
 
 	if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
 		if (dict_index_is_corrupted(prebuilt->index)) {
-			char	index_name[MAX_FULL_NAME_LEN + 1];
-			char	table_name[MAX_FULL_NAME_LEN + 1];
+			char index_name[MAX_FULL_NAME_LEN + 1];
+			char table_name[MAX_FULL_NAME_LEN + 1];
 
 			innobase_format_name(
 				index_name, sizeof index_name,
@@ -6118,7 +7271,7 @@ ha_innobase::change_active_index(
 				prebuilt->index->table->name, FALSE);
 
 			push_warning_printf(
-				user_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				user_thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_ERR_INDEX_CORRUPT,
 				"InnoDB: Index %s for table %s is"
 				" marked as corrupted",
@@ -6126,7 +7279,7 @@ ha_innobase::change_active_index(
 			DBUG_RETURN(HA_ERR_INDEX_CORRUPT);
 		} else {
 			push_warning_printf(
-				user_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				user_thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_ERR_TABLE_DEF_CHANGED,
 				"InnoDB: insufficient history for index %u",
 				keynr);
@@ -6151,7 +7304,7 @@ ha_innobase::change_active_index(
 	the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary
 	copying. Starting from MySQL-4.1 we use a more efficient flag here. */
 
-	build_template(prebuilt, user_thd, table, ROW_MYSQL_REC_FIELDS);
+	build_template(false);
 
 	DBUG_RETURN(0);
 }
@@ -6203,12 +7356,12 @@ ha_innobase::general_fetch(
 
 	ut_a(prebuilt->trx == thd_to_trx(user_thd));
 
-	innodb_srv_conc_enter_innodb(prebuilt->trx);
+	innobase_srv_conc_enter_innodb(prebuilt->trx);
 
 	ret = row_search_for_mysql(
-		(byte*)buf, 0, prebuilt, match_mode, direction);
+		(byte*) buf, 0, prebuilt, match_mode, direction);
 
-	innodb_srv_conc_exit_innodb(prebuilt->trx);
+	innobase_srv_conc_exit_innodb(prebuilt->trx);
 
 	switch (ret) {
 	case DB_SUCCESS:
@@ -6420,7 +7573,6 @@ ha_innobase::rnd_pos(
 			length of data in pos has to be ref_length */
 {
 	int		error;
-	uint		keynr	= active_index;
 	DBUG_ENTER("rnd_pos");
 	DBUG_DUMP("key", pos, ref_length);
 
@@ -6428,34 +7580,273 @@ ha_innobase::rnd_pos(
 
 	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
 
-	if (prebuilt->clust_index_was_generated) {
-		/* No primary key was defined for the table and we
-		generated the clustered index from the row id: the
-		row reference is the row id, not any key value
-		that MySQL knows of */
+	/* Note that we assume the length of the row reference is fixed
+	for the table, and it is == ref_length */
 
-		error = change_active_index(MAX_KEY);
-	} else {
-		error = change_active_index(primary_key);
-	}
+	error = index_read(buf, pos, ref_length, HA_READ_KEY_EXACT);
 
 	if (error) {
 		DBUG_PRINT("error", ("Got error: %d", error));
-		DBUG_RETURN(error);
 	}
 
-	/* Note that we assume the length of the row reference is fixed
-	for the table, and it is == ref_length */
+	DBUG_RETURN(error);
+}
 
-	error = index_read(buf, pos, ref_length, HA_READ_KEY_EXACT);
+/**********************************************************************//**
+Initialize FT index scan
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::ft_init()
+/*==================*/
+{
+	DBUG_ENTER("ft_init");
 
-	if (error) {
-		DBUG_PRINT("error", ("Got error: %d", error));
+	fprintf(stderr, "ft_init()\n");
+
+	trx_t*	trx = check_trx_exists(ha_thd());
+
+	/* FTS queries are not treated as autocommit non-locking selects.
+	This is because the FTS implementation can acquire locks behind
+	the scenes. This has not been verified but it is safer to treat
+	them as regular read only transactions for now. */
+
+	if (!trx_is_started(trx)) {
+		++trx->will_lock;
 	}
 
-	change_active_index(keynr);
+	DBUG_RETURN(rnd_init(false));
+}
 
-	DBUG_RETURN(error);
+/**********************************************************************//**
+Initialize FT index scan
+@return FT_INFO structure if successful or NULL */
+UNIV_INTERN
+FT_INFO*
+ha_innobase::ft_init_ext(
+/*=====================*/
+	uint			flags,	/* in: */
+	uint			keynr,	/* in: */
+	String*			key)	/* in: */
+{
+	trx_t*			trx;
+	dict_table_t*		table;
+	ulint			error;
+	byte*			query = (byte*) key->ptr();
+	ulint			query_len = key->length();
+	const CHARSET_INFO*	char_set = key->charset();
+	NEW_FT_INFO*		fts_hdl = NULL;
+	dict_index_t*		index;
+	fts_result_t*		result;
+	char			buf_tmp[8192];
+	ulint			buf_tmp_used;
+	uint			num_errors;
+
+	fprintf(stderr, "ft_init_ext()\n");
+
+	fprintf(stderr, "keynr=%u, '%.*s'\n",
+		keynr, (int) key->length(), (byte*) key->ptr());
+
+	if (flags & FT_BOOL) {
+		fprintf(stderr, "BOOL search\n");
+	} else {
+		fprintf(stderr, "NL search\n");
+	}
+
+	/* FIXME: utf32 and utf16 are not compatible with some
+	string function used. So to convert them to uft8 before
+	proceed. */
+	if (strcmp(char_set->csname, "utf32") == 0
+	    || strcmp(char_set->csname, "utf16") == 0) {
+		buf_tmp_used = innobase_convert_string(
+			buf_tmp, sizeof(buf_tmp) - 1,
+			&my_charset_utf8_general_ci,
+			query, query_len, (CHARSET_INFO*) char_set,
+			&num_errors);
+
+		query = (byte*) buf_tmp;
+		query_len = buf_tmp_used;
+		query[query_len] = 0;
+	}
+
+	trx = prebuilt->trx;
+
+	/* FTS queries are not treated as autocommit non-locking selects.
+	This is because the FTS implementation can acquire locks behind
+	the scenes. This has not been verified but it is safer to treat
+	them as regular read only transactions for now. */
+
+	if (!trx_is_started(trx)) {
+		++trx->will_lock;
+	}
+
+	table = prebuilt->table;
+
+	/* Table does not have an FTS index */
+	if (!table->fts || ib_vector_is_empty(table->fts->indexes)) {
+		my_error(ER_TABLE_HAS_NO_FT, MYF(0));
+		return(NULL);
+	}
+
+	if (keynr == NO_SUCH_KEY) {
+		/* FIXME: Investigate the NO_SUCH_KEY usage */
+		index = (dict_index_t*) ib_vector_getp(table->fts->indexes, 0);
+	} else {
+		index = innobase_get_index(keynr);
+	}
+
+	if (!index || index->type != DICT_FTS) {
+		my_error(ER_TABLE_HAS_NO_FT, MYF(0));
+		return NULL;
+	}
+
+	if (!(table->fts->fts_status & ADDED_TABLE_SYNCED)) {
+		fts_init_index(table, FALSE);
+
+		table->fts->fts_status |= ADDED_TABLE_SYNCED;
+	}
+
+	error = fts_query(trx, index, flags, query, query_len, &result);
+
+	prebuilt->result = result;
+
+	// FIXME: Proper error handling and diagnostic
+	if (error != DB_SUCCESS) {
+		fprintf(stderr, "Error processing query\n");
+	} else {
+		/* Must return an instance of a result even if it's empty */
+		ut_a(prebuilt->result);
+
+		/* Allocate FTS handler, and instantiate it before return */
+		fts_hdl = (NEW_FT_INFO*) my_malloc(sizeof(NEW_FT_INFO),
+						   MYF(0));
+
+		fts_hdl->please = (struct _ft_vft*)(&ft_vft_result);
+		fts_hdl->ft_prebuilt = prebuilt;
+		fts_hdl->ft_result = result;
+	}
+
+	return ((FT_INFO*) fts_hdl);
+}
+
+/**********************************************************************//**
+Fetch next result from the FT result set
+@return error code */
+UNIV_INTERN
+int
+ha_innobase::ft_read(
+/*=================*/
+	uchar*		buf)		/*!< in/out: buf contain result row */
+{
+	fts_result_t*	result;
+	int		error;
+	row_prebuilt_t*	ft_prebuilt;
+
+	ft_prebuilt = ((NEW_FT_INFO*) ft_handler)->ft_prebuilt;
+
+	ut_a(ft_prebuilt == prebuilt);
+
+	result = ((NEW_FT_INFO*) ft_handler)->ft_result;
+
+	if (result->current == NULL) {
+		/* This is the case where the FTS query did not
+		contain and matching documents. */
+		if (result->rankings_by_id != NULL) {
+			/* Now that we have the complete result, we
+			need to sort the document ids on their rank
+			calculation. */
+
+			fts_query_sort_result_on_rank(result);
+
+			result->current = const_cast<ib_rbt_node_t*>(
+				rbt_first(result->rankings_by_rank));
+		} else {
+			ut_a(result->current == NULL);
+		}
+	} else {
+		result->current = const_cast<ib_rbt_node_t*>(
+			rbt_next(result->rankings_by_rank, result->current));
+	}
+
+next_record:
+
+	if (result->current != NULL) {
+		dict_index_t*	index;
+		dtuple_t*	tuple = prebuilt->search_tuple;
+
+		index = dict_table_get_index_on_name(
+			prebuilt->table, FTS_DOC_ID_INDEX_NAME);
+
+		/* Must find the index */
+		ut_a(index);
+
+		/* Switch to the FTS doc id index */
+		prebuilt->index = index;
+
+		fts_ranking_t*	ranking = rbt_value(
+			fts_ranking_t, result->current);
+
+		/* We pass a pointer to the doc_id because we need to
+		convert it to storage byte order. */
+		row_create_key(tuple, index, &ranking->doc_id);
+
+		innobase_srv_conc_enter_innodb(prebuilt->trx);
+
+		ulint ret = row_search_for_mysql(
+			(byte*) buf, PAGE_CUR_GE, prebuilt, ROW_SEL_EXACT, 0);
+
+		innobase_srv_conc_exit_innodb(prebuilt->trx);
+
+
+		if (ret == DB_SUCCESS) {
+			error = 0;
+			table->status = 0;
+
+		} else if (ret == DB_RECORD_NOT_FOUND) {
+
+			result->current = const_cast<ib_rbt_node_t*>(
+				rbt_next(result->rankings_by_rank,
+					 result->current));
+
+			if (!result->current) {
+				error = HA_ERR_KEY_NOT_FOUND;
+				table->status = STATUS_NOT_FOUND;
+			} else {
+				goto next_record;
+			}
+
+		} else if (ret == DB_END_OF_INDEX) {
+
+			error = HA_ERR_KEY_NOT_FOUND;
+			table->status = STATUS_NOT_FOUND;
+		} else {
+
+			error = convert_error_code_to_mysql(
+				(int) ret, 0, user_thd);
+
+			table->status = STATUS_NOT_FOUND;
+		}
+
+		return (error);
+	}
+
+	return(HA_ERR_END_OF_FILE);
+}
+
+/*************************************************************************
+*/
+
+void
+ha_innobase::ft_end()
+{
+	fprintf(stderr, "ft_end()\n");
+
+	if (prebuilt->result != NULL) {
+		fts_query_free_result(prebuilt->result);
+		prebuilt->result = NULL;
+	}
+
+	rnd_end();
 }
 
 /*********************************************************************//**
@@ -6486,7 +7877,7 @@ ha_innobase::position(
 
 		memcpy(ref, prebuilt->row_id, len);
 	} else {
-		len = store_key_val_for_row(primary_key, (char*)ref,
+		len = store_key_val_for_row(primary_key, (char*) ref,
 							 ref_length, record);
 	}
 
@@ -6494,8 +7885,8 @@ ha_innobase::position(
 	table. */
 
 	if (len != ref_length) {
-	  sql_print_error("Stored ref len is %lu, but table ref len is %lu",
-			  (ulong) len, (ulong) ref_length);
+		sql_print_error("Stored ref len is %lu, but table ref len is "
+				"%lu", (ulong) len, (ulong) ref_length);
 	}
 }
 
@@ -6506,6 +7897,70 @@ See http://bugs.mysql.com/32710 for expl. why we choose PROCESS. */
 	 && check_global_access(thd, PROCESS_ACL))
 
 /*****************************************************************//**
+Check whether there exist a column named as "FTS_DOC_ID", which is
+reserved for InnoDB FTS Doc ID
+@return TRUE if there exist a "FTS_DOC_ID" column */
+static
+ibool
+create_table_check_doc_id_col(
+/*==========================*/
+	trx_t*		trx,		/*!< in: InnoDB transaction handle */
+	TABLE*		form,		/*!< in: information on table
+					columns and indexes */
+	ulint*		doc_id_col)	/*!< out: Doc ID column number if
+					there exist a FTS_DOC_ID column,						ULINT_UNDEFINED if column is of the
+					wrong type/name/size */
+{
+	ibool		find_doc_id = FALSE;
+	ulint		i;
+
+	for (i = 0; i < form->s->fields; i++) {
+		Field*		field;
+		ulint		col_type;
+		ulint		col_len;
+		ulint		unsigned_type;
+
+		field = form->field[i];
+
+		col_type = get_innobase_type_from_mysql_type(&unsigned_type,
+							     field);
+
+		col_len = field->pack_length();
+
+		if (innobase_strcasecmp(field->field_name,
+					FTS_DOC_ID_COL_NAME) == 0) {
+
+			find_doc_id = TRUE;
+
+			/* Note the name is case sensitive due to
+			our internal query parser */
+			if (col_type == DATA_INT
+			    && !field->null_ptr
+			    && col_len == sizeof(doc_id_t)
+			    && (strcmp(field->field_name,
+				      FTS_DOC_ID_COL_NAME) == 0)) {
+				*doc_id_col = i;
+			} else {
+				push_warning_printf(
+					(THD*) trx->mysql_thd,
+					Sql_condition::WARN_LEVEL_WARN,
+					HA_WRONG_CREATE_OPTION,
+					"InnoDB: FTS_DOC_ID column must be "
+					"of BIGINT NOT NULL type, and named "
+					"in all capitalized characters");
+				my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+					 field->field_name);
+				*doc_id_col = ULINT_UNDEFINED;
+			}
+
+			break;
+		}
+	}
+
+	return(find_doc_id);
+}
+
+/*****************************************************************//**
 Creates a table definition to an InnoDB database. */
 static
 int
@@ -6523,7 +7978,8 @@ create_table_def(
 					an .ibd file for it (no .ibd extension
 					in the path, though); otherwise this
 					is NULL */
-	ulint		flags)		/*!< in: table flags */
+	ulint		flags,		/*!< in: table flags */
+	ulint		flags2)		/*!< in: table flags2 */
 {
 	Field*		field;
 	dict_table_t*	table;
@@ -6537,6 +7993,8 @@ create_table_def(
 	ulint		long_true_varchar;
 	ulint		charset_no;
 	ulint		i;
+	ulint		doc_id_col = 0;
+	ibool		has_doc_id_col = FALSE;
 
 	DBUG_ENTER("create_table_def");
 	DBUG_PRINT("enter", ("table_name: %s", table_name));
@@ -6547,19 +8005,59 @@ create_table_def(
 	on the name length here */
 	if (strlen(table_name) > MAX_FULL_NAME_LEN) {
 		push_warning_printf(
-			(THD*) trx->mysql_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			(THD*) trx->mysql_thd, Sql_condition::WARN_LEVEL_WARN,
 			ER_TABLE_NAME,
 			"InnoDB: Table Name or Database Name is too long");
 
 		DBUG_RETURN(ER_TABLE_NAME);
 	}
 
+	/* table_name must contain '/'. Later in the code we assert if it
+	does not */
+	if (strcmp(strchr(table_name, '/') + 1,
+		   "innodb_table_monitor") == 0) {
+		push_warning(
+			(THD*) trx->mysql_thd, Sql_condition::WARN_LEVEL_WARN,
+			HA_ERR_WRONG_COMMAND,
+			DEPRECATED_MSG_INNODB_TABLE_MONITOR);
+	}
+
 	n_cols = form->s->fields;
 
+	/* Check whether there already exists a FTS_DOC_ID column */
+	if (create_table_check_doc_id_col(trx, form, &doc_id_col)){
+
+		/* Raise error if the Doc ID column is of wrong type or name */
+		if (doc_id_col == ULINT_UNDEFINED) {
+			trx_commit_for_mysql(trx);
+
+			error = DB_ERROR;
+			goto error_ret;
+		} else {
+			has_doc_id_col = TRUE;
+		}
+	}
+
 	/* We pass 0 as the space id, and determine at a lower level the space
 	id where to store the table */
 
-	table = dict_mem_table_create(table_name, 0, n_cols, flags);
+	if (flags2 & DICT_TF2_FTS) {
+		/* Adjust for the FTS hidden field */
+		if (!has_doc_id_col) {
+			table = dict_mem_table_create(table_name, 0, n_cols + 1,
+						      flags, flags2);
+
+			/* Set the hidden doc_id column. */
+			table->fts->doc_col = n_cols;
+		} else {
+			table = dict_mem_table_create(table_name, 0, n_cols,
+						      flags, flags2);
+			table->fts->doc_col = doc_id_col;
+		}
+	} else {
+		table = dict_mem_table_create(table_name, 0, n_cols,
+					      flags, flags2);
+	}
 
 	if (path_of_temp_table) {
 		table->dir_path_of_temp_table =
@@ -6575,7 +8073,7 @@ create_table_def(
 		if (!col_type) {
 			push_warning_printf(
 				(THD*) trx->mysql_thd,
-				MYSQL_ERROR::WARN_LEVEL_WARN,
+				Sql_condition::WARN_LEVEL_WARN,
 				ER_CANT_CREATE_TABLE,
 				"Error creating table '%s' with "
 				"column '%s'. Please check its "
@@ -6602,14 +8100,14 @@ create_table_def(
 
 		if (dtype_is_string_type(col_type)) {
 
-			charset_no = (ulint)field->charset()->number;
+			charset_no = (ulint) field->charset()->number;
 
-			if (UNIV_UNLIKELY(charset_no >= 256)) {
+			if (UNIV_UNLIKELY(charset_no > MAX_CHAR_COLL_NUM)) {
 				/* in data0type.h we assume that the
 				number fits in one byte in prtype */
 				push_warning_printf(
 					(THD*) trx->mysql_thd,
-					MYSQL_ERROR::WARN_LEVEL_WARN,
+					Sql_condition::WARN_LEVEL_WARN,
 					ER_CANT_CREATE_TABLE,
 					"In InnoDB, charset-collation codes"
 					" must be below 256."
@@ -6619,8 +8117,9 @@ create_table_def(
 			}
 		}
 
-		ut_a(field->type() < 256); /* we assume in dtype_form_prtype()
-					   that this fits in one byte */
+		/* we assume in dtype_form_prtype() that this fits in
+		two bytes */
+		ut_a(field->type() <= MAX_CHAR_COLL_NUM);
 		col_len = field->pack_length();
 
 		/* The MySQL pack length contains 1 or 2 bytes length field
@@ -6631,9 +8130,9 @@ create_table_def(
 		long_true_varchar = 0;
 
 		if (field->type() == MYSQL_TYPE_VARCHAR) {
-			col_len -= ((Field_varstring*)field)->length_bytes;
+			col_len -= ((Field_varstring*) field)->length_bytes;
 
-			if (((Field_varstring*)field)->length_bytes == 2) {
+			if (((Field_varstring*) field)->length_bytes == 2) {
 				long_true_varchar = DATA_LONG_TRUE_VARCHAR;
 			}
 		}
@@ -6655,13 +8154,18 @@ err_col:
 			(char*) field->field_name,
 			col_type,
 			dtype_form_prtype(
-				(ulint)field->type()
+				(ulint) field->type()
 				| nulls_allowed | unsigned_type
 				| binary_type | long_true_varchar,
 				charset_no),
 			col_len);
 	}
 
+	/* Add the FTS doc_id hidden column. */
+	if (flags2 & DICT_TF2_FTS && !has_doc_id_col) {
+		fts_add_doc_id_column(table);
+	}
+
 	error = row_create_table_for_mysql(table, trx);
 
 	if (error == DB_DUPLICATE_KEY) {
@@ -6701,11 +8205,11 @@ create_index(
 	KEY_PART_INFO*	key_part;
 	ulint		ind_type;
 	ulint		col_type;
-	ulint		prefix_len;
+	ulint		prefix_len = 0;
 	ulint		is_unsigned;
 	ulint		i;
 	ulint		j;
-	ulint*		field_lengths;
+	ulint*		field_lengths = NULL;
 
 	DBUG_ENTER("create_index");
 
@@ -6718,12 +8222,16 @@ create_index(
 
 	ind_type = 0;
 
-	if (key_num == form->s->primary_key) {
-		ind_type = ind_type | DICT_CLUSTERED;
-	}
+	if (key->flags & HA_FULLTEXT) {
+		ind_type = DICT_FTS;
+	} else {
+		if (key_num == form->s->primary_key) {
+			ind_type = ind_type | DICT_CLUSTERED;
+		}
 
-	if (key->flags & HA_NOSAME ) {
-		ind_type = ind_type | DICT_UNIQUE;
+		if (key->flags & HA_NOSAME ) {
+			ind_type = ind_type | DICT_UNIQUE;
+		}
 	}
 
 	/* We pass 0 as the space id, and determine at a lower level the space
@@ -6732,51 +8240,59 @@ create_index(
 	index = dict_mem_index_create(table_name, key->name, 0,
 				      ind_type, n_fields);
 
-	field_lengths = (ulint*) my_malloc(sizeof(ulint) * n_fields,
-		MYF(MY_FAE));
+	if (ind_type != DICT_FTS) {
+		field_lengths = (ulint*) my_malloc(
+			sizeof(ulint) * n_fields, MYF(MY_FAE));
+
+		ut_ad(!(index->type & DICT_FTS));
+	}
 
 	for (i = 0; i < n_fields; i++) {
 		key_part = key->key_part + i;
 
-		/* (The flag HA_PART_KEY_SEG denotes in MySQL a column prefix
-		field in an index: we only store a specified number of first
-		bytes of the column to the index field.) The flag does not
-		seem to be properly set by MySQL. Let us fall back on testing
-		the length of the key part versus the column. */
+		if (ind_type != DICT_FTS) {
 
-		field = NULL;
-		for (j = 0; j < form->s->fields; j++) {
+			/* (The flag HA_PART_KEY_SEG denotes in MySQL a
+			column prefix field in an index: we only store a
+			specified number of first bytes of the column to
+			the index field.) The flag does not seem to be
+			properly set by MySQL. Let us fall back on testing
+			the length of the key part versus the column. */
 
-			field = form->field[j];
+			field = NULL;
 
-			if (0 == innobase_strcasecmp(
-					field->field_name,
-					key_part->field->field_name)) {
-				/* Found the corresponding column */
+			for (j = 0; j < form->s->fields; j++) {
 
-				break;
+				field = form->field[j];
+
+				if (0 == innobase_strcasecmp(
+						field->field_name,
+						key_part->field->field_name)) {
+					/* Found the corresponding column */
+
+					break;
+				}
 			}
-		}
 
-		ut_a(j < form->s->fields);
+			ut_a(j < form->s->fields);
 
-		col_type = get_innobase_type_from_mysql_type(
-					&is_unsigned, key_part->field);
+			col_type = get_innobase_type_from_mysql_type(
+						&is_unsigned, key_part->field);
 
-		if (DATA_BLOB == col_type
-			|| (key_part->length < field->pack_length()
-				&& field->type() != MYSQL_TYPE_VARCHAR)
-			|| (field->type() == MYSQL_TYPE_VARCHAR
-				&& key_part->length < field->pack_length()
-				- ((Field_varstring*)field)->length_bytes)) {
+			if (DATA_BLOB == col_type
+				|| (key_part->length < field->pack_length()
+					&& field->type() != MYSQL_TYPE_VARCHAR)
+				|| (field->type() == MYSQL_TYPE_VARCHAR
+					&& key_part->length < field->pack_length()
+					- ((Field_varstring*) field)->length_bytes)) {
 
-			prefix_len = key_part->length;
+				prefix_len = key_part->length;
 
-			if (col_type == DATA_INT
-				|| col_type == DATA_FLOAT
-				|| col_type == DATA_DOUBLE
-				|| col_type == DATA_DECIMAL) {
-				sql_print_error(
+				if (col_type == DATA_INT
+					|| col_type == DATA_FLOAT
+					|| col_type == DATA_DOUBLE
+					|| col_type == DATA_DECIMAL) {
+					sql_print_error(
 					"MySQL is trying to create a column "
 					"prefix index field, on an "
 					"inappropriate data type. Table "
@@ -6784,18 +8300,21 @@ create_index(
 					table_name,
 					key_part->field->field_name);
 
+					prefix_len = 0;
+				}
+			} else {
 				prefix_len = 0;
 			}
-		} else {
-			prefix_len = 0;
-		}
 
-		field_lengths[i] = key_part->length;
+			field_lengths[i] = key_part->length;
+		}
 
 		dict_mem_index_add_field(index,
 			(char*) key_part->field->field_name, prefix_len);
 	}
 
+	ut_ad(key->flags & HA_FULLTEXT || !(index->type & DICT_FTS));
+
 	/* Even though we've defined max_supported_key_part_length, we
 	still do our own checking using field_lengths to be absolutely
 	sure we don't create too long indexes. */
@@ -6865,10 +8384,10 @@ get_row_format_name(
 }
 
 /** If file-per-table is missing, issue warning and set ret false */
-#define CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE		\
-	if (!srv_file_per_table) {				\
+#define CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace)\
+	if (!use_tablespace) {					\
 		push_warning_printf(				\
-			thd, MYSQL_ERROR::WARN_LEVEL_WARN,	\
+			thd, Sql_condition::WARN_LEVEL_WARN,	\
 			HA_WRONG_CREATE_OPTION,		\
 			"InnoDB: ROW_FORMAT=%s requires"	\
 			" innodb_file_per_table.",		\
@@ -6878,9 +8397,9 @@ get_row_format_name(
 
 /** If file-format is Antelope, issue warning and set ret false */
 #define CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE			\
-	if (srv_file_format < DICT_TF_FORMAT_ZIP) {		\
+	if (srv_file_format < UNIV_FORMAT_B) {		\
 		push_warning_printf(				\
-			thd, MYSQL_ERROR::WARN_LEVEL_WARN,	\
+			thd, Sql_condition::WARN_LEVEL_WARN,	\
 			HA_WRONG_CREATE_OPTION,		\
 			"InnoDB: ROW_FORMAT=%s requires"	\
 			" innodb_file_format > Antelope.",	\
@@ -6902,7 +8421,8 @@ create_options_are_valid(
 	THD*		thd,		/*!< in: connection thread. */
 	TABLE*		form,		/*!< in: information on table
 					columns and indexes */
-	HA_CREATE_INFO*	create_info)	/*!< in: create info. */
+	HA_CREATE_INFO*	create_info,	/*!< in: create info. */
+	bool		use_tablespace)	/*!< in: srv_file_per_table */
 {
 	ibool	kbs_specified	= FALSE;
 	ibool	ret		= TRUE;
@@ -6922,32 +8442,50 @@ create_options_are_valid(
 	if (create_info->key_block_size) {
 		kbs_specified = TRUE;
 		switch (create_info->key_block_size) {
+			ulint	kbs_max;
 		case 1:
 		case 2:
 		case 4:
 		case 8:
 		case 16:
 			/* Valid KEY_BLOCK_SIZE, check its dependencies. */
-			if (!srv_file_per_table) {
+			if (!use_tablespace) {
 				push_warning(
-					thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					thd, Sql_condition::WARN_LEVEL_WARN,
 					HA_WRONG_CREATE_OPTION,
 					"InnoDB: KEY_BLOCK_SIZE requires"
 					" innodb_file_per_table.");
 				ret = FALSE;
 			}
-			if (srv_file_format < DICT_TF_FORMAT_ZIP) {
+			if (srv_file_format < UNIV_FORMAT_B) {
 				push_warning(
-					thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					thd, Sql_condition::WARN_LEVEL_WARN,
 					HA_WRONG_CREATE_OPTION,
 					"InnoDB: KEY_BLOCK_SIZE requires"
 					" innodb_file_format > Antelope.");
-					ret = FALSE;
+				ret = FALSE;
+			}
+
+			/* The maximum KEY_BLOCK_SIZE (KBS) is 16. But if
+			UNIV_PAGE_SIZE is smaller than 16k, the maximum
+			KBS is also smaller. */
+			kbs_max = ut_min(
+				1 << (UNIV_PAGE_SSIZE_MAX - 1),
+				1 << (PAGE_ZIP_SSIZE_MAX - 1));
+			if (create_info->key_block_size > kbs_max) {
+				push_warning_printf(
+					thd, Sql_condition::WARN_LEVEL_WARN,
+					HA_WRONG_CREATE_OPTION,
+					"InnoDB: KEY_BLOCK_SIZE=%ld"
+					" cannot be larger than %ld.",
+					create_info->key_block_size,
+					kbs_max);
+				ret = FALSE;
 			}
 			break;
 		default:
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_WRONG_CREATE_OPTION,
 				"InnoDB: invalid KEY_BLOCK_SIZE = %lu."
 				" Valid values are [1, 2, 4, 8, 16]",
@@ -6956,23 +8494,23 @@ create_options_are_valid(
 			break;
 		}
 	}
-	
+
 	/* Check for a valid Innodb ROW_FORMAT specifier and
 	other incompatibilities. */
 	switch (row_format) {
 	case ROW_TYPE_COMPRESSED:
-		CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE;
+		CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace);
 		CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE;
 		break;
 	case ROW_TYPE_DYNAMIC:
-		CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE;
+		CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace);
 		CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE;
 		/* fall through since dynamic also shuns KBS */
 	case ROW_TYPE_COMPACT:
 	case ROW_TYPE_REDUNDANT:
 		if (kbs_specified) {
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_WRONG_CREATE_OPTION,
 				"InnoDB: cannot specify ROW_FORMAT = %s"
 				" with KEY_BLOCK_SIZE.",
@@ -6986,7 +8524,7 @@ create_options_are_valid(
 	case ROW_TYPE_PAGE:
 	case ROW_TYPE_NOT_USED:
 		push_warning(
-			thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			thd, Sql_condition::WARN_LEVEL_WARN,
 			HA_WRONG_CREATE_OPTION,		\
 			"InnoDB: invalid ROW_FORMAT specifier.");
 		ret = FALSE;
@@ -7002,15 +8540,31 @@ UNIV_INTERN
 void
 ha_innobase::update_create_info(
 /*============================*/
-	HA_CREATE_INFO* create_info)	/*!< in/out: create info */
+	HA_CREATE_INFO*	create_info)	/*!< in/out: create info */
 {
-  if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
-    ha_innobase::info(HA_STATUS_AUTO);
-    create_info->auto_increment_value = stats.auto_increment_value;
-  }
+	if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
+		ha_innobase::info(HA_STATUS_AUTO);
+		create_info->auto_increment_value = stats.auto_increment_value;
+	}
 }
 
 /*****************************************************************//**
+Initialize the table FTS stopword list
+@TRUE if succeed */
+UNIV_INTERN
+ibool
+innobase_fts_load_stopword(
+/*=======================*/
+	dict_table_t*	table,	/*!< in: Table has the FTS */
+	trx_t*		trx,	/*!< in: transaction */
+	THD*		thd)	/*!< in: current thread */
+{
+	return (fts_load_stopword(table, trx,
+				  fts_server_stopword_table,
+				  THDVAR(thd, ft_user_stopword_table),
+				  THDVAR(thd, ft_enable_stopword), FALSE));
+}
+/*****************************************************************//**
 Creates a new table to an InnoDB database.
 @return	error number */
 UNIV_INTERN
@@ -7025,7 +8579,6 @@ ha_innobase::create(
 					create statement string */
 {
 	int		error;
-	dict_table_t*	innobase_table;
 	trx_t*		parent_trx;
 	trx_t*		trx;
 	int		primary_key_no;
@@ -7034,13 +8587,30 @@ ha_innobase::create(
 	char		norm_name[FN_REFLEN];
 	THD*		thd = ha_thd();
 	ib_int64_t	auto_inc_value;
-	ulint		flags;
+	ulint		fts_indexes = 0;
+	ibool		zip_allowed = TRUE;
+	enum row_type	row_format;
+	rec_format_t	innodb_row_format = REC_FORMAT_COMPACT;
+
+	/* Cache the global variable "srv_file_per_table" to a local
+	variable before using it. Note that "srv_file_per_table"
+	is not under dict_sys mutex protection, and could be changed
+	while creating the table. So we read the current value here
+	and make all further decisions based on this. */
+	bool		use_tablespace = srv_file_per_table;
+
+	/* Zip Shift Size - log2 - 9 of compressed page size,
+	zero for uncompressed */
+	ulint		zip_ssize = 0;
+	ulint		flags = 0;
+	ulint		flags2 = 0;
+	dict_table_t*	innobase_table = NULL;
+
 	/* Cache the value of innodb_file_format, in case it is
 	modified by another thread while the table is being created. */
-	const ulint	file_format = srv_file_format;
+	const ulint	file_format_allowed = srv_file_format;
 	const char*	stmt;
 	size_t		stmt_len;
-	enum row_type	row_format;
 
 	DBUG_ENTER("ha_innobase::create");
 
@@ -7059,7 +8629,7 @@ ha_innobase::create(
 	returns error if it is in full path format, but not creating a temp.
 	table. Currently InnoDB does not support symbolic link on Windows. */
 
-	if (srv_file_per_table
+	if (use_tablespace
 	    && !mysqld_embedded
 	    && (!create_info->options & HA_LEX_CREATE_TMP_TABLE)) {
 
@@ -7078,7 +8648,49 @@ ha_innobase::create(
 		DBUG_RETURN(HA_ERR_TO_BIG_ROW);
 	}
 
-	ut_a(strlen(name) < sizeof(name2));
+	/* Check if there are any FTS indexes defined on this table. */
+	for (i = 0; i < form->s->keys; i++) {
+		KEY*    key = form->key_info + i;
+
+		if (key->flags & HA_FULLTEXT) {
+			++fts_indexes;
+
+			/* We don't support FTS indexes in temporary
+			tables. */
+			if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+
+				my_error(ER_INNODB_NO_FT_TEMP_TABLE, MYF(0));
+				DBUG_RETURN(-1);
+			}
+		}
+
+		if (innobase_strcasecmp(key->name, FTS_DOC_ID_INDEX_NAME)) {
+			continue;
+		}
+
+		/* Do a pre-check on FTS DOC ID index */
+		if (!(key->flags & HA_NOSAME)
+		    || strcmp(key->name, FTS_DOC_ID_INDEX_NAME)
+		    || strcmp(key->key_part[0].field->field_name,
+			      FTS_DOC_ID_COL_NAME)) {
+			push_warning_printf(thd,
+					    Sql_condition::WARN_LEVEL_WARN,
+					    ER_WRONG_NAME_FOR_INDEX,
+					    " InnoDB: Index name %s is reserved"
+					    " for the unique index on"
+					    " FTS_DOC_ID column for FTS"
+					    " document ID indexing"
+					    " on table %s. Please check"
+					    " the index definition to"
+					    " make sure it is of correct"
+					    " type\n",
+					    FTS_DOC_ID_INDEX_NAME,
+					    name);
+			my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+				 FTS_DOC_ID_INDEX_NAME);
+			DBUG_RETURN(-1);
+		}
+	}
 
 	strcpy(name2, name);
 
@@ -7088,52 +8700,57 @@ ha_innobase::create(
 
 	flags = 0;
 
+	if (fts_indexes > 0) {
+		flags2 = DICT_TF2_FTS;
+	}
+
 	/* Validate create options if innodb_strict_mode is set. */
-	if (!create_options_are_valid(thd, form, create_info)) {
+	if (!create_options_are_valid(
+			thd, form, create_info, use_tablespace)) {
 		DBUG_RETURN(HA_WRONG_CREATE_OPTION);
 	}
 
 	if (create_info->key_block_size) {
-		/* Determine the page_zip.ssize corresponding to the
-		requested page size (key_block_size) in kilobytes. */
-
-		ulint	ssize, ksize;
-		ulint	key_block_size = create_info->key_block_size;
-
-		/*  Set 'flags' to the correct key_block_size.
-		It will be zero if key_block_size is an invalid number.*/
-		for (ssize = ksize = 1; ssize <= DICT_TF_ZSSIZE_MAX;
-		     ssize++, ksize <<= 1) {
-			if (key_block_size == ksize) {
-				flags = ssize << DICT_TF_ZSSIZE_SHIFT
-					| DICT_TF_COMPACT
-					| DICT_TF_FORMAT_ZIP
-					  << DICT_TF_FORMAT_SHIFT;
+		/* The requested compressed page size (key_block_size)
+		is given in kilobytes. If it is a valid number, store
+		that value as the number of log2 shifts from 512 in
+		zip_ssize. Zero means it is not compressed. */
+		ulint zssize;		/* Zip Shift Size */
+		ulint kbsize;		/* Key Block Size */
+		for (zssize = kbsize = 1;
+		     zssize <= ut_min(UNIV_PAGE_SSIZE_MAX,
+				      PAGE_ZIP_SSIZE_MAX);
+		     zssize++, kbsize <<= 1) {
+			if (kbsize == create_info->key_block_size) {
+				zip_ssize = zssize;
 				break;
 			}
 		}
 
-		if (!srv_file_per_table) {
+		/* Make sure compressed row format is allowed. */
+		if (!use_tablespace) {
 			push_warning(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_WRONG_CREATE_OPTION,
 				"InnoDB: KEY_BLOCK_SIZE requires"
 				" innodb_file_per_table.");
-			flags = 0;
+			zip_allowed = FALSE;
 		}
 
-		if (file_format < DICT_TF_FORMAT_ZIP) {
+		if (file_format_allowed < UNIV_FORMAT_B) {
 			push_warning(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_WRONG_CREATE_OPTION,
 				"InnoDB: KEY_BLOCK_SIZE requires"
 				" innodb_file_format > Antelope.");
-			flags = 0;
+			zip_allowed = FALSE;
 		}
 
-		if (!flags) {
+		if (!zip_allowed
+		    || zssize > ut_min(UNIV_PAGE_SSIZE_MAX,
+				       PAGE_ZIP_SSIZE_MAX)) {
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_WRONG_CREATE_OPTION,
 				"InnoDB: ignoring KEY_BLOCK_SIZE=%lu.",
 				create_info->key_block_size);
@@ -7142,7 +8759,7 @@ ha_innobase::create(
 
 	row_format = form->s->row_type;
 
-	if (flags) {
+	if (zip_ssize && zip_allowed) {
 		/* if ROW_FORMAT is set to default,
 		automatically change it to COMPRESSED.*/
 		if (row_format == ROW_TYPE_DEFAULT) {
@@ -7155,79 +8772,81 @@ ha_innobase::create(
 			such combinations can be obtained
 			with ALTER TABLE anyway. */
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_WRONG_CREATE_OPTION,
 				"InnoDB: ignoring KEY_BLOCK_SIZE=%lu"
 				" unless ROW_FORMAT=COMPRESSED.",
 				create_info->key_block_size);
-			flags = 0;
+			zip_allowed = FALSE;
 		}
 	} else {
-		/* flags == 0 means no KEY_BLOCK_SIZE.*/
-		if (row_format == ROW_TYPE_COMPRESSED) {
-			/* ROW_FORMAT=COMPRESSED without
-			KEY_BLOCK_SIZE implies half the
-			maximum KEY_BLOCK_SIZE. */
-			flags = (DICT_TF_ZSSIZE_MAX - 1)
-				<< DICT_TF_ZSSIZE_SHIFT
-				| DICT_TF_COMPACT
-				| DICT_TF_FORMAT_ZIP
-				<< DICT_TF_FORMAT_SHIFT;
-#if DICT_TF_ZSSIZE_MAX < 1
-# error "DICT_TF_ZSSIZE_MAX < 1"
-#endif
+		/* zip_ssize == 0 means no KEY_BLOCK_SIZE.*/
+		if (row_format == ROW_TYPE_COMPRESSED && zip_allowed) {
+			/* ROW_FORMAT=COMPRESSED without KEY_BLOCK_SIZE
+			implies half the maximum KEY_BLOCK_SIZE(*1k) or
+			UNIV_PAGE_SIZE, whichever is less. */
+			zip_ssize = ut_min(UNIV_PAGE_SSIZE_MAX,
+					   PAGE_ZIP_SSIZE_MAX) - 1;
 		}
 	}
 
+	/* Validate the row format.  Correct it if necessary */
 	switch (row_format) {
 	case ROW_TYPE_REDUNDANT:
+		innodb_row_format = REC_FORMAT_REDUNDANT;
 		break;
+
 	case ROW_TYPE_COMPRESSED:
 	case ROW_TYPE_DYNAMIC:
-		if (!srv_file_per_table) {
+		if (!use_tablespace) {
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_WRONG_CREATE_OPTION,
 				"InnoDB: ROW_FORMAT=%s requires"
 				" innodb_file_per_table.",
 				get_row_format_name(row_format));
-		} else if (file_format < DICT_TF_FORMAT_ZIP) {
+		} else if (file_format_allowed == UNIV_FORMAT_A) {
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_WRONG_CREATE_OPTION,
 				"InnoDB: ROW_FORMAT=%s requires"
 				" innodb_file_format > Antelope.",
 				get_row_format_name(row_format));
 		} else {
-			flags |= DICT_TF_COMPACT
-			         | (DICT_TF_FORMAT_ZIP
-			            << DICT_TF_FORMAT_SHIFT);
+			innodb_row_format = (row_format == ROW_TYPE_DYNAMIC
+					     ? REC_FORMAT_DYNAMIC
+					     : REC_FORMAT_COMPRESSED);
 			break;
 		}
-
-		/* fall through */
+		zip_allowed = FALSE;
+		/* fall through to set row_format = COMPACT */
 	case ROW_TYPE_NOT_USED:
 	case ROW_TYPE_FIXED:
 	case ROW_TYPE_PAGE:
 		push_warning(
-			thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			thd, Sql_condition::WARN_LEVEL_WARN,
 			HA_WRONG_CREATE_OPTION,
 			"InnoDB: assuming ROW_FORMAT=COMPACT.");
 	case ROW_TYPE_DEFAULT:
+		/* If we fell through, set row format to Compact. */
+		row_format = ROW_TYPE_COMPACT;
 	case ROW_TYPE_COMPACT:
-		flags = DICT_TF_COMPACT;
 		break;
 	}
 
-	/* Look for a primary key */
+	/* Set the table flags */
+	if (!zip_allowed) {
+		zip_ssize = 0;
+	}
+	dict_tf_set(&flags, innodb_row_format, zip_ssize);
 
-	primary_key_no= (form->s->primary_key != MAX_KEY ?
+	/* Look for a primary key */
+	primary_key_no = (form->s->primary_key != MAX_KEY ?
 			 (int) form->s->primary_key :
 			 -1);
 
 	/* Our function innobase_get_mysql_key_number_for_index assumes
 	the primary key is always number 0, if it exists */
-
 	ut_a(primary_key_no == -1 || primary_key_no == 0);
 
 	/* Check for name conflicts (with reserved name) for
@@ -7242,7 +8861,11 @@ ha_innobase::create(
 	}
 
 	if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
-		flags |= DICT_TF2_TEMPORARY << DICT_TF2_SHIFT;
+		flags2 |= DICT_TF2_TEMPORARY;
+	}
+
+	if (use_tablespace) {
+		flags2 |= DICT_TF2_USE_TABLESPACE;
 	}
 
 	/* Get the transaction associated with the current thd, or create one
@@ -7259,19 +8882,18 @@ ha_innobase::create(
 
 	/* Latch the InnoDB data dictionary exclusively so that no deadlocks
 	or lock waits can happen in it during a table create operation.
-	Drop table etc. do this latching in row0mysql.c. */
+	Drop table etc. do this latching in row0mysql.cc. */
 
 	row_mysql_lock_data_dictionary(trx);
 
 	error = create_table_def(trx, form, norm_name,
 		create_info->options & HA_LEX_CREATE_TMP_TABLE ? name2 : NULL,
-		flags);
+		flags, flags2);
 
 	if (error) {
 		goto cleanup;
 	}
 
-
 	/* Create the keys */
 
 	if (form->s->keys == 0 || primary_key_no == -1) {
@@ -7295,9 +8917,63 @@ ha_innobase::create(
 		}
 	}
 
+	/* Create the ancillary tables that are common to all FTS indexes on
+	this table. */
+	if (fts_indexes > 0) {
+		ulint	ret = 0;
+
+		innobase_table = dict_table_open_on_name_no_stats(
+			norm_name, TRUE, DICT_ERR_IGNORE_NONE);
+
+		ut_a(innobase_table);
+
+		/* Check whether there alreadys exist FTS_DOC_ID_INDEX */
+		ret = innobase_fts_check_doc_id_index_in_def(
+			form->s->keys, form->s->key_info);
+
+		/* Raise error if FTS_DOC_ID_INDEX is of wrong format */
+		if (ret == FTS_INCORRECT_DOC_ID_INDEX) {
+			push_warning_printf(thd,
+					    Sql_condition::WARN_LEVEL_WARN,
+					    ER_WRONG_NAME_FOR_INDEX,
+					    " InnoDB: Index name %s is reserved"
+					    " for the unique index on"
+					    " FTS_DOC_ID column for FTS"
+					    " Document ID indexing"
+					    " on table %s. Please check"
+					    " the index definition to"
+					    " make sure it is of correct"
+					    " type\n",
+					    FTS_DOC_ID_INDEX_NAME,
+					    innobase_table->name);
+
+			if (innobase_table->fts) {
+				fts_free(innobase_table);
+			}
+
+			dict_table_close(innobase_table, TRUE);
+			my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+				 FTS_DOC_ID_INDEX_NAME);
+			error = -1;
+			goto cleanup;
+		}
+
+		error = fts_create_common_tables(
+			trx, innobase_table, norm_name,
+			(ret == FTS_EXIST_DOC_ID_INDEX));
+
+		error = convert_error_code_to_mysql(error, 0, NULL);
+
+		dict_table_close(innobase_table, TRUE);
+
+		if (error) {
+			goto cleanup;
+		}
+	}
+
 	for (i = 0; i < form->s->keys; i++) {
 
-		if (i != (uint) primary_key_no) {
+		if (i != static_cast<uint>(primary_key_no)) {
 
 			if ((error = create_index(trx, form, flags,
 						  norm_name, i))) {
@@ -7317,7 +8993,7 @@ ha_innobase::create(
 
 		case DB_PARENT_NO_INDEX:
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_ERR_CANNOT_ADD_FOREIGN,
 				"Create table '%s' with foreign key constraint"
 				" failed. There is no index in the referenced"
@@ -7327,14 +9003,14 @@ ha_innobase::create(
 
 		case DB_CHILD_NO_INDEX:
 			push_warning_printf(
-				thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				thd, Sql_condition::WARN_LEVEL_WARN,
 				HA_ERR_CANNOT_ADD_FOREIGN,
 				"Create table '%s' with foreign key constraint"
 				" failed. There is no index in the referencing"
 				" table where referencing columns appear"
 				" as the first columns.\n", norm_name);
 			break;
-                }
+		}
 
 		error = convert_error_code_to_mysql(error, flags, NULL);
 
@@ -7342,6 +9018,15 @@ ha_innobase::create(
 			goto cleanup;
 		}
 	}
+	/* Cache all the FTS indexes on this table in the FTS specific
+	structure. They are used for FTS indexed column update handling. */
+	if (fts_indexes > 0) {
+		fts_t*          fts = innobase_table->fts;
+
+		ut_a(fts != NULL);
+
+		dict_table_get_all_fts_indexes(innobase_table, fts->indexes);
+	}
 
 	innobase_commit_low(trx);
 
@@ -7353,7 +9038,7 @@ ha_innobase::create(
 
 	log_buffer_flush_to_disk();
 
-	innobase_table = dict_table_get(norm_name, FALSE);
+	innobase_table = dict_table_open_on_name(norm_name, FALSE);
 
 	DBUG_ASSERT(innobase_table != 0);
 
@@ -7366,6 +9051,16 @@ ha_innobase::create(
 			dict_table_get_format(innobase_table));
 	}
 
+	/* Load server stopword into FTS cache */
+	if (fts_indexes > 0) {
+		if (!innobase_fts_load_stopword(innobase_table, NULL, thd)) {
+			dict_table_close(innobase_table, FALSE);
+			srv_active_wake_master_thread();
+			trx_free_for_mysql(trx);
+			DBUG_RETURN(-1);
+		}
+	}
+
 	/* Note: We can't call update_thd() as prebuilt will not be
 	setup at this stage and so we use thd. */
 
@@ -7396,6 +9091,8 @@ ha_innobase::create(
 		dict_table_autoinc_unlock(innobase_table);
 	}
 
+	dict_table_close(innobase_table, FALSE);
+
 	/* Tell the InnoDB server that there might be work for
 	utility threads: */
 
@@ -7453,8 +9150,8 @@ Deletes all rows of an InnoDB table.
 @return	error number */
 UNIV_INTERN
 int
-ha_innobase::truncate(void)
-/*==============================*/
+ha_innobase::truncate()
+/*===================*/
 {
 	int		error;
 
@@ -7465,6 +9162,9 @@ ha_innobase::truncate(void)
 
 	update_thd(ha_thd());
 
+	if (!trx_is_started(prebuilt->trx)) {
+		++prebuilt->trx->will_lock;
+	}
 	/* Truncate the table in InnoDB */
 
 	error = row_truncate_table_for_mysql(prebuilt->table, prebuilt->trx);
@@ -7494,6 +9194,7 @@ ha_innobase::delete_table(
 	trx_t*	trx;
 	THD	*thd = ha_thd();
 	char	norm_name[1000];
+	char	errstr[1024];
 
 	DBUG_ENTER("ha_innobase::delete_table");
 
@@ -7510,6 +9211,17 @@ ha_innobase::delete_table(
 		DBUG_RETURN(HA_ERR_GENERIC);
 	}
 
+	/* Remove stats for this table and all of its indexes from the
+	persistent storage if it exists and if there are stats for this
+	table in there. This function creates its own trx and commits
+	it. */
+	error = dict_stats_delete_table_stats(norm_name,
+					      errstr, sizeof(errstr));
+	if (error != DB_SUCCESS) {
+		push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
+			     ER_LOCK_WAIT_TIMEOUT, errstr);
+	}
+
 	/* Get the transaction associated with the current thd, or create one
 	if not yet created */
 
@@ -7526,12 +9238,52 @@ ha_innobase::delete_table(
 
 	ut_a(name_len < 1000);
 
-	/* Drop the table in InnoDB */
+	/* Either the transaction is already flagged as a locking transaction
+	or it hasn't been started yet. */
+
+	ut_a(!trx_is_started(trx) || trx->will_lock > 0);
+
+	/* We are doing a DDL operation. */
+	++trx->will_lock;
 
+	/* Drop the table in InnoDB */
 	error = row_drop_table_for_mysql(norm_name, trx,
 					 thd_sql_command(thd)
 					 == SQLCOM_DROP_DB);
 
+
+	if (error == DB_TABLE_NOT_FOUND
+	    && innobase_get_lower_case_table_names() == 1) {
+		char*	is_part = NULL;
+#ifdef __WIN__
+		is_part = strstr(norm_name, "#p#");
+#else
+		is_part = strstr(norm_name, "#P#");
+#endif /* __WIN__ */
+
+		if (is_part) {
+			char	par_case_name[MAX_FULL_NAME_LEN + 1];
+
+#ifndef __WIN__
+			/* Check for the table using lower
+			case name, including the partition
+			separator "P" */
+			memcpy(par_case_name, norm_name, strlen(norm_name));
+			par_case_name[strlen(norm_name)] = 0;
+			innobase_casedn_str(par_case_name);
+#else
+			/* On Windows platfrom, check
+			whether there exists table name in
+			system table whose name is
+			not being normalized to lower case */
+			normalize_table_name_low(par_case_name, name, FALSE);
+#endif
+			error = row_drop_table_for_mysql(par_case_name, trx,
+							 thd_sql_command(thd)
+							 == SQLCOM_DROP_DB);
+		}
+	}
+
 	/* Flush the log to reduce probability that the .frm files and
 	the InnoDB data dictionary get out-of-sync if the user runs
 	with innodb_flush_log_at_trx_commit = 0 */
@@ -7558,11 +9310,11 @@ static
 void
 innobase_drop_database(
 /*===================*/
-	handlerton *hton, /*!< in: handlerton of Innodb */
-	char*	path)	/*!< in: database path; inside InnoDB the name
-			of the last directory in the path is used as
-			the database name: for example, in 'mysql/data/test'
-			the database name is 'test' */
+	handlerton*	hton,	/*!< in: handlerton of Innodb */
+	char*		path)	/*!< in: database path; inside InnoDB the name
+				of the last directory in the path is used as
+				the database name: for example, in
+				'mysql/data/test' the database name is 'test' */
 {
 	ulint	len		= 0;
 	trx_t*	trx;
@@ -7602,14 +9354,18 @@ innobase_drop_database(
 #ifdef	__WIN__
 	innobase_casedn_str(namebuf);
 #endif
-#if defined __WIN__ && !defined MYSQL_SERVER
-	/* In the Windows plugin, thd = current_thd is always NULL */
-	trx = trx_allocate_for_mysql();
-	trx->mysql_thd = NULL;
-#else
 	trx = innobase_trx_allocate(thd);
-#endif
+
+	/* Either the transaction is already flagged as a locking transaction
+	or it hasn't been started yet. */
+
+	ut_a(!trx_is_started(trx) || trx->will_lock > 0);
+
+	/* We are doing a DDL operation. */
+	++trx->will_lock;
+
 	row_drop_database_for_mysql(namebuf, trx);
+
 	my_free(namebuf);
 
 	/* Flush the log to reduce probability that the .frm files and
@@ -7650,8 +9406,6 @@ innobase_rename_table(
 	normalize_table_name(norm_to, to);
 	normalize_table_name(norm_from, from);
 
-	DEBUG_SYNC_C("innodb_rename_table_ready");
-
 	/* Serialize data dictionary operations with dictionary mutex:
 	no deadlocks can occur then in these operations */
 
@@ -7659,17 +9413,76 @@ innobase_rename_table(
 		row_mysql_lock_data_dictionary(trx);
 	}
 
+	/* Transaction must be flagged as a locking transaction or it hasn't
+	been started yet. */
+
+	ut_a(trx->will_lock > 0);
+
 	error = row_rename_table_for_mysql(
 		norm_from, norm_to, trx, lock_and_commit);
 
 	if (error != DB_SUCCESS) {
-		FILE* ef = dict_foreign_err_file;
+		if (error == DB_TABLE_NOT_FOUND
+		    && innobase_get_lower_case_table_names() == 1) {
+			char*	is_part = NULL;
+#ifdef __WIN__
+			is_part = strstr(norm_from, "#p#");
+#else
+			is_part = strstr(norm_from, "#P#");
+#endif /* __WIN__ */
 
-		fputs("InnoDB: Renaming table ", ef);
-		ut_print_name(ef, trx, TRUE, norm_from);
-		fputs(" to ", ef);
-		ut_print_name(ef, trx, TRUE, norm_to);
-		fputs(" failed!\n", ef);
+			if (is_part) {
+				char	par_case_name[MAX_FULL_NAME_LEN + 1];
+
+#ifndef __WIN__
+				/* Check for the table using lower
+				case name, including the partition
+				separator "P" */
+				memcpy(par_case_name, norm_from,
+				       strlen(norm_from));
+				par_case_name[strlen(norm_from)] = 0;
+				innobase_casedn_str(par_case_name);
+#else
+				/* On Windows platfrom, check
+				whether there exists table name in
+				system table whose name is
+				not being normalized to lower case */
+				normalize_table_name_low(par_case_name,
+							 from, FALSE);
+#endif
+				error = row_rename_table_for_mysql(
+					par_case_name, norm_to, trx,
+					lock_and_commit);
+
+			}
+		}
+
+		if (error != DB_SUCCESS) {
+			FILE* ef = dict_foreign_err_file;
+
+			fputs("InnoDB: Renaming table ", ef);
+			ut_print_name(ef, trx, TRUE, norm_from);
+			fputs(" to ", ef);
+			ut_print_name(ef, trx, TRUE, norm_to);
+			fputs(" failed!\n", ef);
+		} else {
+#ifndef __WIN__
+			sql_print_warning("Rename partition table %s "
+					  "succeeds after converting to lower "
+					  "case. The table may have "
+					  "been moved from a case "
+					  "in-sensitive file system.\n",
+					  norm_from);
+#else
+			sql_print_warning("Rename partition table %s "
+					  "succeeds after skipping the step to "
+					  "lower case the table name. "
+					  "The table may have been "
+					  "moved from a case sensitive "
+					  "file system.\n",
+					  norm_from);
+#endif /* __WIN__ */
+		}
 	}
 
 	if (lock_and_commit) {
@@ -7685,8 +9498,9 @@ innobase_rename_table(
 	my_free(norm_to);
 	my_free(norm_from);
 
-	return error;
+	return(error);
 }
+
 /*********************************************************************//**
 Renames an InnoDB table.
 @return	0 or error code */
@@ -7716,8 +9530,18 @@ ha_innobase::rename_table(
 
 	trx = innobase_trx_allocate(thd);
 
+	/* Either the transaction is already flagged as a locking transaction
+	or it hasn't been started yet. */
+
+	ut_a(!trx_is_started(trx) || trx->will_lock > 0);
+
+	/* We are doing a DDL operation. */
+	++trx->will_lock;
+
 	error = innobase_rename_table(trx, from, to, TRUE);
 
+	DEBUG_SYNC(thd, "after_innobase_rename_table");
+
 	/* Tell the InnoDB server that there might be work for
 	utility threads: */
 
@@ -7758,9 +9582,9 @@ ha_innobase::records_in_range(
 /*==========================*/
 	uint			keynr,		/*!< in: index number */
 	key_range		*min_key,	/*!< in: start key value of the
-						   range, may also be 0 */
+						range, may also be 0 */
 	key_range		*max_key)	/*!< in: range end key val, may
-						   also be 0 */
+						also be 0 */
 {
 	KEY*		key;
 	dict_index_t*	index;
@@ -7811,7 +9635,7 @@ ha_innobase::records_in_range(
           key_parts= key->ext_key_parts;
 
 	heap = mem_heap_create(2 * (key_parts * sizeof(dfield_t)
- 				    + sizeof(dtuple_t)));
+				    + sizeof(dtuple_t)));
 
         range_start = dtuple_create(heap, key_parts);
         dict_index_copy_types(range_start, index, key_parts);
@@ -7883,8 +9707,8 @@ filesort.cc.
 @return	upper bound of rows */
 UNIV_INTERN
 ha_rows
-ha_innobase::estimate_rows_upper_bound(void)
-/*======================================*/
+ha_innobase::estimate_rows_upper_bound()
+/*====================================*/
 {
 	dict_index_t*	index;
 	ulonglong	estimate;
@@ -7919,7 +9743,7 @@ ha_innobase::estimate_rows_upper_bound(void)
 
 	/* Calculate a minimum length for a clustered index record and from
 	that an upper bound for the number of rows. Since we only calculate
-	new statistics in row0mysql.c when a table has grown by a threshold
+	new statistics in row0mysql.cc when a table has grown by a threshold
 	factor, we must add a safety factor 2 in front of the formula below. */
 
 	estimate = 2 * local_data_file_length /
@@ -7991,7 +9815,7 @@ match. In this case, we have to take into account if we generated a
 default clustered index for the table
 @return the key number used inside MySQL */
 static
-unsigned int
+int
 innobase_get_mysql_key_number_for_index(
 /*====================================*/
 	INNOBASE_SHARE*		share,	/*!< in: share structure for index
@@ -8000,15 +9824,19 @@ innobase_get_mysql_key_number_for_index(
 					dictionary */
 	dict_table_t*		ib_table,/*!< in: table in Innodb data
 					dictionary */
-        const dict_index_t*     index)	/*!< in: index */
+	const dict_index_t*	index)	/*!< in: index */
 {
 	const dict_index_t*	ind;
 	unsigned int		i;
 
-        ut_a(index);
+ 	ut_a(index);
+	/*
+	ut_ad(strcmp(index->table->name, ib_table->name) == 0);
+	*/
 
-	/* If index does not belong to the table of share structure. Search
-	index->table instead */
+	/* If index does not belong to the table object of share structure
+	(ib_table comes from the share structure) search the index->table
+	object instead */
 	if (index->table != ib_table) {
 		i = 0;
 		ind = dict_table_get_first_index(index->table);
@@ -8028,7 +9856,7 @@ innobase_get_mysql_key_number_for_index(
 
 	/* If index translation table exists, we will first check
 	the index through index translation table for a match. */
-        if (share->idx_trans_tbl.index_mapping) {
+	if (share->idx_trans_tbl.index_mapping) {
 		for (i = 0; i < share->idx_trans_tbl.index_count; i++) {
 			if (share->idx_trans_tbl.index_mapping[i] == index) {
 				return(i);
@@ -8052,11 +9880,24 @@ innobase_get_mysql_key_number_for_index(
 		if (index == ind) {
 			return(i);
 		}
-        }
+	}
+
+	/* Loop through each index of the table and lock them */
+	for (ind = dict_table_get_first_index(ib_table);
+	     ind != NULL;
+	     ind = dict_table_get_next_index(ind)) {
+		if (index == ind) {
+			sql_print_error("Find index %s in InnoDB index list "
+					"but not its MySQL index number "
+					"It could be an InnoDB internal index.",
+					index->name);
+			return(-1);
+		}
+	}
 
 	ut_error;
 
-        return(0);
+	return(-1);
 }
 
 /*********************************************************************//**
@@ -8082,22 +9923,22 @@ innodb_rec_per_key(
 
 		rec_per_key = records;
 	} else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) {
-		ib_int64_t	num_null;
-
-		/* Number of rows with NULL value in this
-		field */
-		num_null = records - index->stat_n_non_null_key_vals[i];
+		ib_uint64_t	num_null;
 
 		/* In theory, index->stat_n_non_null_key_vals[i]
 		should always be less than the number of records.
 		Since this is statistics value, the value could
 		have slight discrepancy. But we will make sure
 		the number of null values is not a negative number. */
-		num_null = (num_null < 0) ? 0 : num_null;
+		if (records < index->stat_n_non_null_key_vals[i]) {
+			num_null = 0;
+		} else {
+			num_null = records - index->stat_n_non_null_key_vals[i];
+		}
 
 		/* If the number of NULL values is the same as or
 		large than that of the distinct values, we could
-		consider that the table consists mostly of NULL value. 
+		consider that the table consists mostly of NULL value.
 		Set rec_per_key to 1. */
 		if (index->stat_n_diff_key_vals[i + 1] <= num_null) {
 			rec_per_key = 1;
@@ -8119,15 +9960,18 @@ innodb_rec_per_key(
 
 /*********************************************************************//**
 Returns statistics information of the table to the MySQL interpreter,
-in various fields of the handle object. */
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
 UNIV_INTERN
 int
 ha_innobase::info_low(
 /*==================*/
-	uint	flag,			/*!< in: what information MySQL
+	uint			flag,	/*!< in: what information MySQL
 					requests */
-	bool	called_from_analyze)	/* in: TRUE if called from
-					::analyze() */
+	dict_stats_upd_option_t	stats_upd_option)
+					/*!< in: whether to (re) calc
+					the stats or to fetch them from
+					the persistent storage */
 {
 	dict_table_t*	ib_table;
 	dict_index_t*	index;
@@ -8158,15 +10002,22 @@ ha_innobase::info_low(
 	ib_table = prebuilt->table;
 
 	if (flag & HA_STATUS_TIME) {
-		if (called_from_analyze || innobase_stats_on_metadata) {
+		if (stats_upd_option != DICT_STATS_FETCH
+		    || innobase_stats_on_metadata) {
 			/* In sql_show we call with this flag: update
 			then statistics so that they are up-to-date */
+			enum db_err	ret;
 
 			prebuilt->trx->op_info = "updating table statistics";
 
-			dict_update_statistics(ib_table,
-					       FALSE /* update even if stats
-						     are initialized */);
+			ut_ad(!mutex_own(&dict_sys->mutex));
+			ret = dict_stats_update(ib_table, stats_upd_option,
+						FALSE);
+
+			if (ret != DB_SUCCESS) {
+				prebuilt->trx->op_info = "";
+				DBUG_RETURN(HA_ERR_GENERIC);
+			}
 
 			prebuilt->trx->op_info = "returning various info to MySQL";
 		}
@@ -8188,8 +10039,6 @@ ha_innobase::info_low(
 
 		ulint	page_size;
 
-		dict_table_stats_lock(ib_table, RW_S_LATCH);
-
 		n_rows = ib_table->stat_n_rows;
 
 		/* Because we do not protect stat_n_rows by any mutex in a
@@ -8235,7 +10084,7 @@ ha_innobase::info_low(
 			page_size = UNIV_PAGE_SIZE;
 		}
 
-		stats.records = (ha_rows)n_rows;
+		stats.records = (ha_rows) n_rows;
 		stats.deleted = 0;
 		stats.data_file_length
 			= ((ulonglong) ib_table->stat_clustered_index_size)
@@ -8244,8 +10093,6 @@ ha_innobase::info_low(
 			((ulonglong) ib_table->stat_sum_of_other_index_sizes)
 			* page_size;
 
-		dict_table_stats_unlock(ib_table, RW_S_LATCH);
-
 		/* Since fsp_get_available_space_in_free_extents() is
 		acquiring latches inside InnoDB, we do not call it if we
 		are asked by MySQL to avoid locking. Another reason to
@@ -8277,14 +10124,15 @@ ha_innobase::info_low(
 
 				push_warning_printf(
 					thd,
-					MYSQL_ERROR::WARN_LEVEL_WARN,
+					Sql_condition::WARN_LEVEL_WARN,
 					ER_CANT_GET_STAT,
 					"InnoDB: Trying to get the free "
 					"space for table %s but its "
 					"tablespace has been discarded or "
 					"the .ibd file is missing. Setting "
-					"the free space to zero.",
-					ib_table->name);
+                                        "the free space to zero. "
+                                        "(Errcode: %M)",
+					ib_table->name, errno);
 
 				stats.delete_length = 0;
 			} else {
@@ -8293,13 +10141,13 @@ ha_innobase::info_low(
 		}
 
 		stats.check_time = 0;
-                stats.mrr_length_per_rec= ref_length +  8; // 8 = max(sizeof(void *));
-
+		stats.mrr_length_per_rec = ref_length + sizeof(void*);
 
 		if (stats.records == 0) {
 			stats.mean_rec_length = 0;
 		} else {
-			stats.mean_rec_length = (ulong) (stats.data_file_length / stats.records);
+			stats.mean_rec_length = (ulong)
+				(stats.data_file_length / stats.records);
 		}
 	}
 
@@ -8311,8 +10159,11 @@ ha_innobase::info_low(
 		ulint	num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
 					- prebuilt->clust_index_was_generated;
 
-		if (table->s->keys != num_innodb_index) {
-			sql_print_error("Table %s contains %lu "
+		if (table->s->keys != num_innodb_index
+		    && (innobase_fts_check_doc_id_index(ib_table, NULL)
+			== FTS_EXIST_DOC_ID_INDEX
+			&& table->s->keys != (num_innodb_index - 1))) {
+			sql_print_error("InnoDB: Table %s contains %lu "
 					"indexes inside InnoDB, which "
 					"is different from the number of "
 					"indexes %u defined in the MySQL ",
@@ -8320,8 +10171,6 @@ ha_innobase::info_low(
 					table->s->keys);
 		}
 
-		dict_table_stats_lock(ib_table, RW_S_LATCH);
-
 		for (i = 0; i < table->s->keys; i++) {
 			ulong	j;
                         rec_per_key = 1;
@@ -8347,16 +10196,27 @@ ha_innobase::info_low(
 
 			for (j = 0; j < table->key_info[i].key_parts; j++) {
 
+				if (table->key_info[i].flags & HA_FULLTEXT) {
+					/* The whole concept has no validity
+					for FTS indexes. */
+					table->key_info[i].rec_per_key[j] = 1;
+					continue;
+				}
+
 				if (j + 1 > index->n_uniq) {
 					sql_print_error(
-"Index %s of %s has %lu columns unique inside InnoDB, but MySQL is asking "
-"statistics for %lu columns. Have you mixed up .frm files from different "
-"installations? "
-"See " REFMAN "innodb-troubleshooting.html\n",
-							index->name,
-							ib_table->name,
-							(unsigned long)
-							index->n_uniq, j + 1);
+						"Index %s of %s has %lu columns"
+					        " unique inside InnoDB, but "
+						"MySQL is asking statistics for"
+					        " %lu columns. Have you mixed "
+						"up .frm files from different "
+					       	"installations? "
+						"See " REFMAN
+						"innodb-troubleshooting.html\n",
+						index->name,
+						ib_table->name,
+						(unsigned long)
+						index->n_uniq, j + 1);
 					break;
 				}
 
@@ -8374,7 +10234,7 @@ ha_innobase::info_low(
 					rec_per_key = 1;
 				}
 
-				table->key_info[i].rec_per_key[j]=
+				table->key_info[i].rec_per_key[j] =
 				  rec_per_key >= ~(ulong) 0 ? ~(ulong) 0 :
 				  (ulong) rec_per_key;
 			}
@@ -8409,9 +10269,9 @@ ha_innobase::info_low(
 				                }
                                                 else if (rec_per_key > 1) {
                                                         rec_per_key =
-                                                          (ha_rows) (k_rec_per_key *
-                                                                     (double)rec_per_key /
-                                                                     n_rows);
+                                                        k_rec_per_key *
+						        (double)rec_per_key /
+							n_rows;
 						}
                                                 
 				                key_info->rec_per_key[k++]=
@@ -8423,8 +10283,6 @@ ha_innobase::info_low(
 				}
 			}                                         
 		}
-
-		dict_table_stats_unlock(ib_table, RW_S_LATCH);
 	}
 
 	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
@@ -8444,7 +10302,11 @@ ha_innobase::info_low(
 			errkey = innobase_get_mysql_key_number_for_index(
 					share, table, ib_table, err_index);
 		} else {
-			errkey = (unsigned int) prebuilt->trx->error_key_num;
+			errkey = (unsigned int) (
+				(prebuilt->trx->error_key_num
+				 == ULINT_UNDEFINED)
+					? -1
+					: prebuilt->trx->error_key_num);
 		}
 	}
 
@@ -8460,20 +10322,21 @@ func_exit:
 
 /*********************************************************************//**
 Returns statistics information of the table to the MySQL interpreter,
-in various fields of the handle object. */
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
 UNIV_INTERN
 int
 ha_innobase::info(
 /*==============*/
 	uint	flag)	/*!< in: what information MySQL requests */
 {
-	return(info_low(flag, false /* not called from analyze */));
+	return(info_low(flag, DICT_STATS_FETCH));
 }
 
 /**********************************************************************//**
-Updates index cardinalities of the table, based on 8 random dives into
+Updates index cardinalities of the table, based on random dives into
 each index tree. This does NOT calculate exact statistics on the table.
-@return	returns always 0 (success) */
+@return	HA_ADMIN_* error code or HA_ADMIN_OK */
 UNIV_INTERN
 int
 ha_innobase::analyze(
@@ -8481,11 +10344,25 @@ ha_innobase::analyze(
 	THD*		thd,		/*!< in: connection thread handle */
 	HA_CHECK_OPT*	check_opt)	/*!< in: currently ignored */
 {
-	/* Simply call ::info() with all the flags */
-	info_low(HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE,
-		 true /* called from analyze */);
+	dict_stats_upd_option_t	upd_option;
+	int			ret;
 
-	return(0);
+	if (THDVAR(thd, analyze_is_persistent)) {
+		upd_option = DICT_STATS_RECALC_PERSISTENT;
+	} else {
+		upd_option = DICT_STATS_RECALC_TRANSIENT;
+	}
+
+	/* Simply call ::info_low() with all the flags
+	and request recalculation of the statistics */
+	ret = info_low(HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE,
+		       upd_option);
+
+	if (ret != 0) {
+		return(HA_ADMIN_FAILED);
+	}
+
+	return(HA_ADMIN_OK);
 }
 
 /**********************************************************************//**
@@ -8498,7 +10375,25 @@ ha_innobase::optimize(
 	THD*		thd,		/*!< in: connection thread handle */
 	HA_CHECK_OPT*	check_opt)	/*!< in: currently ignored */
 {
-	return(HA_ADMIN_TRY_ALTER);
+	/*FTS-FIXME: Since MySQL doesn't support engine-specific commands,
+	we have to hijack some existing command in order to be able to test
+	the new admin commands added in InnoDB's FTS support. For now, we
+	use MySQL's OPTIMIZE command, normally mapped to ALTER TABLE in
+	InnoDB (so it recreates the table anew), and map it to OPTIMIZE.
+
+	This works OK otherwise, but MySQL locks the entire table during
+	calls to OPTIMIZE, which is undesirable. */
+
+	if (innodb_optimize_fulltext_only) {
+		if (prebuilt->table->fts && prebuilt->table->fts->cache) {
+			fts_sync_table(prebuilt->table);
+			fts_optimize_table(prebuilt->table);
+		}
+		return(HA_ADMIN_OK);
+	} else {
+
+		return(HA_ADMIN_TRY_ALTER);
+	}
 }
 
 /*******************************************************************//**
@@ -8531,7 +10426,7 @@ ha_innobase::check(
 		/* Build the template; we will use a dummy template
 		in index scans done in checking */
 
-		build_template(prebuilt, NULL, table, ROW_MYSQL_WHOLE_ROW);
+		build_template(true);
 	}
 
 	if (prebuilt->table->ibd_file_missing) {
@@ -8570,9 +10465,9 @@ ha_innobase::check(
 	prebuilt->table->corrupted = FALSE;
 
 	/* Enlarge the fatal lock wait timeout during CHECK TABLE. */
-	mutex_enter(&kernel_mutex);
-	srv_fatal_semaphore_wait_threshold += SRV_SEMAPHORE_WAIT_EXTENSION;
-	mutex_exit(&kernel_mutex);
+	os_increment_counter_by_amount(
+		server_mutex,
+		srv_fatal_semaphore_wait_threshold, 7200/*2 hours*/);
 
 	for (index = dict_table_get_first_index(prebuilt->table);
 	     index != NULL;
@@ -8594,7 +10489,7 @@ ha_innobase::check(
 				index_name, sizeof index_name,
 				prebuilt->index->name, TRUE);
 
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 					    ER_NOT_KEYFILE,
 					    "InnoDB: The B-tree of"
 					    " index %s is corrupted.",
@@ -8617,7 +10512,8 @@ ha_innobase::check(
 
 			if (dict_index_is_corrupted(prebuilt->index)) {
 				push_warning_printf(
-					user_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					user_thd,
+					Sql_condition::WARN_LEVEL_WARN,
 					HA_ERR_INDEX_CORRUPT,
 					"InnoDB: Index %s is marked as"
 					" corrupted",
@@ -8625,7 +10521,8 @@ ha_innobase::check(
 				is_ok = FALSE;
 			} else {
 				push_warning_printf(
-					thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					thd,
+					Sql_condition::WARN_LEVEL_WARN,
 					HA_ERR_TABLE_DEF_CHANGED,
 					"InnoDB: Insufficient history for"
 					" index %s",
@@ -8648,7 +10545,7 @@ ha_innobase::check(
 				index_name, sizeof index_name,
 				index->name, TRUE);
 
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 					    ER_NOT_KEYFILE,
 					    "InnoDB: The B-tree of"
 					    " index %s is corrupted.",
@@ -8670,8 +10567,9 @@ ha_innobase::check(
 
 		if (index == dict_table_get_first_index(prebuilt->table)) {
 			n_rows_in_table = n_rows;
-		} else if (n_rows != n_rows_in_table) {
-			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+		} else if (!(index->type & DICT_FTS)
+			   && (n_rows != n_rows_in_table)) {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 					    ER_NOT_KEYFILE,
 					    "InnoDB: Index '%-.200s'"
 					    " contains %lu entries,"
@@ -8704,16 +10602,16 @@ ha_innobase::check(
 	at every CHECK TABLE */
 
 	if (!btr_search_validate()) {
-		push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+		push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
 			     ER_NOT_KEYFILE,
 			     "InnoDB: The adaptive hash index is corrupted.");
 		is_ok = FALSE;
 	}
 
 	/* Restore the fatal lock wait timeout after CHECK TABLE. */
-	mutex_enter(&kernel_mutex);
-	srv_fatal_semaphore_wait_threshold -= SRV_SEMAPHORE_WAIT_EXTENSION;
-	mutex_exit(&kernel_mutex);
+	os_decrement_counter_by_amount(
+		server_mutex,
+		srv_fatal_semaphore_wait_threshold, 7200/*2 hours*/);
 
 	prebuilt->trx->op_info = "";
 	if (thd_killed(user_thd)) {
@@ -8743,7 +10641,7 @@ ha_innobase::update_table_comment(
 	handle. */
 
 	if (length > 64000 - 3) {
-		return((char*)comment); /* string too long */
+		return((char*) comment); /* string too long */
 	}
 
 	update_thd(ha_thd());
@@ -8970,7 +10868,7 @@ get_foreign_key_info(
 
 	f_key_info.referenced_key_name = referenced_key_name;
 
-	pf_key_info = (FOREIGN_KEY_INFO *) thd_memdup(thd, &f_key_info,
+	pf_key_info = (FOREIGN_KEY_INFO*) thd_memdup(thd, &f_key_info,
 						      sizeof(FOREIGN_KEY_INFO));
 
 	return(pf_key_info);
@@ -9135,7 +11033,7 @@ ha_innobase::extra(
 			}
 			break;
 		case HA_EXTRA_RESET_STATE:
-			reset_template(prebuilt);
+			reset_template();
 			thd_to_trx(ha_thd())->duplicates = 0;
 			break;
 		case HA_EXTRA_NO_KEYREAD:
@@ -9173,15 +11071,19 @@ ha_innobase::extra(
 	return(0);
 }
 
+/******************************************************************//**
+*/
 UNIV_INTERN
 int
 ha_innobase::reset()
+/*================*/
 {
 	if (prebuilt->blob_heap) {
 		row_mysql_prebuilt_free_blob_heap(prebuilt);
 	}
 
-	reset_template(prebuilt);
+	reset_template();
+	ds_mrr.dsmrr_close();
 
 	/* TODO: This should really be reset in reset_template() but for now
 	it's safer to do it explicitly here. */
@@ -9224,14 +11126,16 @@ ha_innobase::start_stmt(
 	that may not be the case. We MUST release the search latch before an
 	INSERT, for example. */
 
-	innobase_release_stat_resources(trx);
+	trx_search_latch_release_if_reserved(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* Reset the AUTOINC statement level counter for multi-row INSERTs. */
 	trx->n_autoinc_rows = 0;
 
 	prebuilt->sql_stat_start = TRUE;
 	prebuilt->hint_need_to_fetch_extra_cols = 0;
-	reset_template(prebuilt);
+	reset_template();
 
 	if (!prebuilt->mysql_has_locked) {
 		/* This handle is for a temporary table created inside
@@ -9258,6 +11162,7 @@ ha_innobase::start_stmt(
 		3) ::init_table_handle_for_HANDLER(), and
 		4) ::transactional_table_lock(). */
 
+		ut_a(prebuilt->stored_select_lock_type != LOCK_NONE_UNSET);
 		prebuilt->select_lock_type = prebuilt->stored_select_lock_type;
 	}
 
@@ -9265,6 +11170,18 @@ ha_innobase::start_stmt(
 
 	innobase_register_trx(ht, thd, trx);
 
+	if (!trx_is_started(trx)) {
+		++trx->will_lock;
+	}
+
+	if (prebuilt->result) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Warning: FTS result set not NULL\n");
+
+		fts_query_free_result(prebuilt->result);
+		prebuilt->result = NULL;
+	}
+
 	return(0);
 }
 
@@ -9277,13 +11194,16 @@ innobase_map_isolation_level(
 /*=========================*/
 	enum_tx_isolation	iso)	/*!< in: MySQL isolation level code */
 {
-	switch(iso) {
-		case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ);
-		case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED);
-		case ISO_SERIALIZABLE: return(TRX_ISO_SERIALIZABLE);
-		case ISO_READ_UNCOMMITTED: return(TRX_ISO_READ_UNCOMMITTED);
-		default: ut_a(0); return(0);
+	switch (iso) {
+	case ISO_REPEATABLE_READ:	return(TRX_ISO_REPEATABLE_READ);
+	case ISO_READ_COMMITTED:	return(TRX_ISO_READ_COMMITTED);
+	case ISO_SERIALIZABLE:		return(TRX_ISO_SERIALIZABLE);
+	case ISO_READ_UNCOMMITTED:	return(TRX_ISO_READ_UNCOMMITTED);
 	}
+
+	ut_error;
+
+	return(0);
 }
 
 /******************************************************************//**
@@ -9315,12 +11235,13 @@ ha_innobase::external_lock(
 	informative error message and return with an error.
 	Note: decide_logging_format would give the same error message,
 	except it cannot give the extra details. */
+
 	if (lock_type == F_WRLCK
 	    && !(table_flags() & HA_BINLOG_STMT_CAPABLE)
 	    && thd_binlog_format(thd) == BINLOG_FORMAT_STMT
 	    && thd_binlog_filter_ok(thd)
-            && thd_sqlcom_can_generate_row_events(thd))
-        {
+	    && thd_sqlcom_can_generate_row_events(thd))
+	{
 		int skip = 0;
 		/* used by test case */
 		DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = 1;);
@@ -9333,13 +11254,12 @@ ha_innobase::external_lock(
 		}
 	}
 
-
 	trx = prebuilt->trx;
 
 	prebuilt->sql_stat_start = TRUE;
 	prebuilt->hint_need_to_fetch_extra_cols = 0;
 
-	reset_template(prebuilt);
+	reset_template();
 
 	if (lock_type == F_WRLCK) {
 
@@ -9407,6 +11327,13 @@ ha_innobase::external_lock(
 		trx->n_mysql_tables_in_use++;
 		prebuilt->mysql_has_locked = TRUE;
 
+		if (!trx_is_started(trx)
+		    && (prebuilt->select_lock_type != LOCK_NONE
+			|| prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+			++trx->will_lock;
+		}
+
 		DBUG_RETURN(0);
 	}
 
@@ -9416,10 +11343,12 @@ ha_innobase::external_lock(
 	prebuilt->mysql_has_locked = FALSE;
 
 	/* Release a possible FIFO ticket and search latch. Since we
-	may reserve the kernel mutex, we have to release the search
+	may reserve the trx_sys->mutex, we have to release the search
 	system latch first to obey the latching order. */
 
-	innobase_release_stat_resources(trx);
+	trx_search_latch_release_if_reserved(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* If the MySQL lock count drops to zero we know that the current SQL
 	statement has ended */
@@ -9446,6 +11375,13 @@ ha_innobase::external_lock(
 		}
 	}
 
+	if (!trx_is_started(trx)
+	    && (prebuilt->select_lock_type != LOCK_NONE
+		|| prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+		++trx->will_lock;
+	}
+
 	DBUG_RETURN(0);
 }
 
@@ -9492,7 +11428,7 @@ ha_innobase::transactional_table_lock(
 	prebuilt->sql_stat_start = TRUE;
 	prebuilt->hint_need_to_fetch_extra_cols = 0;
 
-	reset_template(prebuilt);
+	reset_template();
 
 	if (lock_type == F_WRLCK) {
 		prebuilt->select_lock_type = LOCK_X;
@@ -9542,8 +11478,8 @@ ha_innobase::transactional_table_lock(
 Here we export InnoDB status variables to MySQL. */
 static
 void
-innodb_export_status(void)
-/*======================*/
+innodb_export_status()
+/*==================*/
 {
 	if (innodb_inited) {
 		srv_export_innodb_status();
@@ -9551,15 +11487,16 @@ innodb_export_status(void)
 }
 
 /************************************************************************//**
-Implements the SHOW INNODB STATUS command. Sends the output of the InnoDB
-Monitor to the client. */
+Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the
+InnoDB Monitor to the client.
+@return 0 on success */
 static
-bool
+int
 innodb_show_status(
 /*===============*/
 	handlerton*	hton,	/*!< in: the innodb handlerton */
-	THD*	thd,	/*!< in: the MySQL query thread of the caller */
-	stat_print_fn *stat_print)
+	THD*		thd,	/*!< in: the MySQL query thread of the caller */
+	stat_print_fn*	stat_print)
 {
 	trx_t*			trx;
 	static const char	truncated_msg[] = "... truncated...\n";
@@ -9572,22 +11509,25 @@ innodb_show_status(
 
 	trx = check_trx_exists(thd);
 
-	innobase_release_stat_resources(trx);
+	trx_search_latch_release_if_reserved(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	/* We let the InnoDB Monitor to output at most MAX_STATUS_SIZE
 	bytes of text. */
 
-	long	flen, usable_len;
 	char*	str;
+	ssize_t	flen, usable_len;
 
 	mutex_enter(&srv_monitor_file_mutex);
 	rewind(srv_monitor_file);
+
 	srv_printf_innodb_monitor(srv_monitor_file, FALSE,
 				  &trx_list_start, &trx_list_end);
-	flen = ftell(srv_monitor_file);
+
 	os_file_set_eof(srv_monitor_file);
 
-	if (flen < 0) {
+	if ((flen = ftell(srv_monitor_file)) < 0) {
 		flen = 0;
 	}
 
@@ -9603,28 +11543,31 @@ innodb_show_status(
 
 	if (!(str = (char*) my_malloc(usable_len + 1, MYF(0)))) {
 		mutex_exit(&srv_monitor_file_mutex);
-		DBUG_RETURN(TRUE);
+		DBUG_RETURN(1);
 	}
 
 	rewind(srv_monitor_file);
+
 	if (flen < MAX_STATUS_SIZE) {
 		/* Display the entire output. */
-		flen = (long) fread(str, 1, flen, srv_monitor_file);
+		flen = fread(str, 1, flen, srv_monitor_file);
 	} else if (trx_list_end < (ulint) flen
-			&& trx_list_start < trx_list_end
-			&& trx_list_start + (flen - trx_list_end)
-			< MAX_STATUS_SIZE - sizeof truncated_msg - 1) {
+		   && trx_list_start < trx_list_end
+		   && trx_list_start + (flen - trx_list_end)
+		   < MAX_STATUS_SIZE - sizeof truncated_msg - 1) {
+
 		/* Omit the beginning of the list of active transactions. */
-		long len = (long) fread(str, 1, trx_list_start, srv_monitor_file);
+		ssize_t	len = fread(str, 1, trx_list_start, srv_monitor_file);
+
 		memcpy(str + len, truncated_msg, sizeof truncated_msg - 1);
 		len += sizeof truncated_msg - 1;
 		usable_len = (MAX_STATUS_SIZE - 1) - len;
 		fseek(srv_monitor_file, flen - usable_len, SEEK_SET);
-		len += (long) fread(str + len, 1, usable_len, srv_monitor_file);
+		len += fread(str + len, 1, usable_len, srv_monitor_file);
 		flen = len;
 	} else {
 		/* Omit the end of the output. */
-		flen = (long) fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file);
+		flen = fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file);
 	}
 
 	mutex_exit(&srv_monitor_file_mutex);
@@ -9634,14 +11577,14 @@ innodb_show_status(
 
 	my_free(str);
 
-	DBUG_RETURN(FALSE);
+	DBUG_RETURN(0);
 }
 
 /************************************************************************//**
 Implements the SHOW MUTEX STATUS command.
-@return TRUE on failure, FALSE on success. */
+@return 0 on success. */
 static
-bool
+int
 innodb_mutex_show_status(
 /*=====================*/
 	handlerton*	hton,		/*!< in: the innodb handlerton */
@@ -9650,7 +11593,8 @@ innodb_mutex_show_status(
 	stat_print_fn*	stat_print)	/*!< in: function for printing
 					statistics */
 {
-	char buf1[IO_SIZE], buf2[IO_SIZE];
+	char		buf1[IO_SIZE];
+	char		buf2[IO_SIZE];
 	mutex_t*	mutex;
 	rw_lock_t*	lock;
 	ulint		block_mutex_oswait_count = 0;
@@ -9658,14 +11602,19 @@ innodb_mutex_show_status(
 	mutex_t*	block_mutex = NULL;
 	rw_lock_t*	block_lock = NULL;
 #ifdef UNIV_DEBUG
-	ulint	  rw_lock_count= 0;
-	ulint	  rw_lock_count_spin_loop= 0;
-	ulint	  rw_lock_count_spin_rounds= 0;
-	ulint	  rw_lock_count_os_wait= 0;
-	ulint	  rw_lock_count_os_yield= 0;
-	ulonglong rw_lock_wait_time= 0;
+	ulint		rw_lock_count= 0;
+	ulint		rw_lock_count_spin_loop= 0;
+	ulint		rw_lock_count_spin_rounds= 0;
+	ulint		rw_lock_count_os_wait= 0;
+	ulint		rw_lock_count_os_yield= 0;
+	ulonglong	rw_lock_wait_time= 0;
 #endif /* UNIV_DEBUG */
-	uint	  hton_name_len= (uint) strlen(innobase_hton_name), buf1len, buf2len;
+	uint		buf1len;
+	uint		buf2len;
+	uint		hton_name_len;
+
+	hton_name_len = (uint) strlen(innobase_hton_name);
+
 	DBUG_ENTER("innodb_mutex_show_status");
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
@@ -9817,34 +11766,54 @@ innodb_mutex_show_status(
 	}
 #endif /* UNIV_DEBUG */
 
-	DBUG_RETURN(FALSE);
+	/* Success */
+	DBUG_RETURN(0);
 }
 
+/************************************************************************//**
+Return 0 on success and non-zero on failure. Note: the bool return type
+seems to be abused here, should be an int. */
 static
-bool innobase_show_status(handlerton *hton, THD* thd, 
-                          stat_print_fn* stat_print,
-                          enum ha_stat_type stat_type)
+bool
+innobase_show_status(
+/*=================*/
+	handlerton*		hton,	/*!< in: the innodb handlerton */
+	THD*			thd,	/*!< in: the MySQL query thread
+					of the caller */
+	stat_print_fn*		stat_print,
+	enum ha_stat_type	stat_type)
 {
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
 	switch (stat_type) {
 	case HA_ENGINE_STATUS:
-		return innodb_show_status(hton, thd, stat_print);
+		/* Non-zero return value means there was an error. */
+		return(innodb_show_status(hton, thd, stat_print) != 0);
+
 	case HA_ENGINE_MUTEX:
-		return innodb_mutex_show_status(hton, thd, stat_print);
-	default:
-		return(FALSE);
+		/* Non-zero return value means there was an error. */
+		return(innodb_mutex_show_status(hton, thd, stat_print) != 0);
+
+	case HA_ENGINE_LOGS:
+		/* Not handled */
+		break;
 	}
+
+	/* Success */
+	return(false);
 }
 
 /************************************************************************//**
- Handling the shared INNOBASE_SHARE structure that is needed to provide table
- locking.
-****************************************************************************/
-
-static INNOBASE_SHARE* get_share(const char* table_name)
+Handling the shared INNOBASE_SHARE structure that is needed to provide table
+locking. Register the table name if it doesn't exist in the hash table. */
+static
+INNOBASE_SHARE*
+get_share(
+/*======*/
+	const char*	table_name)
 {
-	INNOBASE_SHARE *share;
+	INNOBASE_SHARE*	share;
+
 	mysql_mutex_lock(&innobase_share_mutex);
 
 	ulint	fold = ut_fold_string(table_name);
@@ -9861,7 +11830,7 @@ static INNOBASE_SHARE* get_share(const char* table_name)
 		/* TODO: invoke HASH_MIGRATE if innobase_open_tables
 		grows too big */
 
-		share = (INNOBASE_SHARE *) my_malloc(sizeof(*share)+length+1,
+		share = (INNOBASE_SHARE*) my_malloc(sizeof(*share)+length+1,
 			MYF(MY_FAE | MY_ZEROFILL));
 
 		share->table_name = (char*) memcpy(share + 1,
@@ -9884,7 +11853,13 @@ static INNOBASE_SHARE* get_share(const char* table_name)
 	return(share);
 }
 
-static void free_share(INNOBASE_SHARE* share)
+/************************************************************************//**
+Free the shared object that was registered with get_share(). */
+static
+void
+free_share(
+/*=======*/
+	INNOBASE_SHARE*	share)	/*!< in/own: table share to free */
 {
 	mysql_mutex_lock(&innobase_share_mutex);
 
@@ -9980,7 +11955,7 @@ ha_innobase::store_lock(
 
 		/* MySQL calls this function in DROP TABLE though this table
 		handle may belong to another thd that is running a query. Let
-		us in that case skip any changes to the prebuilt struct. */ 
+		us in that case skip any changes to the prebuilt struct. */
 
 	} else if ((lock_type == TL_READ && in_lock_tables)
 		   || (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables)
@@ -10120,6 +12095,13 @@ ha_innobase::store_lock(
 
 	*to++= &lock;
 
+	if (!trx_is_started(trx)
+	    && (prebuilt->select_lock_type != LOCK_NONE
+	        || prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+		++trx->will_lock;
+	}
+
 	return(to);
 }
 
@@ -10134,8 +12116,8 @@ ha_innobase::innobase_get_autoinc(
 /*==============================*/
 	ulonglong*	value)		/*!< out: autoinc value */
 {
- 	*value = 0;
- 
+	*value = 0;
+
 	prebuilt->autoinc_error = innobase_lock_autoinc();
 
 	if (prebuilt->autoinc_error == DB_SUCCESS) {
@@ -10196,11 +12178,14 @@ UNIV_INTERN
 void
 ha_innobase::get_auto_increment(
 /*============================*/
-        ulonglong	offset,              /*!< in: table autoinc offset */
-        ulonglong	increment,           /*!< in: table autoinc increment */
-        ulonglong	nb_desired_values,   /*!< in: number of values reqd */
-        ulonglong	*first_value,        /*!< out: the autoinc value */
-        ulonglong	*nb_reserved_values) /*!< out: count of reserved values */
+	ulonglong	offset,			/*!< in: table autoinc offset */
+	ulonglong	increment,		/*!< in: table autoinc
+						increment */
+	ulonglong	nb_desired_values,	/*!< in: number of values
+						reqd */
+	ulonglong*	first_value,		/*!< out: the autoinc value */
+	ulonglong*	nb_reserved_values)	/*!< out: count of reserved
+						values */
 {
 	trx_t*		trx;
 	ulint		error;
@@ -10264,15 +12249,16 @@ ha_innobase::get_auto_increment(
 	/* With old style AUTOINC locking we only update the table's
 	AUTOINC counter after attempting to insert the row. */
 	if (innobase_autoinc_lock_mode != AUTOINC_OLD_STYLE_LOCKING) {
+		ulonglong	need;
 		ulonglong	current;
 		ulonglong	next_value;
 
 		current = *first_value > col_max_value ? autoinc : *first_value;
+		need = *nb_reserved_values * increment;
 
 		/* Compute the last value in the interval */
 		next_value = innobase_next_autoinc(
-			current, *nb_reserved_values, increment, offset,
-			col_max_value);
+			current, need, offset, col_max_value);
 
 		prebuilt->autoinc_last_value = next_value;
 
@@ -10337,10 +12323,14 @@ ha_innobase::reset_auto_increment(
 	DBUG_RETURN(0);
 }
 
-/* See comment in handler.cc */
+/*******************************************************************//**
+See comment in handler.cc */
 UNIV_INTERN
 bool
-ha_innobase::get_error_message(int error, String *buf)
+ha_innobase::get_error_message(
+/*===========================*/
+	int	error,
+	String*	buf)
 {
 	trx_t*	trx = check_trx_exists(ha_thd());
 
@@ -10351,6 +12341,64 @@ ha_innobase::get_error_message(int error, String *buf)
 }
 
 /*******************************************************************//**
+  Retrieves the names of the table and the key for which there was a
+  duplicate entry in the case of HA_ERR_FOREIGN_DUPLICATE_KEY.
+
+  If any of the names is not available, then this method will return
+  false and will not change any of child_table_name or child_key_name.
+
+  @param child_table_name[out]    Table name
+  @param child_table_name_len[in] Table name buffer size
+  @param child_key_name[out]      Key name
+  @param child_key_name_len[in]   Key name buffer size
+
+  @retval  true                  table and key names were available
+                                 and were written into the corresponding
+                                 out parameters.
+  @retval  false                 table and key names were not available,
+                                 the out parameters were not touched.
+*/
+bool
+ha_innobase::get_foreign_dup_key(
+/*=============================*/
+	char*	child_table_name,
+	uint	child_table_name_len,
+	char*	child_key_name,
+	uint	child_key_name_len)
+{
+	const dict_index_t*	err_index;
+
+	ut_a(prebuilt->trx != NULL);
+	ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+
+	err_index = trx_get_error_info(prebuilt->trx);
+
+	if (err_index == NULL) {
+		return(false);
+	}
+	/* else */
+
+	/* copy table name (and convert from filename-safe encoding to
+	system_charset_info, e.g. "foo_@0J@00b6" -> "foo_ö") */
+	char*	p;
+	p = strchr(err_index->table->name, '/');
+	/* strip ".../" prefix if any */
+	if (p != NULL) {
+		p++;
+	} else {
+		p = err_index->table->name;
+	}
+	uint	len;
+	len = filename_to_tablename(p, child_table_name, child_table_name_len);
+	child_table_name[len] = '\0';
+
+	/* copy index name */
+	ut_snprintf(child_key_name, child_key_name_len, "%s", err_index->name);
+
+	return(true);
+}
+
+/*******************************************************************//**
 Compares two 'refs'. A 'ref' is the (internal) primary key value of the row.
 If there is no explicitly declared non-null unique key or a primary key, then
 InnoDB internally uses the row id as the primary key.
@@ -10403,8 +12451,8 @@ ha_innobase::cmp_ref(
 
 			ref1 += 2;
 			ref2 += 2;
-			result = ((Field_blob*)field)->cmp( ref1, len1,
-                                                            ref2, len2);
+			result = ((Field_blob*) field)->cmp(
+				ref1, len1, ref2, len2);
 		} else {
 			result = field->key_cmp(ref1, ref2);
 		}
@@ -10447,16 +12495,22 @@ ha_innobase::register_query_cache_table(
 							 engine_data));
 }
 
+/*******************************************************************//**
+Get the bin log name. */
 UNIV_INTERN
-char*
+const char*
 ha_innobase::get_mysql_bin_log_name()
+/*=================================*/
 {
 	return(trx_sys_mysql_bin_log_name);
 }
 
+/*******************************************************************//**
+Get the bin log offset (or file position). */
 UNIV_INTERN
 ulonglong
 ha_innobase::get_mysql_bin_log_pos()
+/*================================*/
 {
 	/* trx... is ib_int64_t, which is a typedef for a 64-bit integer
 	(__int64 or longlong) so it's ok to cast it to ulonglong. */
@@ -10470,7 +12524,7 @@ characters for prefix indexes using a multibyte character set. The function
 finds charset information and returns length of prefix_len characters in the
 index field in bytes.
 @return	number of bytes occupied by the first n characters */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 ulint
 innobase_get_at_most_n_mbchars(
 /*===========================*/
@@ -10541,13 +12595,13 @@ static
 int
 innobase_xa_prepare(
 /*================*/
-        handlerton*	hton,	/*!< in: InnoDB handlerton */
-	THD*		thd,	/*!< in: handle to the MySQL thread of
-				the user whose XA transaction should
-				be prepared */
-	bool		all)	/*!< in: TRUE - commit transaction
-				FALSE - the current SQL statement
-				ended */
+	handlerton*	hton,		/*!< in: InnoDB handlerton */
+	THD*		thd,		/*!< in: handle to the MySQL thread of
+					the user whose XA transaction should
+					be prepared */
+	bool		prepare_trx)	/*!< in: true - prepare transaction
+					false - the current SQL statement
+					ended */
 {
 	int error = 0;
 	trx_t* trx = check_trx_exists(thd);
@@ -10565,10 +12619,12 @@ innobase_xa_prepare(
 	thd_get_xid(thd, (MYSQL_XID*) &trx->xid);
 
 	/* Release a possible FIFO ticket and search latch. Since we will
-	reserve the kernel mutex, we have to release the search system latch
-	first to obey the latching order. */
+	reserve the trx_sys->mutex, we have to release the search system
+	latch first to obey the latching order. */
+
+	trx_search_latch_release_if_reserved(trx);
 
-	innobase_release_stat_resources(trx);
+	innobase_srv_conc_force_exit_innodb(trx);
 
 	if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
 
@@ -10576,7 +12632,7 @@ innobase_xa_prepare(
 				"but transaction is active");
 	}
 
-	if (all
+	if (prepare_trx
 	    || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
 
 		/* We were instructed to prepare the whole transaction, or
@@ -10584,7 +12640,9 @@ innobase_xa_prepare(
 
 		ut_ad(trx_is_registered_for_2pc(trx));
 
-		error = (int) trx_prepare_for_mysql(trx);
+		trx_prepare_for_mysql(trx);
+
+		error = 0;
 	} else {
 		/* We just mark the SQL statement ended and do not do a
 		transaction prepare */
@@ -10592,7 +12650,7 @@ innobase_xa_prepare(
 		/* If we had reserved the auto-inc lock for some
 		table in this SQL statement we release it now */
 
-		row_unlock_table_autoinc_for_mysql(trx);
+		lock_unlock_table_autoinc(trx);
 
 		/* Store the current undo_no of the transaction so that we
 		know where to roll back if we have to roll back the next
@@ -10693,8 +12751,8 @@ static
 void*
 innobase_create_cursor_view(
 /*========================*/
-        handlerton *hton, /*!< in: innobase hton */
-	THD* thd)	  /*!< in: user thread handle */
+	handlerton*	hton,	/*!< in: innobase hton */
+	THD*		thd)	/*!< in: user thread handle */
 {
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
@@ -10709,9 +12767,9 @@ static
 void
 innobase_close_cursor_view(
 /*=======================*/
-        handlerton *hton,
-	THD*	thd,	/*!< in: user thread handle */
-	void*	curview)/*!< in: Consistent read view to be closed */
+	handlerton*	hton,	/*!< in: innobase hton */
+	THD*		thd,	/*!< in: user thread handle */
+	void*		curview)/*!< in: Consistent read view to be closed */
 {
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
@@ -10728,9 +12786,9 @@ static
 void
 innobase_set_cursor_view(
 /*=====================*/
-        handlerton *hton,
-	THD*	thd,	/*!< in: user thread handle */
-	void*	curview)/*!< in: Consistent cursor view to be set */
+	handlerton*	hton,	/*!< in: innobase hton */
+	THD*		thd,	/*!< in: user thread handle */
+	void*		curview)/*!< in: Consistent cursor view to be set */
 {
 	DBUG_ASSERT(hton == innodb_hton_ptr);
 
@@ -10798,17 +12856,16 @@ column_is_being_renamed(
 	return(false);
 }
 
-/***********************************************************************
+/*******************************************************************//**
 Check whether a column in table "table" is being renamed and if this column
 is part of a foreign key, either part of another table, referencing this
-table or part of this table, referencing another table. */
+table or part of this table, referencing another table.
+@return true if a column that participates in a foreign key definition
+is being renamed */
 static
 bool
 foreign_key_column_is_being_renamed(
 /*================================*/
-					/* out: true if a column that
-					participates in a foreign key definition
-					is being renamed */
 	row_prebuilt_t*	prebuilt,	/* in: InnoDB prebuilt struct */
 	TABLE*		table)		/* in: MySQL table */
 {
@@ -10857,9 +12914,12 @@ foreign_key_column_is_being_renamed(
 	return(false);
 }
 
+/*******************************************************************//**
+*/
 UNIV_INTERN
 bool
 ha_innobase::check_if_incompatible_data(
+/*====================================*/
 	HA_CREATE_INFO*	info,
 	uint		table_changes)
 {
@@ -10881,14 +12941,14 @@ ha_innobase::check_if_incompatible_data(
 	currently we can just request a table rebuild/copy by returning
 	COMPATIBLE_DATA_NO */
 	if (check_column_being_renamed(table, NULL)) {
-		return COMPATIBLE_DATA_NO;
+		return(COMPATIBLE_DATA_NO);
 	}
 
 	/* Check if a column participating in a foreign key is being renamed.
 	There is no mechanism for updating InnoDB foreign key definitions. */
 	if (foreign_key_column_is_being_renamed(prebuilt, table)) {
 
-		return COMPATIBLE_DATA_NO;
+		return(COMPATIBLE_DATA_NO);
 	}
 
 	/* Check that row format didn't change */
@@ -10928,13 +12988,13 @@ innobase_file_format_name_lookup(
 	/* Check for valid parse. */
 	if (*endp == '\0' && *format_name != '\0') {
 
-		if (format_id <= DICT_TF_FORMAT_MAX) {
+		if (format_id <= UNIV_FORMAT_MAX) {
 
 			return(format_id);
 		}
 	} else {
 
-		for (format_id = 0; format_id <= DICT_TF_FORMAT_MAX;
+		for (format_id = 0; format_id <= UNIV_FORMAT_MAX;
 		     format_id++) {
 			const char*	name;
 
@@ -10947,7 +13007,7 @@ innobase_file_format_name_lookup(
 		}
 	}
 
-	return(DICT_TF_FORMAT_MAX + 1);
+	return(UNIV_FORMAT_MAX + 1);
 }
 
 /************************************************************//**
@@ -10964,7 +13024,7 @@ innobase_file_format_validate_and_set(
 
 	format_id = innobase_file_format_name_lookup(format_max);
 
-	if (format_id < DICT_TF_FORMAT_MAX + 1) {
+	if (format_id < UNIV_FORMAT_MAX + 1) {
 		srv_max_file_format_at_startup = format_id;
 
 		return((int) format_id);
@@ -11003,7 +13063,7 @@ innodb_file_format_name_validate(
 		format_id = innobase_file_format_name_lookup(
 			file_format_input);
 
-		if (format_id <= DICT_TF_FORMAT_MAX) {
+		if (format_id <= UNIV_FORMAT_MAX) {
 
 			/* Save a pointer to the name in the
 			'file_format_name_map' constant array. */
@@ -11045,7 +13105,7 @@ innodb_file_format_name_update(
 
 		format_id = innobase_file_format_name_lookup(format_name);
 
-		if (format_id <= DICT_TF_FORMAT_MAX) {
+		if (format_id <= UNIV_FORMAT_MAX) {
 			srv_file_format = format_id;
 		}
 	}
@@ -11053,6 +13113,7 @@ innodb_file_format_name_update(
 	*static_cast<const char**>(var_ptr)
 		= trx_sys_file_format_id_to_name(srv_file_format);
 }
+
 /*************************************************************//**
 Check if valid argument to innodb_file_format_max. This function
 is registered as a callback with MySQL.
@@ -11088,19 +13149,19 @@ innodb_file_format_max_validate(
 			'file_format_name_map' constant array. */
 			*static_cast<const char**>(save) =
 			    trx_sys_file_format_id_to_name(
-						(uint)format_id);
+						(uint) format_id);
 
 			return(0);
 
 		} else {
 			push_warning_printf(thd,
-			  MYSQL_ERROR::WARN_LEVEL_WARN,
+			  Sql_condition::WARN_LEVEL_WARN,
 			  ER_WRONG_ARGUMENTS,
 			  "InnoDB: invalid innodb_file_format_max "
 			  "value; can be any format up to %s "
 			  "or equivalent id of %d",
-			  trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX),
-			  DICT_TF_FORMAT_MAX);
+			  trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX),
+			  UNIV_FORMAT_MAX);
 		}
 	}
 
@@ -11115,13 +13176,13 @@ static
 void
 innodb_file_format_max_update(
 /*==========================*/
-	THD*				thd,		/*!< in: thread handle */
-	struct st_mysql_sys_var*	var,		/*!< in: pointer to
-							system variable */
-	void*				var_ptr,	/*!< out: where the
-							formal string goes */
-	const void*			save)		/*!< in: immediate result
-							from check function */
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
 {
 	const char*	format_name_in;
 	const char**	format_name_out;
@@ -11139,9 +13200,9 @@ innodb_file_format_max_update(
 
 	format_id = innobase_file_format_name_lookup(format_name_in);
 
-	if (format_id > DICT_TF_FORMAT_MAX) {
+	if (format_id > UNIV_FORMAT_MAX) {
 		/* DEFAULT is "on", which is invalid at runtime. */
-		push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 				    ER_WRONG_ARGUMENTS,
 				    "Ignoring SET innodb_file_format=%s",
 				    format_name_in);
@@ -11159,6 +13220,206 @@ innodb_file_format_max_update(
 	}
 }
 
+/*************************************************************//**
+Check whether valid argument given to innobase_*_stopword_table.
+This function is registered as a callback with MySQL.
+@return 0 for valid stopword table */
+static
+int
+innodb_stopword_table_validate(
+/*===========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	const char*	stopword_table_name;
+	char		buff[STRING_BUFFER_USUAL_SIZE];
+	int		len = sizeof(buff);
+	trx_t*		trx;
+	int		ret = 1;
+
+	ut_a(save != NULL);
+	ut_a(value != NULL);
+
+	stopword_table_name = value->val_str(value, buff, &len);
+
+	trx = check_trx_exists(thd);
+
+	row_mysql_lock_data_dictionary(trx);
+
+	/* Validate the stopword table's (if supplied) existence and
+	of the right format */
+	if (!stopword_table_name
+	    || fts_valid_stopword_table(stopword_table_name)) {
+		*static_cast<const char**>(save) = stopword_table_name;
+		ret = 0;
+	}
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	return(ret);
+}
+
+/****************************************************************//**
+Update global variable fts_server_stopword_table with the "saved"
+stopword table name value. This function is registered as a callback
+with MySQL. */
+static
+void
+innodb_stopword_table_update(
+/*=========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	const char*	stopword_table_name;
+	char*		old;
+
+	ut_a(save != NULL);
+	ut_a(var_ptr != NULL);
+
+	stopword_table_name = *static_cast<const char*const*>(save);
+	old = *(char**) var_ptr;
+
+	if (stopword_table_name) {
+		*(char**) var_ptr =  my_strdup(stopword_table_name,  MYF(0));
+	} else {
+		*(char**) var_ptr = NULL;
+	}
+
+	if (old) {
+		my_free(old);
+	}
+
+	fts_server_stopword_table = *(char**) var_ptr;
+}
+
+/*************************************************************//**
+Check whether valid argument given to "innodb_fts_internal_tbl_name"
+This function is registered as a callback with MySQL.
+@return 0 for valid stopword table */
+static
+int
+innodb_internal_table_validate(
+/*===========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	const char*	table_name;
+	char		buff[STRING_BUFFER_USUAL_SIZE];
+	int		len = sizeof(buff);
+	int		ret = 1;
+	dict_table_t*	user_table;
+
+	ut_a(save != NULL);
+	ut_a(value != NULL);
+
+	table_name = value->val_str(value, buff, &len);
+
+	if (!table_name) {
+		*static_cast<const char**>(save) = NULL;
+		return(0);
+	}
+
+	user_table = dict_table_open_on_name_no_stats(
+			table_name, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (user_table) {
+		if (dict_table_has_fts_index(user_table)) {
+			*static_cast<const char**>(save) = table_name;
+			ret = 0;
+		}
+
+		dict_table_close(user_table, FALSE);
+	}
+
+	return(ret);
+}
+
+/****************************************************************//**
+Update global variable "fts_internal_tbl_name" with the "saved"
+stopword table name value. This function is registered as a callback
+with MySQL. */
+static
+void
+innodb_internal_table_update(
+/*=========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	const char*	table_name;
+	char*		old;
+
+	ut_a(save != NULL);
+	ut_a(var_ptr != NULL);
+
+	table_name = *static_cast<const char*const*>(save);
+	old = *(char**) var_ptr;
+
+	if (table_name) {
+		*(char**) var_ptr =  my_strdup(table_name,  MYF(0));
+	} else {
+		*(char**) var_ptr = NULL;
+	}
+
+	if (old) {
+		my_free(old);
+	}
+
+	fts_internal_tbl_name = *(char**) var_ptr;
+}
+
+/****************************************************************//**
+Update the session variable innodb_session_stopword_table
+with the "saved" stopword table name value. This function
+is registered as a callback with MySQL. */
+static
+void
+innodb_session_stopword_update(
+/*===========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	const char*	stopword_table_name;
+	char*		old;
+
+	ut_a(save != NULL);
+	ut_a(var_ptr != NULL);
+
+	stopword_table_name = *static_cast<const char*const*>(save);
+	old = *(char**) var_ptr;
+
+	if (stopword_table_name) {
+		*(char**) var_ptr =  my_strdup(stopword_table_name,  MYF(0));
+	} else {
+		*(char**) var_ptr = NULL;
+	}
+
+	if (old) {
+		my_free(old);
+	}
+}
 /****************************************************************//**
 Update the system variable innodb_adaptive_hash_index using the "saved"
 value. This function is registered as a callback with MySQL. */
@@ -11166,13 +13427,13 @@ static
 void
 innodb_adaptive_hash_index_update(
 /*==============================*/
-	THD*				thd,		/*!< in: thread handle */
-	struct st_mysql_sys_var*	var,		/*!< in: pointer to
-							system variable */
-	void*				var_ptr,	/*!< out: where the
-							formal string goes */
-	const void*			save)		/*!< in: immediate result
-							from check function */
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
 {
 	if (*(my_bool*) save) {
 		btr_search_enable();
@@ -11200,6 +13461,27 @@ innodb_old_blocks_pct_update(
 		*static_cast<const uint*>(save), TRUE);
 }
 
+/****************************************************************//**
+Update the system variable innodb_old_blocks_pct using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_change_buffer_max_size_update(
+/*=================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	innobase_change_buffer_max_size =
+			(*static_cast<const uint*>(save));
+	ibuf_max_size_update(innobase_change_buffer_max_size);
+}
+
+
 /*************************************************************//**
 Find the corresponding ibuf_use_t value that indexes into
 innobase_change_buffering_values[] array for the input
@@ -11220,7 +13502,7 @@ innodb_find_change_buffering_value(
 		/* found a match */
 		if (!innobase_strcasecmp(
 			input_name, innobase_change_buffering_values[use])) {
-			return((ibuf_use_t)use);
+			return((ibuf_use_t) use);
 		}
 	}
 
@@ -11302,21 +13584,574 @@ innodb_change_buffering_update(
 		 *static_cast<const char*const*>(save);
 }
 
-static int show_innodb_vars(THD *thd, SHOW_VAR *var, char *buff)
+/*************************************************************//**
+Just emit a warning that the usage of the variable is deprecated.
+@return	0 */
+static
+void
+innodb_stats_sample_pages_update(
+/*=============================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
 {
-  innodb_export_status();
-  var->type= SHOW_ARRAY;
-  var->value= (char *) &innodb_status_variables;
-  return 0;
+#define STATS_SAMPLE_PAGES_DEPRECATED_MSG \
+	"Using innodb_stats_sample_pages is deprecated and " \
+	"the variable may be removed in future releases. " \
+	"Please use innodb_stats_transient_sample_pages " \
+	"instead."
+
+	push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
+		     HA_ERR_WRONG_COMMAND, STATS_SAMPLE_PAGES_DEPRECATED_MSG);
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: Warning: %s\n",
+		STATS_SAMPLE_PAGES_DEPRECATED_MSG);
+
+	srv_stats_transient_sample_pages =
+		*static_cast<const unsigned long long*>(save);
 }
 
-/*********************************************************************//**
+/****************************************************************//**
+Update the monitor counter according to the "set_option",  turn
+on/off or reset specified monitor counter. */
+static
+void
+innodb_monitor_set_option(
+/*======================*/
+	const monitor_info_t* monitor_info,/*!< in: monitor info for the monitor
+					to set */
+	mon_option_t	set_option)	/*!< in: Turn on/off reset the
+					counter */
+{
+	monitor_id_t	monitor_id = monitor_info->monitor_id;
+
+	/* If module type is MONITOR_GROUP_MODULE, it cannot be
+	turned on/off individually. It should never use this
+	function to set options */
+	ut_a(!(monitor_info->monitor_type & MONITOR_GROUP_MODULE));
+
+	switch (set_option) {
+	case MONITOR_TURN_ON:
+		MONITOR_ON(monitor_id);
+		MONITOR_INIT(monitor_id);
+		MONITOR_SET_START(monitor_id);
+
+		/* If the monitor to be turned on uses
+		exisitng monitor counter (status variable),
+		make special processing to remember existing
+		counter value. */
+		if (monitor_info->monitor_type
+		    & MONITOR_EXISTING) {
+			srv_mon_process_existing_counter(
+				monitor_id, MONITOR_TURN_ON);
+		}
+		break;
+
+	case MONITOR_TURN_OFF:
+		if (monitor_info->monitor_type & MONITOR_EXISTING) {
+			srv_mon_process_existing_counter(
+				monitor_id, MONITOR_TURN_OFF);
+		}
+
+		MONITOR_OFF(monitor_id);
+		MONITOR_SET_OFF(monitor_id);
+		break;
+
+	case MONITOR_RESET_VALUE:
+		srv_mon_reset(monitor_id);
+		break;
+
+	case MONITOR_RESET_ALL_VALUE:
+		srv_mon_reset_all(monitor_id);
+		break;
+
+	default:
+		ut_error;
+	}
+}
+
+/****************************************************************//**
+Find matching InnoDB monitor counters and update their status
+according to the "set_option",  turn on/off or reset specified
+monitor counter. */
+static
+void
+innodb_monitor_update_wildcard(
+/*===========================*/
+	const char*	name,		/*!< in: monitor name to match */
+	mon_option_t	set_option)	/*!< in: the set option, whether
+					to turn on/off or reset the counter */
+{
+	ut_a(name);
+
+	for (ulint use = 0; use < NUM_MONITOR; use++) {
+		ulint		type;
+		monitor_id_t	monitor_id = static_cast<monitor_id_t>(use);
+		monitor_info_t*	monitor_info;
+
+		if (!innobase_wildcasecmp(
+			srv_mon_get_name(monitor_id), name)) {
+			monitor_info = srv_mon_get_info(monitor_id);
+
+			type = monitor_info->monitor_type;
+
+			/* If the monitor counter is of MONITOR_MODULE
+			type, skip it. Except for those also marked with
+			MONITOR_GROUP_MODULE flag, which can be turned
+			on only as a module. */
+			if (!(type & MONITOR_MODULE)
+			     && !(type & MONITOR_GROUP_MODULE)) {
+				innodb_monitor_set_option(monitor_info,
+							  set_option);
+			}
+
+			/* Need to special handle counters marked with
+			MONITOR_GROUP_MODULE, turn on the whole module if
+			any one of it comes here. Currently, only
+			"module_buf_page" is marked with MONITOR_GROUP_MODULE */
+			if (type & MONITOR_GROUP_MODULE) {
+				if ((monitor_id >= MONITOR_MODULE_BUF_PAGE)
+				     && (monitor_id < MONITOR_MODULE_OS)) {
+					if (set_option == MONITOR_TURN_ON
+					    && MONITOR_IS_ON(
+						MONITOR_MODULE_BUF_PAGE)) {
+						continue;
+					}
+
+					srv_mon_set_module_control(
+						MONITOR_MODULE_BUF_PAGE,
+						set_option);
+				} else {
+					/* If new monitor is added with
+					MONITOR_GROUP_MODULE, it needs
+					to be added here. */
+					ut_ad(0);
+				}
+			}
+		}
+	}
+}
+
+/*************************************************************//**
+Given a configuration variable name, find corresponding monitor counter
+and return its monitor ID if found.
+@return	monitor ID if found, MONITOR_NO_MATCH if there is no match */
+static
+ulint
+innodb_monitor_id_by_name_get(
+/*==========================*/
+	const char*	name)	/*!< in: monitor counter namer */
+{
+	ut_a(name);
+
+	/* Search for wild character '%' in the name, if
+	found, we treat it as a wildcard match. We do not search for
+	single character wildcard '_' since our monitor names already contain
+	such character. To avoid confusion, we request user must include
+	at least one '%' character to activate the wildcard search. */
+	if (strchr(name, '%')) {
+		return(MONITOR_WILDCARD_MATCH);
+	}
+
+	/* Not wildcard match, check for an exact match */
+	for (ulint i = 0; i < NUM_MONITOR; i++) {
+		if (!innobase_strcasecmp(
+			name, srv_mon_get_name(static_cast<monitor_id_t>(i)))) {
+			return(i);
+		}
+	}
+
+	return(MONITOR_NO_MATCH);
+}
+/*************************************************************//**
+Validate that the passed in monitor name matches at least one
+monitor counter name with wildcard compare.
+@return	TRUE if at least one monitor name matches */
+static
+ibool
+innodb_monitor_validate_wildcard_name(
+/*==================================*/
+	const char*	name)	/*!< in: monitor counter namer */
+{
+	for (ulint i = 0; i < NUM_MONITOR; i++) {
+		if (!innobase_wildcasecmp(
+			srv_mon_get_name(static_cast<monitor_id_t>(i)), name)) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+/*************************************************************//**
+Validate the passed in monitor name, find and save the
+corresponding monitor name in the function parameter "save".
+@return	0 if monitor name is valid */
+static
+int
+innodb_monitor_valid_byname(
+/*========================*/
+	void*			save,	/*!< out: immediate result
+					for update function */
+	const char*		name)	/*!< in: incoming monitor name */
+{
+	ulint		use;
+	monitor_info_t*	monitor_info;
+
+	if (!name) {
+		return(1);
+	}
+
+	use = innodb_monitor_id_by_name_get(name);
+
+	/* No monitor name matches, nor it is wildcard match */
+	if (use == MONITOR_NO_MATCH) {
+		return(1);
+	}
+
+	if (use < NUM_MONITOR) {
+		monitor_info = srv_mon_get_info((monitor_id_t) use);
+
+		/* If the monitor counter is marked with
+		MONITOR_GROUP_MODULE flag, then this counter
+		cannot be turned on/off individually, instead
+		it shall be turned on/off as a group using
+		its module name */
+		if ((monitor_info->monitor_type & MONITOR_GROUP_MODULE)
+		    && (!(monitor_info->monitor_type & MONITOR_MODULE))) {
+			sql_print_warning(
+				"Monitor counter '%s' cannot"
+				" be turned on/off individually."
+				" Please use its module name"
+				" to turn on/off the counters"
+				" in the module as a group.\n",
+				name);
+
+			return(1);
+		}
+
+	} else {
+		ut_a(use == MONITOR_WILDCARD_MATCH);
+
+		/* For wildcard match, if there is not a single monitor
+		counter name that matches, treat it as an invalid
+		value for the system configuration variables */
+		if (!innodb_monitor_validate_wildcard_name(name)) {
+			return(1);
+		}
+	}
+
+	/* Save the configure name for innodb_monitor_update() */
+	*static_cast<const char**>(save) = name;
+
+	return(0);
+}
+/*************************************************************//**
+Validate passed-in "value" is a valid monitor counter name.
+This function is registered as a callback with MySQL.
+@return	0 for valid name */
+static
+int
+innodb_monitor_validate(
+/*====================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	const char*	name;
+	char*		monitor_name;
+	char		buff[STRING_BUFFER_USUAL_SIZE];
+	int		len = sizeof(buff);
+	int		ret;
+
+	ut_a(save != NULL);
+	ut_a(value != NULL);
+
+	name = value->val_str(value, buff, &len);
+
+	/* monitor_name could point to memory from MySQL
+	or buff[]. Always dup the name to memory allocated
+	by InnoDB, so we can access it in another callback
+	function innodb_monitor_update() and free it appropriately */
+	if (name) {
+		monitor_name = my_strdup(name, MYF(0));
+	} else {
+		return(1);
+	}
+
+	ret = innodb_monitor_valid_byname(save, monitor_name);
+
+	if (ret) {
+		/* Validation failed */
+		my_free(monitor_name);
+	} else {
+		/* monitor_name will be freed in separate callback function
+		innodb_monitor_update(). Assert "save" point to
+		the "monitor_name" variable */
+		ut_ad(*static_cast<char**>(save) == monitor_name);
+	}
+
+	return(ret);
+}
+
+/****************************************************************//**
+Update the system variable innodb_enable(disable/reset/reset_all)_monitor
+according to the "set_option" and turn on/off or reset specified monitor
+counter. */
+static
+void
+innodb_monitor_update(
+/*==================*/
+	THD*			thd,		/*!< in: thread handle */
+	void*			var_ptr,	/*!< out: where the
+						formal string goes */
+	const void*		save,		/*!< in: immediate result
+						from check function */
+	mon_option_t		set_option,	/*!< in: the set option,
+						whether to turn on/off or
+						reset the counter */
+	ibool			free_mem)	/*!< in: whether we will
+						need to free the memory */
+{
+	monitor_info_t*	monitor_info;
+	ulint		monitor_id;
+	ulint		err_monitor = 0;
+	const char*	name;
+
+	ut_a(save != NULL);
+
+	name = *static_cast<const char*const*>(save);
+
+	if (!name) {
+		monitor_id = MONITOR_DEFAULT_START;
+	} else {
+		monitor_id = innodb_monitor_id_by_name_get(name);
+
+		/* Double check we have a valid monitor ID */
+		if (monitor_id == MONITOR_NO_MATCH) {
+			return;
+		}
+	}
+
+	if (monitor_id == MONITOR_DEFAULT_START) {
+		/* If user set the variable to "default", we will
+		print a message and make this set operation a "noop".
+		The check is being made here is because "set default"
+		does not go through validation function */
+		if (thd) {
+			push_warning_printf(
+				thd, Sql_condition::WARN_LEVEL_WARN,
+				ER_NO_DEFAULT,
+				"Default value is not defined for "
+				"this set option. Please specify "
+				"correct counter or module name.");
+		} else {
+			sql_print_error(
+				"Default value is not defined for "
+				"this set option. Please specify "
+				"correct counter or module name.\n");
+		}
+
+		if (var_ptr) {
+			*(const char**) var_ptr = NULL;
+		}
+	} else if (monitor_id == MONITOR_WILDCARD_MATCH) {
+		innodb_monitor_update_wildcard(name, set_option);
+	} else {
+		monitor_info = srv_mon_get_info(
+			static_cast<monitor_id_t>(monitor_id));
+
+		ut_a(monitor_info);
+
+		/* If monitor is already truned on, someone could already
+		collect monitor data, exit and ask user to turn off the
+		monitor before turn it on again. */
+		if (set_option == MONITOR_TURN_ON
+		    && MONITOR_IS_ON(monitor_id)) {
+			err_monitor = monitor_id;
+			goto exit;
+		}
+
+		if (var_ptr) {
+			*(const char**) var_ptr = monitor_info->monitor_name;
+		}
+
+		/* Depending on the monitor name is for a module or
+		a counter, process counters in the whole module or
+		individual counter. */
+		if (monitor_info->monitor_type & MONITOR_MODULE) {
+			srv_mon_set_module_control(
+				static_cast<monitor_id_t>(monitor_id),
+				set_option);
+		} else {
+			innodb_monitor_set_option(monitor_info, set_option);
+		}
+	}
+exit:
+	/* Only if we are trying to turn on a monitor that already
+	been turned on, we will set err_monitor. Print related
+	information */
+	if (err_monitor) {
+		sql_print_warning("Monitor %s is already enabled.",
+				  srv_mon_get_name((monitor_id_t) err_monitor));
+	}
+
+	if (free_mem && name) {
+		my_free((void*) name);
+	}
+
+	return;
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_enable and enable
+specified monitor counter.
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_enable_monitor_update(
+/*=========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_ON, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_disable and turn
+off specified monitor counter. */
+static
+void
+innodb_disable_monitor_update(
+/*==========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_OFF, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_reset and reset
+specified monitor counter(s).
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_reset_monitor_update(
+/*========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_VALUE, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_reset_all and reset
+all value related monitor counter.
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_reset_all_monitor_update(
+/*============================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_ALL_VALUE,
+			      TRUE);
+}
+
+/****************************************************************//**
+Parse and enable InnoDB monitor counters during server startup.
+User can list the monitor counters/groups to be enable by specifying
+"loose-innodb_monitor_enable=monitor_name1;monitor_name2..."
+in server configuration file or at the command line. The string
+separate could be ";", "," or empty space. */
+static
+void
+innodb_enable_monitor_at_startup(
+/*=============================*/
+	char*	str)	/*!< in/out: monitor counter enable list */
+{
+	static const char*	sep = " ;,";
+	char*			last;
+
+	ut_a(str);
+
+	/* Walk through the string, and separate each monitor counter
+	and/or counter group name, and calling innodb_monitor_update()
+	if successfully updated. Please note that the "str" would be
+	changed by strtok_r() as it walks through it. */
+	for (char* option = strtok_r(str, sep, &last);
+	     option;
+	     option = strtok_r(NULL, sep, &last)) {
+		ulint	ret;
+		char*	option_name;
+
+		ret = innodb_monitor_valid_byname(&option_name, option);
+
+		/* The name is validated if ret == 0 */
+		if (!ret) {
+			innodb_monitor_update(NULL, NULL, &option,
+					      MONITOR_TURN_ON, FALSE);
+		} else {
+			sql_print_warning("Invalid monitor counter"
+					  " name: '%s'", option);
+		}
+	}
+}
+
+/****************************************************************//**
+Callback function for accessing the InnoDB variables from MySQL:
+SHOW VARIABLES. */
+static
+int
+show_innodb_vars(
+/*=============*/
+	THD*		thd,
+	SHOW_VAR*	var,
+	char*		buff)
+{
+	innodb_export_status();
+	var->type = SHOW_ARRAY;
+	var->value = (char*) &innodb_status_variables;
+
+	return(0);
+}
+
+/****************************************************************//**
 This function checks each index name for a table against reserved
 system default primary index name 'GEN_CLUST_INDEX'. If a name
 matches, this function pushes an warning message to the client,
 and returns true.
 @return true if the index name matches the reserved name */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 bool
 innobase_index_name_is_reserved(
 /*============================*/
@@ -11335,7 +14170,7 @@ innobase_index_name_is_reserved(
 					innobase_index_reserve_name) == 0) {
 			/* Push warning to mysql */
 			push_warning_printf(thd,
-					    MYSQL_ERROR::WARN_LEVEL_WARN,
+					    Sql_condition::WARN_LEVEL_WARN,
 					    ER_WRONG_NAME_FOR_INDEX,
 					    "Cannot Create Index with name "
 					    "'%s'. The name is reserved "
@@ -11353,17 +14188,200 @@ innobase_index_name_is_reserved(
 	return(false);
 }
 
+/***********************************************************************
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_retrieve_ranking(
+/*============================*/
+		FT_INFO * fts_hdl)	/*!< in: FTS handler */
+{
+	row_prebuilt_t*	ft_prebuilt;
+	fts_result_t*	result;
+
+	result = ((NEW_FT_INFO*) fts_hdl)->ft_result;
+
+	ft_prebuilt = ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt;
+
+	/* Retrieve the ranking value for doc_id with value of
+	prebuilt->fts_doc_id */
+	return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id));
+}
+
+/***********************************************************************
+Free the memory for the FTS handler */
+UNIV_INTERN
+void
+innobase_fts_close_ranking(
+/*=======================*/
+		FT_INFO * fts_hdl)
+{
+	fts_result_t*	result;
+	row_prebuilt_t*	ft_prebuilt;
+
+	ft_prebuilt = ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt;
+
+	result = ((NEW_FT_INFO*) fts_hdl)->ft_result;
+
+	fts_query_free_result(result);
+
+	if (result == ft_prebuilt->result) {
+		ft_prebuilt->result = NULL;
+	}
+
+	my_free((uchar*) fts_hdl);
+
+	return;
+}
+
+/***********************************************************************
+Find and Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_find_ranking(
+/*======================*/
+		FT_INFO*	fts_hdl,	/*!< in: FTS handler */
+		uchar*		record,		/*!< in: Unused */
+		uint		len)		/*!< in: Unused */
+{
+	row_prebuilt_t*	ft_prebuilt;
+	fts_result_t*	result;
+
+	ft_prebuilt = ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt;
+	result = ((NEW_FT_INFO*) fts_hdl)->ft_result;
+
+	/* Retrieve the ranking value for doc_id with value of
+	prebuilt->fts_doc_id */
+	return fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id);
+}
+
+/* These variables are never read by InnoDB or changed. They are a kind of
+dummies that are needed by the MySQL infrastructure to call
+buffer_pool_dump_now(), buffer_pool_load_now() and buffer_pool_load_abort()
+by the user by doing:
+  SET GLOBAL innodb_buffer_pool_dump_now=ON;
+  SET GLOBAL innodb_buffer_pool_load_now=ON;
+  SET GLOBAL innodb_buffer_pool_load_abort=ON;
+Their values are read by MySQL and displayed to the user when the variables
+are queried, e.g.:
+  SELECT @@innodb_buffer_pool_dump_now;
+  SELECT @@innodb_buffer_pool_load_now;
+  SELECT @@innodb_buffer_pool_load_abort; */
+static my_bool	innodb_buffer_pool_dump_now = FALSE;
+static my_bool	innodb_buffer_pool_load_now = FALSE;
+static my_bool	innodb_buffer_pool_load_abort = FALSE;
+
+/****************************************************************//**
+Trigger a dump of the buffer pool if innodb_buffer_pool_dump_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_dump_now(
+/*=================*/
+	THD*				thd	/*!< in: thread handle */
+					__attribute__((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+					__attribute__((unused)),
+	void*				var_ptr	/*!< out: where the formal
+						string goes */
+					__attribute__((unused)),
+	const void*			save)	/*!< in: immediate result from
+						check function */
+{
+	if (*(my_bool*) save) {
+		buf_dump_start();
+	}
+}
+
+/****************************************************************//**
+Trigger a load of the buffer pool if innodb_buffer_pool_load_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_now(
+/*=================*/
+	THD*				thd	/*!< in: thread handle */
+					__attribute__((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+					__attribute__((unused)),
+	void*				var_ptr	/*!< out: where the formal
+						string goes */
+					__attribute__((unused)),
+	const void*			save)	/*!< in: immediate result from
+						check function */
+{
+	if (*(my_bool*) save) {
+		buf_load_start();
+	}
+}
+
+/****************************************************************//**
+Abort a load of the buffer pool if innodb_buffer_pool_load_abort
+is set to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_abort(
+/*===================*/
+	THD*				thd	/*!< in: thread handle */
+					__attribute__((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+					__attribute__((unused)),
+	void*				var_ptr	/*!< out: where the formal
+						string goes */
+					__attribute__((unused)),
+	const void*			save)	/*!< in: immediate result from
+						check function */
+{
+	if (*(my_bool*) save) {
+		buf_load_abort();
+	}
+}
+
 static SHOW_VAR innodb_status_variables_export[]= {
-  {"Innodb",                   (char*) &show_innodb_vars, SHOW_FUNC},
-  {NullS, NullS, SHOW_LONG}
+	{"Innodb", (char*) &show_innodb_vars, SHOW_FUNC},
+	{NullS, NullS, SHOW_LONG}
 };
 
 static struct st_mysql_storage_engine innobase_storage_engine=
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
 /* plugin options */
+
+static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm,
+  PLUGIN_VAR_RQCMDARG,
+  "The algorithm InnoDB uses for page checksumming. Possible values are "
+  "CRC32 (hardware accelerated if the CPU supports it) "
+    "write crc32, allow any of the other checksums to match when reading; "
+  "STRICT_CRC32 "
+    "write crc32, do not allow other algorithms to match when reading; "
+  "INNODB "
+    "write a software calculated checksum, allow any other checksums "
+    "to match when reading; "
+  "STRICT_INNODB "
+    "write a software calculated checksum, do not allow other algorithms "
+    "to match when reading; "
+  "NONE "
+    "write a constant magic number, do not do any checksum verification "
+    "when reading (same as innodb_checksums=OFF); "
+  "STRICT_NONE "
+    "write a constant magic number, do not allow values other than that "
+    "magic number when reading; "
+  "Files updated when this option is set to crc32 or strict_crc32 will "
+  "not be readable by MySQL versions older than 5.6.3",
+  NULL, NULL, SRV_CHECKSUM_ALGORITHM_INNODB,
+  &innodb_checksum_algorithm_typelib);
+
 static MYSQL_SYSVAR_BOOL(checksums, innobase_use_checksums,
   PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "DEPRECATED. Use innodb_checksum_algorithm=NONE instead of setting "
+  "this to OFF. "
   "Enable InnoDB checksums validation (enabled by default). "
   "Disable with --skip-innodb-checksums.",
   NULL, NULL, TRUE);
@@ -11382,31 +14400,31 @@ static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite,
 static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
   PLUGIN_VAR_RQCMDARG,
   "Number of IOPs the server can do. Tunes the background IO rate",
-  NULL, NULL, 200, 100, ~0L, 0);
+  NULL, NULL, 200, 100, ~0UL, 0);
 
 static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size,
   PLUGIN_VAR_OPCMDARG,
   "Number of UNDO log pages to purge in one batch from the history list.",
   NULL, NULL,
-  20,			/* Default setting */
+  300,			/* Default setting */
   1,			/* Minimum value */
   5000, 0);		/* Maximum value */
 
-static MYSQL_SYSVAR_ULONG(rollback_segments, srv_rollback_segments,
-  PLUGIN_VAR_OPCMDARG,
-  "Number of UNDO logs to use.",
+static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Purge threads can be from 0 to 32. Default is 0.",
   NULL, NULL,
-  128,			/* Default setting */
+  1,			/* Default setting */
   1,			/* Minimum value */
-  TRX_SYS_N_RSEGS, 0);	/* Maximum value */
+  32, 0);		/* Maximum value */
 
-static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads,
-  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
-  "Purge threads can be either 0 or 1.",
+static MYSQL_SYSVAR_ULONG(sync_array_size, srv_sync_array_size,
+  PLUGIN_VAR_OPCMDARG,
+  "Size of the mutex/lock wait array.",
   NULL, NULL,
-  0,			/* Default setting */
-  0,			/* Minimum value */
-  1, 0);		/* Maximum value */
+  1,			/* Default setting */
+  1,			/* Minimum value */
+  1024, 0);		/* Maximum value */
 
 static MYSQL_SYSVAR_ULONG(fast_shutdown, innobase_fast_shutdown,
   PLUGIN_VAR_OPCMDARG,
@@ -11438,7 +14456,7 @@ static MYSQL_SYSVAR_BOOL(file_format_check, innobase_file_format_check,
 
 /* If a new file format is introduced, the file format
 name needs to be updated accordingly. Please refer to
-file_format_name_map[] defined in trx0sys.c for the next
+file_format_name_map[] defined in trx0sys.cc for the next
 file format name. */
 static MYSQL_SYSVAR_STR(file_format_max, innobase_file_format_max,
   PLUGIN_VAR_OPCMDARG,
@@ -11446,6 +14464,13 @@ static MYSQL_SYSVAR_STR(file_format_max, innobase_file_format_max,
   innodb_file_format_max_validate,
   innodb_file_format_max_update, "Antelope");
 
+static MYSQL_SYSVAR_STR(ft_server_stopword_table, innobase_server_stopword_table,
+  PLUGIN_VAR_OPCMDARG,
+  "The user supplied stopword table name.",
+  innodb_stopword_table_validate,
+  innodb_stopword_table_update,
+  NULL);
+
 static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
   PLUGIN_VAR_OPCMDARG,
   "Set to 0 (write and flush once per second),"
@@ -11469,6 +14494,8 @@ static MYSQL_SYSVAR_BOOL(force_load_corrupted, srv_load_corrupted,
 
 static MYSQL_SYSVAR_BOOL(locks_unsafe_for_binlog, innobase_locks_unsafe_for_binlog,
   PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "DEPRECATED. This option may be removed in future releases. "
+  "Please use READ COMMITTED transaction isolation level instead. "
   "Force InnoDB to not use next-key locking, to use only row-level locking.",
   NULL, NULL, FALSE);
 
@@ -11499,8 +14526,16 @@ static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing,
 static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag,
   PLUGIN_VAR_RQCMDARG,
   "Desired maximum length of the purge queue (0 = no limit)",
-  NULL, NULL, 0, 0, ~0L, 0);
+  NULL, NULL, 0, 0, ~0UL, 0);
 
+static MYSQL_SYSVAR_ULONG(max_purge_lag_delay, srv_max_purge_lag_delay,
+   PLUGIN_VAR_RQCMDARG,
+   "Maximum delay of user threads in micro-seconds",
+   NULL, NULL, 
+   0L,			/* Default seting */
+   0L,			/* Minimum value */
+   10000000UL, 0);	/* Maximum value */
+ 
 static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
   "Roll back the complete transaction on lock wait timeout, for 4.x compatibility (disabled by default)",
@@ -11508,7 +14543,7 @@ static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout,
 
 static MYSQL_SYSVAR_BOOL(status_file, innobase_create_status_file,
   PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_NOSYSVAR,
-  "Enable SHOW INNODB STATUS output in the innodb_status.<pid> file",
+  "Enable SHOW ENGINE INNODB STATUS output in the innodb_status.<pid> file",
   NULL, NULL, FALSE);
 
 static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata,
@@ -11516,11 +14551,25 @@ static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata,
   "Enable statistics gathering for metadata commands such as SHOW TABLE STATUS (on by default)",
   NULL, NULL, TRUE);
 
-static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_sample_pages,
+static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_transient_sample_pages,
   PLUGIN_VAR_RQCMDARG,
-  "The number of index pages to sample when calculating statistics (default 8)",
+  "Deprecated, use innodb_stats_transient_sample_pages instead",
+  NULL, innodb_stats_sample_pages_update, 8, 1, ~0ULL, 0);
+
+static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages,
+  srv_stats_transient_sample_pages,
+  PLUGIN_VAR_RQCMDARG,
+  "The number of leaf index pages to sample when calculating transient "
+  "statistics (if persistent statistics are not used, default 8)",
   NULL, NULL, 8, 1, ~0ULL, 0);
 
+static MYSQL_SYSVAR_ULONGLONG(stats_persistent_sample_pages,
+  srv_stats_persistent_sample_pages,
+  PLUGIN_VAR_RQCMDARG,
+  "The number of leaf index pages to sample when calculating persistent "
+  "statistics (by ANALYZE, default 20)",
+  NULL, NULL, 20, 1, ~0ULL, 0);
+
 static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled,
   PLUGIN_VAR_OPCMDARG,
   "Enable InnoDB adaptive hash index (enabled by default).  "
@@ -11535,6 +14584,9 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay,
 
 static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "DEPRECATED. This option may be removed in future releases, "
+  "together with the option innodb_use_sys_malloc and with the InnoDB's "
+  "internal memory allocator. "
   "Size of a memory pool InnoDB uses to store data dictionary information and other internal data structures.",
   NULL, NULL, 8*1024*1024L, 512*1024L, LONG_MAX, 1024);
 
@@ -11548,11 +14600,64 @@ static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
   "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
   NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L);
 
+#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
+static MYSQL_SYSVAR_ULONG(page_hash_locks, srv_n_page_hash_locks,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Number of rw_locks protecting buffer pool page_hash. Rounded up to the next power of 2",
+  NULL, NULL, 16, 1, MAX_PAGE_HASH_LOCKS, 0);
+
+static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Number of pages reserved in doublewrite buffer for batch flushing",
+  NULL, NULL, 120, 1, 127, 0);
+#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
+
 static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
   NULL, NULL, 1L, 1L, MAX_BUFFER_POOLS, 1L);
 
+static MYSQL_SYSVAR_STR(buffer_pool_filename, srv_buf_dump_filename,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
+  "Filename to/from which to dump/load the InnoDB buffer pool",
+  NULL, NULL, SRV_BUF_DUMP_FILENAME_DEFAULT);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_dump_now, innodb_buffer_pool_dump_now,
+  PLUGIN_VAR_RQCMDARG,
+  "Trigger an immediate dump of the buffer pool into a file named @@innodb_buffer_pool_filename",
+  NULL, buffer_pool_dump_now, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_dump_at_shutdown, srv_buffer_pool_dump_at_shutdown,
+  PLUGIN_VAR_RQCMDARG,
+  "Dump the buffer pool into a file named @@innodb_buffer_pool_filename",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_now, innodb_buffer_pool_load_now,
+  PLUGIN_VAR_RQCMDARG,
+  "Trigger an immediate load of the buffer pool from a file named @@innodb_buffer_pool_filename",
+  NULL, buffer_pool_load_now, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_abort, innodb_buffer_pool_load_abort,
+  PLUGIN_VAR_RQCMDARG,
+  "Abort a currently running load of the buffer pool",
+  NULL, buffer_pool_load_abort, FALSE);
+
+/* there is no point in changing this during runtime, thus readonly */
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_startup,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Load the buffer pool from a file named @@innodb_buffer_pool_filename",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth,
+  PLUGIN_VAR_RQCMDARG,
+  "How deep to scan LRU to keep it clean",
+  NULL, NULL, 1024, 100, ~0UL, 0);
+
+static MYSQL_SYSVAR_BOOL(flush_neighbors, srv_flush_neighbors,
+  PLUGIN_VAR_NOCMDARG,
+  "Flush neighbors from buffer pool when flushing a block.",
+  NULL, NULL, TRUE);
+
 static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency,
   PLUGIN_VAR_RQCMDARG,
   "Helps in performance tuning in heavily concurrent environments.",
@@ -11561,13 +14666,65 @@ static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency,
 static MYSQL_SYSVAR_ULONG(concurrency_tickets, srv_n_free_tickets_to_enter,
   PLUGIN_VAR_RQCMDARG,
   "Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket",
-  NULL, NULL, 500L, 1L, ~0L, 0);
+  NULL, NULL, 500L, 1L, ~0UL, 0);
 
 static MYSQL_SYSVAR_LONG(file_io_threads, innobase_file_io_threads,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
   "Number of file I/O threads in InnoDB.",
   NULL, NULL, 4, 4, 64, 0);
 
+static MYSQL_SYSVAR_BOOL(ft_enable_diag_print, fts_enable_diag_print,
+  PLUGIN_VAR_OPCMDARG,
+  "Whether to enable additional FTS diagnostic printout ",
+  NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache,
+  PLUGIN_VAR_OPCMDARG,
+  "Whether to disable OS system file cache for sort I/O",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name,
+  PLUGIN_VAR_NOCMDARG,
+  "FTS internal auxiliary table to be checked",
+  innodb_internal_table_validate,
+  innodb_internal_table_update, NULL);
+
+static MYSQL_SYSVAR_ULONG(ft_cache_size, fts_max_cache_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "InnoDB Fulltext search cache size in bytes",
+  NULL, NULL, 32000000, 1600000, 80000000, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_min_token_size, fts_min_token_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "InnoDB Fulltext search minimum token size in characters",
+  NULL, NULL, 3, 0, 16, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_max_token_size, fts_max_token_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "InnoDB Fulltext search maximum token size in characters",
+  NULL, NULL, HA_FT_MAXCHARLEN, 10, FTS_MAX_WORD_LEN , 0);
+
+
+static MYSQL_SYSVAR_ULONG(ft_num_word_optimize, fts_num_word_optimize,
+  PLUGIN_VAR_OPCMDARG,
+  "InnoDB Fulltext search number of words to optimize for each optimize table call ",
+  NULL, NULL, 2000, 1000, 10000, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_sort_pll_degree, fts_sort_pll_degree,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "InnoDB Fulltext search parallel sort degree, will round up to nearest power of 2 number",
+  NULL, NULL, 2, 1, 16, 0);
+
+static MYSQL_SYSVAR_ULONG(sort_buffer_size, srv_sort_buf_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Memory buffer size for index creation",
+  NULL, NULL, 1048576, 524288, 64<<20, 0);
+
+static MYSQL_SYSVAR_BOOL(optimize_fulltext_only, innodb_optimize_fulltext_only,
+  PLUGIN_VAR_NOCMDARG,
+  "Only optimize the Fulltext index of the table",
+  NULL, NULL, FALSE);
+
 static MYSQL_SYSVAR_ULONG(read_io_threads, innobase_read_io_threads,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Number of background read I/O threads in InnoDB.",
@@ -11583,6 +14740,12 @@ static MYSQL_SYSVAR_LONG(force_recovery, innobase_force_recovery,
   "Helps to save your data in case the disk image of the database becomes corrupt.",
   NULL, NULL, 0, 0, 6, 0);
 
+static MYSQL_SYSVAR_ULONG(page_size, srv_page_size,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Page size to use for all InnoDB tablespaces.",
+  NULL, NULL, UNIV_PAGE_SIZE_DEF,
+  UNIV_PAGE_SIZE_MIN, UNIV_PAGE_SIZE_MAX, 0);
+
 static MYSQL_SYSVAR_LONG(log_buffer_size, innobase_log_buffer_size,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "The size of the buffer which InnoDB uses to write log to the log files on disk.",
@@ -11623,28 +14786,73 @@ static MYSQL_SYSVAR_LONG(open_files, innobase_open_files,
 static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds,
   PLUGIN_VAR_RQCMDARG,
   "Count of spin-loop rounds in InnoDB mutexes (30 by default)",
-  NULL, NULL, 30L, 0L, ~0L, 0);
+  NULL, NULL, 30L, 0L, ~0UL, 0);
 
 static MYSQL_SYSVAR_ULONG(spin_wait_delay, srv_spin_wait_delay,
   PLUGIN_VAR_OPCMDARG,
   "Maximum delay between polling for a spin lock (6 by default)",
-  NULL, NULL, 6L, 0L, ~0L, 0);
+  NULL, NULL, 6L, 0L, ~0UL, 0);
 
 static MYSQL_SYSVAR_ULONG(thread_concurrency, srv_thread_concurrency,
   PLUGIN_VAR_RQCMDARG,
   "Helps in performance tuning in heavily concurrent environments. Sets the maximum number of threads allowed inside InnoDB. Value 0 will disable the thread throttling.",
   NULL, NULL, 0, 0, 1000, 0);
 
+#ifdef HAVE_ATOMIC_BUILTINS
+static MYSQL_SYSVAR_ULONG(
+  adaptive_max_sleep_delay, srv_adaptive_max_sleep_delay,
+  PLUGIN_VAR_RQCMDARG,
+  "The upper limit of the sleep delay in usec. Value of 0 disables it.",
+  NULL, NULL,
+  150000,			/* Default setting */
+  0,				/* Minimum value */
+  1000000, 0);			/* Maximum value */
+#endif /* HAVE_ATOMIC_BUILTINS */
+
 static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay,
   PLUGIN_VAR_RQCMDARG,
-  "Time of innodb thread sleeping before joining InnoDB queue (usec). Value 0 disable a sleep",
-  NULL, NULL, 10000L, 0L, ~0L, 0);
+  "Time of innodb thread sleeping before joining InnoDB queue (usec). "
+  "Value 0 disable a sleep",
+  NULL, NULL,
+  10000L,
+  0L,
+  ~0UL, 0);
 
 static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Path to individual files and their sizes.",
   NULL, NULL, NULL);
 
+static MYSQL_SYSVAR_STR(undo_directory, srv_undo_dir,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Directory where undo tablespace files live, this path can be absolute.",
+  NULL, NULL, ".");
+
+static MYSQL_SYSVAR_ULONG(undo_tablespaces, srv_undo_tablespaces,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Number of undo tablespaces to use. ",
+  NULL, NULL,
+  0L,			/* Default seting */
+  0L,			/* Minimum value */
+  126L, 0);		/* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(undo_logs, srv_undo_logs,
+  PLUGIN_VAR_OPCMDARG,
+  "Number of undo logs to use.",
+  NULL, NULL,
+  TRX_SYS_N_RSEGS,	/* Default setting */
+  1,			/* Minimum value */
+  TRX_SYS_N_RSEGS, 0);	/* Maximum value */
+
+/* Alias for innodb_undo_logs, this config variable is deprecated. */
+static MYSQL_SYSVAR_ULONG(rollback_segments, srv_undo_logs,
+  PLUGIN_VAR_OPCMDARG,
+  "Number of undo logs to use (deprecated).",
+  NULL, NULL,
+  TRX_SYS_N_RSEGS,	/* Default setting */
+  1,			/* Minimum value */
+  TRX_SYS_N_RSEGS, 0);	/* Maximum value */
+
 static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "The AUTOINC lock modes supported by InnoDB:               "
@@ -11663,6 +14871,8 @@ static MYSQL_SYSVAR_STR(version, innodb_version_str,
 
 static MYSQL_SYSVAR_BOOL(use_sys_malloc, srv_use_sys_malloc,
   PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "DEPRECATED. This option may be removed in future releases, "
+  "together with the InnoDB's internal memory allocator. "
   "Use OS memory allocator instead of InnoDB's internal memory allocator",
   NULL, NULL, TRUE);
 
@@ -11678,6 +14888,14 @@ static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering,
   innodb_change_buffering_validate,
   innodb_change_buffering_update, "all");
 
+static MYSQL_SYSVAR_UINT(change_buffer_max_size,
+  innobase_change_buffer_max_size,
+  PLUGIN_VAR_RQCMDARG,
+  "Maximum on-disk size of change buffer in terms of percentage"
+  " of the buffer pool.",
+  NULL, innodb_change_buffer_max_size_update,
+  CHANGE_BUFFER_DEFAULT_SIZE, 0, 50, 0);
+
 static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method,
    PLUGIN_VAR_RQCMDARG,
   "Specifies how InnoDB index statistics collection code should "
@@ -11703,6 +14921,35 @@ static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold,
   "trigger a readahead.",
   NULL, NULL, 56, 0, 64, 0);
 
+static MYSQL_SYSVAR_STR(monitor_enable, innobase_enable_monitor_counter,
+  PLUGIN_VAR_RQCMDARG,
+  "Turn on a monitor counter",
+  innodb_monitor_validate,
+  innodb_enable_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_disable, innobase_disable_monitor_counter,
+  PLUGIN_VAR_RQCMDARG,
+  "Turn off a monitor counter",
+  innodb_monitor_validate,
+  innodb_disable_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_reset, innobase_reset_monitor_counter,
+  PLUGIN_VAR_RQCMDARG,
+  "Reset a monitor counter",
+  innodb_monitor_validate,
+  innodb_reset_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_reset_all, innobase_reset_all_monitor_counter,
+  PLUGIN_VAR_RQCMDARG,
+  "Reset all values for a monitor counter",
+  innodb_monitor_validate,
+  innodb_reset_all_monitor_update, NULL);
+
+static MYSQL_SYSVAR_BOOL(print_all_deadlocks, srv_print_all_deadlocks,
+  PLUGIN_VAR_OPCMDARG,
+  "Print all deadlocks to MySQL error log (off by default)",
+  NULL, NULL, FALSE);
+
 #ifdef UNIV_DEBUG_never
 static MYSQL_SYSVAR_UINT(trx_rseg_n_slots_debug, trx_rseg_n_slots_debug,
   PLUGIN_VAR_RQCMDARG,
@@ -11715,6 +14962,15 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(autoextend_increment),
   MYSQL_SYSVAR(buffer_pool_size),
   MYSQL_SYSVAR(buffer_pool_instances),
+  MYSQL_SYSVAR(buffer_pool_filename),
+  MYSQL_SYSVAR(buffer_pool_dump_now),
+  MYSQL_SYSVAR(buffer_pool_dump_at_shutdown),
+  MYSQL_SYSVAR(buffer_pool_load_now),
+  MYSQL_SYSVAR(buffer_pool_load_abort),
+  MYSQL_SYSVAR(buffer_pool_load_at_startup),
+  MYSQL_SYSVAR(lru_scan_depth),
+  MYSQL_SYSVAR(flush_neighbors),
+  MYSQL_SYSVAR(checksum_algorithm),
   MYSQL_SYSVAR(checksums),
   MYSQL_SYSVAR(commit_concurrency),
   MYSQL_SYSVAR(concurrency_tickets),
@@ -11732,6 +14988,12 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(flush_log_at_trx_commit),
   MYSQL_SYSVAR(flush_method),
   MYSQL_SYSVAR(force_recovery),
+  MYSQL_SYSVAR(ft_cache_size),
+  MYSQL_SYSVAR(ft_enable_stopword),
+  MYSQL_SYSVAR(ft_max_token_size),
+  MYSQL_SYSVAR(ft_min_token_size),
+  MYSQL_SYSVAR(ft_num_word_optimize),
+  MYSQL_SYSVAR(ft_sort_pll_degree),
   MYSQL_SYSVAR(large_prefix),
   MYSQL_SYSVAR(force_load_corrupted),
   MYSQL_SYSVAR(locks_unsafe_for_binlog),
@@ -11740,6 +15002,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(log_arch_dir),
   MYSQL_SYSVAR(log_archive),
 #endif /* UNIV_LOG_ARCHIVE */
+  MYSQL_SYSVAR(page_size),
   MYSQL_SYSVAR(log_buffer_size),
   MYSQL_SYSVAR(log_file_size),
   MYSQL_SYSVAR(log_files_in_group),
@@ -11747,45 +15010,73 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(max_dirty_pages_pct),
   MYSQL_SYSVAR(adaptive_flushing),
   MYSQL_SYSVAR(max_purge_lag),
+  MYSQL_SYSVAR(max_purge_lag_delay),
   MYSQL_SYSVAR(mirrored_log_groups),
   MYSQL_SYSVAR(old_blocks_pct),
   MYSQL_SYSVAR(old_blocks_time),
   MYSQL_SYSVAR(open_files),
+  MYSQL_SYSVAR(optimize_fulltext_only),
   MYSQL_SYSVAR(rollback_on_timeout),
+  MYSQL_SYSVAR(ft_aux_table),
+  MYSQL_SYSVAR(ft_enable_diag_print),
+  MYSQL_SYSVAR(ft_server_stopword_table),
+  MYSQL_SYSVAR(ft_user_stopword_table),
+  MYSQL_SYSVAR(disable_sort_file_cache),
   MYSQL_SYSVAR(stats_on_metadata),
   MYSQL_SYSVAR(stats_sample_pages),
+  MYSQL_SYSVAR(stats_transient_sample_pages),
+  MYSQL_SYSVAR(stats_persistent_sample_pages),
   MYSQL_SYSVAR(adaptive_hash_index),
   MYSQL_SYSVAR(stats_method),
   MYSQL_SYSVAR(replication_delay),
   MYSQL_SYSVAR(status_file),
   MYSQL_SYSVAR(strict_mode),
   MYSQL_SYSVAR(support_xa),
+  MYSQL_SYSVAR(sort_buffer_size),
+  MYSQL_SYSVAR(analyze_is_persistent),
   MYSQL_SYSVAR(sync_spin_loops),
   MYSQL_SYSVAR(spin_wait_delay),
   MYSQL_SYSVAR(table_locks),
   MYSQL_SYSVAR(thread_concurrency),
+#ifdef HAVE_ATOMIC_BUILTINS
+  MYSQL_SYSVAR(adaptive_max_sleep_delay),
+#endif /* HAVE_ATOMIC_BUILTINS */
   MYSQL_SYSVAR(thread_sleep_delay),
   MYSQL_SYSVAR(autoinc_lock_mode),
   MYSQL_SYSVAR(version),
   MYSQL_SYSVAR(use_sys_malloc),
   MYSQL_SYSVAR(use_native_aio),
   MYSQL_SYSVAR(change_buffering),
+  MYSQL_SYSVAR(change_buffer_max_size),
 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
   MYSQL_SYSVAR(change_buffering_debug),
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
   MYSQL_SYSVAR(random_read_ahead),
   MYSQL_SYSVAR(read_ahead_threshold),
   MYSQL_SYSVAR(io_capacity),
+  MYSQL_SYSVAR(monitor_enable),
+  MYSQL_SYSVAR(monitor_disable),
+  MYSQL_SYSVAR(monitor_reset),
+  MYSQL_SYSVAR(monitor_reset_all),
   MYSQL_SYSVAR(purge_threads),
   MYSQL_SYSVAR(purge_batch_size),
+#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
+  MYSQL_SYSVAR(page_hash_locks),
+  MYSQL_SYSVAR(doublewrite_batch_size),
+#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
+  MYSQL_SYSVAR(print_all_deadlocks),
+  MYSQL_SYSVAR(undo_logs),
   MYSQL_SYSVAR(rollback_segments),
+  MYSQL_SYSVAR(undo_directory),
+  MYSQL_SYSVAR(undo_tablespaces),
+  MYSQL_SYSVAR(sync_array_size),
 #ifdef UNIV_DEBUG_never /* disable this flag. --innodb-trx becomes ambiguous */
   MYSQL_SYSVAR(trx_rseg_n_slots_debug),
 #endif /* UNIV_DEBUG */
   NULL
 };
 
-mysql_declare_plugin(innobase)
+maria_declare_plugin(innobase)
 {
   MYSQL_STORAGE_ENGINE_PLUGIN,
   &innobase_storage_engine,
@@ -11798,8 +15089,8 @@ mysql_declare_plugin(innobase)
   INNODB_VERSION_SHORT,
   innodb_status_variables_export,/* status variables             */
   innobase_system_variables, /* system variables */
-  NULL, /* reserved */
-  0,    /* flags */
+  INNODB_VERSION_STR,         /* string version */
+  MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
 },
 i_s_innodb_trx,
 i_s_innodb_locks,
@@ -11807,8 +15098,27 @@ i_s_innodb_lock_waits,
 i_s_innodb_cmp,
 i_s_innodb_cmp_reset,
 i_s_innodb_cmpmem,
-i_s_innodb_cmpmem_reset
-mysql_declare_plugin_end;
+i_s_innodb_cmpmem_reset,
+i_s_innodb_buffer_page,
+i_s_innodb_buffer_page_lru,
+i_s_innodb_buffer_stats,
+i_s_innodb_metrics,
+i_s_innodb_ft_default_stopword,
+i_s_innodb_ft_inserted,
+i_s_innodb_ft_deleted,
+i_s_innodb_ft_being_deleted,
+i_s_innodb_ft_config,
+i_s_innodb_ft_index_cache,
+i_s_innodb_ft_index_table,
+i_s_innodb_sys_tables,
+i_s_innodb_sys_tablestats,
+i_s_innodb_sys_indexes,
+i_s_innodb_sys_columns,
+i_s_innodb_sys_fields,
+i_s_innodb_sys_foreign,
+i_s_innodb_sys_foreign_cols
+
+maria_declare_plugin_end;
 
 /** @brief Initialize the default value of innodb_commit_concurrency.
 
@@ -11821,13 +15131,28 @@ to 0, even if it was initially set to nonzero at the command line
 or configuration file. */
 static
 void
-innobase_commit_concurrency_init_default(void)
-/*==========================================*/
+innobase_commit_concurrency_init_default()
+/*======================================*/
 {
 	MYSQL_SYSVAR_NAME(commit_concurrency).def_val
 		= innobase_commit_concurrency;
 }
 
+/** @brief Initialize the default and max value of innodb_undo_logs.
+
+Once InnoDB is running, the default value and the max value of
+innodb_undo_logs must be equal to the available undo logs,
+given by srv_available_undo_logs. */
+static
+void
+innobase_undo_logs_init_default_max()
+/*=================================*/
+{
+	MYSQL_SYSVAR_NAME(undo_logs).max_val
+		= MYSQL_SYSVAR_NAME(undo_logs).def_val
+		= srv_available_undo_logs;
+}
+
 #ifdef UNIV_COMPILE_TEST_FUNCS
 
 typedef struct innobase_convert_name_test_struct {
@@ -11949,3 +15274,114 @@ test_innobase_convert_name()
 }
 
 #endif /* UNIV_COMPILE_TEST_FUNCS */
+
+/****************************************************************************
+ * DS-MRR implementation
+ ***************************************************************************/
+
+/**
+ * Multi Range Read interface, DS-MRR calls
+ */
+
+int
+ha_innobase::multi_range_read_init(
+	RANGE_SEQ_IF*	seq,
+	void*		seq_init_param,
+	uint		n_ranges,
+	uint		mode,
+	HANDLER_BUFFER*	buf)
+{
+	return(ds_mrr.dsmrr_init(this, seq, seq_init_param,
+				 n_ranges, mode, buf));
+}
+
+int
+ha_innobase::multi_range_read_next(
+	range_id_t *range_info)
+{
+	return(ds_mrr.dsmrr_next(range_info));
+}
+
+ha_rows
+ha_innobase::multi_range_read_info_const(
+	uint		keyno,
+	RANGE_SEQ_IF*	seq,
+	void*		seq_init_param,
+	uint		n_ranges,
+	uint*		bufsz,
+	uint*		flags,
+	Cost_estimate*	cost)
+{
+	/* See comments in ha_myisam::multi_range_read_info_const */
+	ds_mrr.init(this, table);
+	return(ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param,
+				       n_ranges, bufsz, flags, cost));
+}
+
+ha_rows
+ha_innobase::multi_range_read_info(
+	uint		keyno,
+	uint		n_ranges,
+	uint		keys,
+        uint            key_parts,
+	uint*		bufsz,
+	uint*		flags,
+	Cost_estimate*	cost)
+{
+	ds_mrr.init(this, table);
+	return(ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz,
+                                 flags, cost));
+}
+
+
+/**
+ * Index Condition Pushdown interface implementation
+ */
+
+/*************************************************************//**
+InnoDB index push-down condition check
+@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
+UNIV_INTERN
+enum icp_result
+innobase_index_cond(
+/*================*/
+	void*	file)	/*!< in/out: pointer to ha_innobase */
+{
+	DBUG_ENTER("innobase_index_cond");
+
+	ha_innobase*	h = reinterpret_cast<class ha_innobase*>(file);
+
+	DBUG_ASSERT(h->pushed_idx_cond);
+	DBUG_ASSERT(h->pushed_idx_cond_keyno != MAX_KEY);
+
+	if (h->end_range && h->compare_key2(h->end_range) > 0) {
+
+		/* caller should return HA_ERR_END_OF_FILE already */
+		DBUG_RETURN(ICP_OUT_OF_RANGE);
+	}
+
+	DBUG_RETURN(h->pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH);
+}
+
+/** Attempt to push down an index condition.
+* @param[in] keyno	MySQL key number
+* @param[in] idx_cond	Index condition to be checked
+* @return Part of idx_cond which the handler will not evaluate
+*/
+UNIV_INTERN
+class Item*
+ha_innobase::idx_cond_push(
+	uint		keyno,
+	class Item*	idx_cond)
+{
+	DBUG_ENTER("ha_innobase::idx_cond_push");
+	DBUG_ASSERT(keyno != MAX_KEY);
+	DBUG_ASSERT(idx_cond != NULL);
+
+	pushed_idx_cond = idx_cond;
+	pushed_idx_cond_keyno = keyno;
+	in_range_check_pushed_down = TRUE;
+	/* We will evaluate the condition entirely */
+	DBUG_RETURN(NULL);
+}
+
diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h
index 7ef3e954636..cbb2f9e7e0e 100644
--- a/storage/innobase/handler/ha_innodb.h
+++ b/storage/innobase/handler/ha_innodb.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -23,9 +23,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA
   Innodb
 */
 
-#ifdef USE_PRAGMA_INTERFACE
-#pragma interface			/* gcc class implementation */
-#endif
+#include "dict0stats.h"
 
 /* Structure defines translation table between mysql index and innodb
 index structures */
@@ -110,7 +108,7 @@ class ha_innobase: public handler
 	ulint innobase_update_autoinc(ulonglong	auto_inc);
 	void innobase_initialize_autoinc();
 	dict_index_t* innobase_get_index(uint keynr);
-	int info_low(uint flag, bool called_from_analyze);
+	int info_low(uint flag, dict_stats_upd_option_t stats_upd_option);
 
 	/* Init values for the class: */
  public:
@@ -133,7 +131,6 @@ class ha_innobase: public handler
 	const key_map* keys_to_use_for_scanning();
 
 	int open(const char *name, int mode, uint test_if_locked);
-	handler* clone(const char *name, MEM_ROOT *mem_root);
 	int close(void);
 	double scan_time();
 	double read_time(uint index, uint ranges, ha_rows rows);
@@ -163,13 +160,18 @@ class ha_innobase: public handler
 	int rnd_next(uchar *buf);
 	int rnd_pos(uchar * buf, uchar *pos);
 
+	int ft_init();
+	void ft_end();
+	FT_INFO *ft_init_ext(uint flags, uint inx, String* key);
+	int ft_read(uchar* buf);
+
 	void position(const uchar *record);
 	int info(uint);
 	int analyze(THD* thd,HA_CHECK_OPT* check_opt);
 	int optimize(THD* thd,HA_CHECK_OPT* check_opt);
 	int discard_or_import_tablespace(my_bool discard);
 	int extra(enum ha_extra_function operation);
-        int reset();
+	int reset();
 	int external_lock(THD *thd, int lock_type);
 	int transactional_table_lock(THD *thd, int lock_type);
 	int start_stmt(THD *thd, thr_lock_type lock_type);
@@ -203,7 +205,7 @@ class ha_innobase: public handler
 	int reset_auto_increment(ulonglong value);
 
 	virtual bool get_error_message(int error, String *buf);
-
+	virtual bool get_foreign_dup_key(char*, uint, char*, uint);
 	uint8 table_cache_type();
 	/*
 	  ask handler about permission to cache table during query registration
@@ -212,7 +214,7 @@ class ha_innobase: public handler
 					   uint key_length,
 					   qc_engine_callback *call_back,
 					   ulonglong *engine_data);
-	static char *get_mysql_bin_log_name();
+	static const char *get_mysql_bin_log_name();
 	static ulonglong get_mysql_bin_log_pos();
 	bool primary_key_is_clustered();
 	int cmp_ref(const uchar *ref1, const uchar *ref2);
@@ -226,6 +228,77 @@ class ha_innobase: public handler
 	/** @} */
 	bool check_if_incompatible_data(HA_CREATE_INFO *info,
 					uint table_changes);
+private:
+	/** Builds a 'template' to the prebuilt struct.
+
+	The template is used in fast retrieval of just those column
+	values MySQL needs in its processing.
+	@param whole_row true if access is needed to a whole row,
+	false if accessing individual fields is enough */
+	void build_template(bool whole_row);
+	/** Resets a query execution 'template'.
+	@see build_template() */
+	inline void reset_template();
+
+public:
+	/** @name Multi Range Read interface @{ */
+	/** Initialize multi range read @see DsMrr_impl::dsmrr_init
+	* @param seq
+	* @param seq_init_param
+	* @param n_ranges
+	* @param mode
+	* @param buf
+	*/
+	int multi_range_read_init(RANGE_SEQ_IF* seq,
+				  void* seq_init_param,
+				  uint n_ranges, uint mode,
+				  HANDLER_BUFFER* buf);
+	/** Process next multi range read @see DsMrr_impl::dsmrr_next
+	* @param range_info
+	*/
+	int multi_range_read_next(range_id_t *range_info);
+	/** Initialize multi range read and get information.
+	* @see ha_myisam::multi_range_read_info_const
+	* @see DsMrr_impl::dsmrr_info_const
+	* @param keyno
+	* @param seq
+	* @param seq_init_param
+	* @param n_ranges
+	* @param bufsz
+	* @param flags
+	* @param cost
+	*/
+	ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF* seq,
+					   void* seq_init_param,
+					   uint n_ranges, uint* bufsz,
+					   uint* flags, Cost_estimate* cost);
+	/** Initialize multi range read and get information.
+	* @see DsMrr_impl::dsmrr_info
+	* @param keyno
+	* @param seq
+	* @param seq_init_param
+	* @param n_ranges
+	* @param bufsz
+	* @param flags
+	* @param cost
+	*/
+	ha_rows multi_range_read_info(uint keyno,
+                                      uint n_ranges, uint keys,
+                                      uint key_parts,
+				      uint* bufsz, uint* mrr_mode,
+				      Cost_estimate* cost);
+
+	/** Attempt to push down an index condition.
+	* @param[in] keyno	MySQL key number
+	* @param[in] idx_cond	Index condition to be checked
+	* @return idx_cond if pushed; NULL if not pushed
+	*/
+	class Item* idx_cond_push(uint keyno, class Item* idx_cond);
+
+private:
+	/** The multi range read session object */
+	DsMrr_impl ds_mrr;
+	/* @} */
 };
 
 /* Some accessor functions which the InnoDB plugin needs, but which
@@ -236,9 +309,11 @@ the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */
 #error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
 #endif
 
+LEX_STRING* thd_query_string(MYSQL_THD thd);
+
 extern "C" {
+
 struct charset_info_st *thd_charset(MYSQL_THD thd);
-LEX_STRING *thd_query_string(MYSQL_THD thd);
 
 /**
   Check if a user thread is a replication slave thread
@@ -284,7 +359,8 @@ bool thd_binlog_filter_ok(const MYSQL_THD thd);
   @return 1 the query may generate row changes, 0 otherwise.
 */
 bool thd_sqlcom_can_generate_row_events(const MYSQL_THD thd);
-}
+
+} /* extern "C" */
 
 /** Get the file name and position of the MySQL binlog corresponding to the
  * current commit.
@@ -292,13 +368,23 @@ bool thd_sqlcom_can_generate_row_events(const MYSQL_THD thd);
 extern void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file);
 
 typedef struct trx_struct trx_t;
+
+extern const struct _ft_vft ft_vft_result;
+
+/* Structure Returned by ha_innobase::ft_init_ext() */
+typedef struct new_ft_info
+{
+	struct _ft_vft		*please;
+	row_prebuilt_t*		ft_prebuilt;
+	fts_result_t*		ft_result;
+} NEW_FT_INFO;
+
 /********************************************************************//**
 @file handler/ha_innodb.h
 Converts an InnoDB error code to a MySQL error code and also tells to MySQL
 about a possible transaction rollback inside InnoDB caused by a lock wait
 timeout or a deadlock.
 @return	MySQL error code */
-extern "C"
 int
 convert_error_code_to_mysql(
 /*========================*/
@@ -309,20 +395,17 @@ convert_error_code_to_mysql(
 /*********************************************************************//**
 Allocates an InnoDB transaction for a MySQL handler object.
 @return	InnoDB transaction handle */
-extern "C"
 trx_t*
 innobase_trx_allocate(
 /*==================*/
 	MYSQL_THD	thd);	/*!< in: user thread handle */
 
-
 /*********************************************************************//**
 This function checks each index name for a table against reserved
 system default primary index name 'GEN_CLUST_INDEX'. If a name
 matches, this function pushes an warning message to the client,
 and returns true.
 @return true if the index name matches the reserved name */
-extern "C"
 bool
 innobase_index_name_is_reserved(
 /*============================*/
@@ -330,4 +413,79 @@ innobase_index_name_is_reserved(
 	const KEY*	key_info,	/*!< in: Indexes to be created */
 	ulint		num_of_keys);	/*!< in: Number of indexes to
 					be created. */
+/*********************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_retrieve_ranking(
+/*==========================*/
+	FT_INFO*	fts_hdl);	/*!< in: FTS handler */
 
+/*********************************************************************//**
+Find and Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_find_ranking(
+/*==========================*/
+	FT_INFO*	fts_hdl,	/*!< in: FTS handler */
+	uchar*		record,		/*!< in: Unused */
+	uint		len);		/*!< in: Unused */
+/*********************************************************************//**
+Free the memory for the FTS handler */
+UNIV_INTERN
+void
+innobase_fts_close_ranking(
+/*==========================*/
+	FT_INFO*	fts_hdl);	/*!< in: FTS handler */
+/*********************************************************************//**
+Free the memory for the FTS handler */
+void
+innobase_fts_close_ranking(
+/*==========================*/
+	FT_INFO*	fts_hdl);	/*!< in: FTS handler */
+/*****************************************************************//**
+Initialize the table FTS stopword list
+@return TRUE is succeed */
+UNIV_INTERN
+ibool
+innobase_fts_load_stopword(
+/*=======================*/
+	dict_table_t*	table,		/*!< in: Table has the FTS */
+	trx_t*		trx,		/*!< in: transaction */
+	THD*		thd);		/*!< in: current thread */
+
+/** Some defines for innobase_fts_check_doc_id_index() return value */
+enum fts_doc_id_index_enum {
+	FTS_INCORRECT_DOC_ID_INDEX,
+	FTS_EXIST_DOC_ID_INDEX,
+	FTS_NOT_EXIST_DOC_ID_INDEX
+};
+
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column.
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index(
+/*============================*/
+	dict_table_t*	table,		/*!< in: table definition */
+	ulint*		fts_doc_col_no);/*!< out: The column number for
+					Doc ID */
+
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column in MySQL create index definition.
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index_in_def(
+/*===================================*/
+	ulint		n_key,		/*!< in: Number of keys */
+	KEY*		key_info);	/*!< in: Key definition */
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
index 7bb370a8dc4..32501299630 100644
--- a/storage/innobase/handler/handler0alter.cc
+++ b/storage/innobase/handler/handler0alter.cc
@@ -26,7 +26,7 @@ Smart ALTER TABLE
 #include <sql_lex.h>                            // SQLCOM_CREATE_INDEX
 #include <innodb_priv.h>
 
-extern "C" {
+#include "dict0stats.h"
 #include "log0log.h"
 #include "row0merge.h"
 #include "srv0srv.h"
@@ -34,7 +34,8 @@ extern "C" {
 #include "trx0roll.h"
 #include "ha_prototypes.h"
 #include "handler0alter.h"
-}
+#include "srv0mon.h"
+#include "fts0priv.h"
 
 #include "ha_innodb.h"
 
@@ -128,7 +129,7 @@ innobase_col_to_mysql(
 
 /*************************************************************//**
 Copies an InnoDB record to table->record[0]. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_rec_to_mysql(
 /*==================*/
@@ -141,7 +142,9 @@ innobase_rec_to_mysql(
 	uint	n_fields	= table->s->fields;
 	uint	i;
 
-	ut_ad(n_fields == dict_table_get_n_user_cols(index->table));
+	ut_ad(n_fields == dict_table_get_n_user_cols(index->table)
+	      || (DICT_TF2_FLAG_IS_SET(index->table, DICT_TF2_FTS_HAS_DOC_ID)
+		  && n_fields + 1 == dict_table_get_n_user_cols(index->table)));
 
 	for (i = 0; i < n_fields; i++) {
 		Field*		field	= table->field[i];
@@ -178,7 +181,7 @@ null_field:
 
 /*************************************************************//**
 Resets table->record[0]. */
-extern "C" UNIV_INTERN
+UNIV_INTERN
 void
 innobase_rec_reset(
 /*===============*/
@@ -362,7 +365,7 @@ innobase_create_index_field_def(
 		&& field->type() != MYSQL_TYPE_VARCHAR)
 	    || (field->type() == MYSQL_TYPE_VARCHAR
 		&& key_part->length < field->pack_length()
-			- ((Field_varstring*)field)->length_bytes)) {
+			- ((Field_varstring*) field)->length_bytes)) {
 
 		index_field->prefix_len = key_part->length;
 	} else {
@@ -416,6 +419,10 @@ innobase_create_index_def(
 		index->ind_type |= DICT_UNIQUE;
 	}
 
+	if (key->flags & HA_FULLTEXT) {
+		index->ind_type |= DICT_FTS;
+	}
+
 	if (key_primary) {
 		index->ind_type |= DICT_CLUSTERED;
 	}
@@ -486,6 +493,143 @@ innobase_copy_index_def(
 }
 
 /*******************************************************************//**
+Check whether the table has the FTS_DOC_ID column
+@return TRUE if there exists the FTS_DOC_ID column, if TRUE but fts_doc_col_no
+        equal to ULINT_UNDEFINED then that means the column exists but is not
+	of the right type. */
+static
+ibool
+innobase_fts_check_doc_id_col(
+/*==========================*/
+	dict_table_t*	table,		/*!< in: table with FTS index */
+	ulint*		fts_doc_col_no)	/*!< out: The column number for
+					Doc ID */
+{
+	*fts_doc_col_no = ULINT_UNDEFINED;
+
+	for (ulint i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) {
+		const char*     name = dict_table_get_col_name(table, i);
+
+		if (strcmp(name, FTS_DOC_ID_COL_NAME) == 0) {
+			const dict_col_t*       col;
+
+			col = dict_table_get_nth_col(table, i);
+
+			if (col->mtype != DATA_INT || col->len != 8) {
+				fprintf(stderr,
+					" InnoDB: %s column in table %s"
+					" must be of the BIGINT datatype\n",
+					FTS_DOC_ID_COL_NAME, table->name);
+			} else if (!(col->prtype & DATA_NOT_NULL)) {
+				fprintf(stderr,
+					" InnoDB: %s column in table %s"
+					" must be NOT NULL\n",
+					FTS_DOC_ID_COL_NAME, table->name);
+
+			} else if (!(col->prtype & DATA_UNSIGNED)) {
+				fprintf(stderr,
+					" InnoDB: %s column in table %s"
+					" must be UNSIGNED\n",
+					FTS_DOC_ID_COL_NAME, table->name);
+			} else {
+				*fts_doc_col_no = i;
+			}
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column.
+@return	FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index(
+/*============================*/
+	dict_table_t*	table,		/*!< in: table definition */
+	ulint*		fts_doc_col_no)	/*!< out: The column number for
+					Doc ID */
+{
+	dict_index_t*	index;
+	dict_field_t*	field;
+
+	for (index = dict_table_get_first_index(table);
+	     index; index = dict_table_get_next_index(index)) {
+
+		/* Check if there exists a unique index with the name of
+		FTS_DOC_ID_INDEX_NAME */
+		if (innobase_strcasecmp(index->name, FTS_DOC_ID_INDEX_NAME)) {
+			continue;
+		}
+
+		if (!dict_index_is_unique(index)
+		    || strcmp(index->name, FTS_DOC_ID_INDEX_NAME)) {
+			return(FTS_INCORRECT_DOC_ID_INDEX);
+		}
+
+		/* Check whether the index has FTS_DOC_ID as its
+		first column */
+		field = dict_index_get_nth_field(index, 0);
+
+		/* The column would be of a BIGINT data type */
+		if (strcmp(field->name, FTS_DOC_ID_COL_NAME) == 0
+		    && field->col->mtype == DATA_INT
+		    && field->col->len == 8
+		    && field->col->prtype & DATA_NOT_NULL) {
+			if (fts_doc_col_no) {
+				*fts_doc_col_no = dict_col_get_no(field->col);
+			}
+			return(FTS_EXIST_DOC_ID_INDEX);
+		} else {
+			return(FTS_INCORRECT_DOC_ID_INDEX);
+		}
+
+	}
+
+	/* Not found */
+	return(FTS_NOT_EXIST_DOC_ID_INDEX);
+}
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column in MySQL create index definition.
+@return	FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index_in_def(
+/*===================================*/
+	ulint		n_key,		/*!< in: Number of keys */
+	KEY *		key_info)	/*!< in: Key definition */
+{
+	/* Check whether there is a "FTS_DOC_ID_INDEX" in the to be built index
+	list */
+	for (ulint j = 0; j < n_key; j++) {
+		KEY*    key = &key_info[j];
+
+		if (innobase_strcasecmp(key->name, FTS_DOC_ID_INDEX_NAME)) {
+			continue;
+		}
+
+		/* Do a check on FTS DOC ID_INDEX, it must be unique,
+		named as "FTS_DOC_ID_INDEX" and on column "FTS_DOC_ID" */
+		if (!(key->flags & HA_NOSAME)
+		    || strcmp(key->name, FTS_DOC_ID_INDEX_NAME)
+		    || strcmp(key->key_part[0].field->field_name,
+			     FTS_DOC_ID_COL_NAME)) {
+			return(FTS_INCORRECT_DOC_ID_INDEX);
+	       }
+
+		return(FTS_EXIST_DOC_ID_INDEX);
+        }
+
+	return(FTS_NOT_EXIST_DOC_ID_INDEX);
+}
+/*******************************************************************//**
 Create an index table where indexes are ordered as follows:
 
 IF a new primary key is defined for the table THEN
@@ -507,12 +651,17 @@ merge_index_def_t*
 innobase_create_key_def(
 /*====================*/
 	trx_t*		trx,		/*!< in: trx */
-	const dict_table_t*table,		/*!< in: table definition */
+	dict_table_t*	table,		/*!< in: table definition */
 	mem_heap_t*	heap,		/*!< in: heap where space for key
 					definitions are allocated */
 	KEY*		key_info,	/*!< in: Indexes to be created */
-	ulint&		n_keys)		/*!< in/out: Number of indexes to
+	ulint&		n_keys,		/*!< in/out: Number of indexes to
 					be created */
+	ulint*		num_fts_index,	/*!< out: Number of FTS indexes */
+	ibool*		add_fts_doc_id,	/*!< out: Whether we need to add
+					new DOC ID column for FTS index */
+	ibool*		add_fts_doc_id_idx)/*!< out: Whether we need to add
+					new index on DOC ID column */
 {
 	ulint			i = 0;
 	merge_index_def_t*	indexdef;
@@ -525,6 +674,9 @@ innobase_create_key_def(
 		mem_heap_alloc(heap, sizeof *indexdef
 			       * (n_keys + UT_LIST_GET_LEN(table->indexes)));
 
+	*add_fts_doc_id = FALSE;
+	*add_fts_doc_id_idx = FALSE;
+
 	/* If there is a primary key, it is always the first index
 	defined for the table. */
 
@@ -552,12 +704,111 @@ innobase_create_key_def(
 		}
 	}
 
-	if (new_primary) {
+	/* Check whether any indexes in the create list are Full
+	Text Indexes*/
+	for (ulint j = 0; j < n_keys; j++) {
+		if (key_info[j].flags & HA_FULLTEXT) {
+			(*num_fts_index)++;
+		}
+	}
+
+	/* Check whether there is a "FTS_DOC_ID_INDEX" in the to be built index
+	list */
+	if (innobase_fts_check_doc_id_index_in_def(n_keys, key_info)
+	    == FTS_INCORRECT_DOC_ID_INDEX) {
+		push_warning_printf((THD*) trx->mysql_thd,
+				   Sql_condition::WARN_LEVEL_WARN,
+				   ER_WRONG_NAME_FOR_INDEX,
+				   " InnoDB: Index name %s is reserved"
+				   " for the unique index on"
+				   " FTS_DOC_ID column for FTS"
+				   " document ID indexing"
+				   " on table %s. Please check"
+				   " the index definition to"
+				   " make sure it is of correct"
+				   " type\n",
+				   FTS_DOC_ID_INDEX_NAME,
+				   table->name);
+	       DBUG_RETURN(NULL);
+	}
+
+	/* If we are to build an FTS index, check whether the table
+	already has a DOC ID column, if not, we will need to add a
+	Doc ID hidden column and rebuild the primary index */
+	if (*num_fts_index) {
+		enum fts_doc_id_index_enum	ret;
+		ibool				exists;
+		ulint				doc_col_no;
+		ulint				fts_doc_col_no;
+
+		exists = innobase_fts_check_doc_id_col(table, &fts_doc_col_no);
+
+		if (exists) {
+
+			if (fts_doc_col_no == ULINT_UNDEFINED) {
+
+				push_warning_printf(
+					(THD*) trx->mysql_thd,
+					Sql_condition::WARN_LEVEL_WARN,
+					ER_WRONG_COLUMN_NAME,
+					" InnoDB: There exists a column %s "
+					"in table %s, but it is the wrong "
+					"type. Create of FTS index failed.\n",
+					FTS_DOC_ID_COL_NAME, table->name);
+
+				DBUG_RETURN(NULL);
+
+			} else if (!table->fts) {
+				table->fts = fts_create(table);
+			}
+
+			table->fts->doc_col = fts_doc_col_no;
+
+		} else {
+			*add_fts_doc_id = TRUE;
+			*add_fts_doc_id_idx = TRUE;
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: Rebuild table %s to add "
+					"DOC_ID column\n", table->name);
+		}
+
+		ret = innobase_fts_check_doc_id_index(table, &doc_col_no);
+
+		switch (ret) {
+		case FTS_NOT_EXIST_DOC_ID_INDEX:
+			*add_fts_doc_id_idx = TRUE;
+			break;
+		case FTS_INCORRECT_DOC_ID_INDEX:
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: Index %s is used for FTS"
+					" Doc ID indexing on table %s, it is"
+					" now on the wrong column or of"
+					" wrong format. Please drop it.\n",
+					FTS_DOC_ID_INDEX_NAME, table->name);
+			DBUG_RETURN(NULL);
+
+		default:
+			ut_ad(ret == FTS_EXIST_DOC_ID_INDEX);
+
+			ut_ad(doc_col_no == fts_doc_col_no);
+		}
+	}
+
+	/* If DICT_TF2_FTS_ADD_DOC_ID is set, we will need to rebuild
+	the table to add the unique Doc ID column for FTS index. And
+	thus the primary index would required to be rebuilt. Copy all
+	the index definitions */
+	if (new_primary || *add_fts_doc_id) {
 		const dict_index_t*	index;
 
-		/* Create the PRIMARY key index definition */
-		innobase_create_index_def(&key_info[i++], TRUE, TRUE,
-					  indexdef++, heap);
+		if (new_primary) {
+			/* Create the PRIMARY key index definition */
+			innobase_create_index_def(&key_info[i++],
+						  TRUE, TRUE,
+						  indexdef++, heap);
+		}
 
 		row_mysql_lock_data_dictionary(trx);
 
@@ -568,17 +819,32 @@ innobase_create_key_def(
 		index or a PRIMARY KEY.  If the clustered index is a
 		UNIQUE INDEX, it must be converted to a secondary index. */
 
-		if (dict_index_get_nth_col(index, 0)->mtype == DATA_SYS
-		    || !my_strcasecmp(system_charset_info,
-				      index->name, "PRIMARY")) {
+		if (new_primary
+		    && (dict_index_get_nth_col(index, 0)->mtype
+			== DATA_SYS
+		        || !my_strcasecmp(system_charset_info,
+					  index->name, "PRIMARY"))) {
 			index = dict_table_get_next_index(index);
 		}
 
 		while (index) {
 			innobase_copy_index_def(index, indexdef++, heap);
+
+			if (new_primary && index->type & DICT_FTS) {
+				(*num_fts_index)++;
+			}
+
 			index = dict_table_get_next_index(index);
 		}
 
+		/* The primary index would be rebuilt if a FTS Doc ID
+		column is to be added, and the primary index definition
+		is just copied from old table and stored in indexdefs[0] */
+		if (*add_fts_doc_id) {
+			indexdefs[0].ind_type |= DICT_CLUSTERED;
+			DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_ADD_DOC_ID);
+		}
+
 		row_mysql_unlock_data_dictionary(trx);
 	}
 
@@ -653,13 +919,95 @@ public:
 };
 
 /*******************************************************************//**
+This is to create FTS_DOC_ID_INDEX definition on the newly added Doc ID for
+the FTS indexes table
+@return	dict_index_t for the FTS_DOC_ID_INDEX */
+dict_index_t*
+innobase_create_fts_doc_id_idx(
+/*===========================*/
+	dict_table_t*	indexed_table,	/*!< in: Table where indexes are
+					created */
+	trx_t*		trx,		/*!< in: Transaction */
+	mem_heap_t*     heap)		/*!< Heap for index definitions */
+{
+	dict_index_t*		index;
+	merge_index_def_t	fts_index_def;
+	char*			index_name;
+
+	/* Create the temp index name for FTS_DOC_ID_INDEX */
+	fts_index_def.name = index_name = (char*) mem_heap_alloc(
+		heap, FTS_DOC_ID_INDEX_NAME_LEN + 2);
+	*index_name++ = TEMP_INDEX_PREFIX;
+	memcpy(index_name, FTS_DOC_ID_INDEX_NAME,
+	       FTS_DOC_ID_INDEX_NAME_LEN);
+	index_name[FTS_DOC_ID_INDEX_NAME_LEN] = 0;
+
+	/* Only the Doc ID will be indexed */
+	fts_index_def.n_fields = 1;
+	fts_index_def.ind_type = DICT_UNIQUE;
+	fts_index_def.fields = (merge_index_field_t*) mem_heap_alloc(
+		heap, sizeof *fts_index_def.fields);
+	fts_index_def.fields[0].prefix_len = 0;
+	fts_index_def.fields[0].field_name = mem_heap_strdup(
+		heap, FTS_DOC_ID_COL_NAME);
+
+	index = row_merge_create_index(trx, indexed_table, &fts_index_def);
+	return(index);
+}
+
+/*******************************************************************//**
+Clean up on ha_innobase::add_index error. */
+static
+void
+innobase_add_index_cleanup(
+/*=======================*/
+	row_prebuilt_t*	prebuilt,		/*!< in/out: prebuilt */
+	trx_t*		trx,			/*!< in/out: transaction */
+	dict_table_t*	table)			/*!< in/out: table on which
+						the indexes were going to be
+						created */
+{
+	trx_rollback_to_savepoint(trx, NULL);
+
+	ut_a(trx != prebuilt->trx);
+
+	trx_free_for_mysql(trx);
+
+	trx_commit_for_mysql(prebuilt->trx);
+
+	if (table != NULL) {
+
+		rw_lock_x_lock(&dict_operation_lock);
+
+		dict_mutex_enter_for_mysql();
+
+		/* Note: This check excludes the system tables. However, we
+		should be safe because users cannot add indexes to system
+		tables. */
+
+		if (UT_LIST_GET_LEN(table->foreign_list) == 0
+		    && UT_LIST_GET_LEN(table->referenced_list) == 0
+		    && !table->can_be_evicted) {
+
+			dict_table_move_from_non_lru_to_lru(table);
+		}
+
+		dict_table_close(table, TRUE);
+
+		dict_mutex_exit_for_mysql();
+
+		rw_lock_x_unlock(&dict_operation_lock);
+	}
+}
+
+/*******************************************************************//**
 Create indexes.
 @return	0 or error number */
 UNIV_INTERN
 int
 ha_innobase::add_index(
 /*===================*/
-	TABLE*			table,		/*!< in: Table where indexes
+	TABLE*			in_table,	/*!< in: Table where indexes
 						are created */
 	KEY*			key_info,	/*!< in: Indexes
 						to be created */
@@ -667,16 +1015,21 @@ ha_innobase::add_index(
 						to be created */
 	handler_add_index**	add)		/*!< out: context */
 {
-	dict_index_t**	index;		/*!< Index to be created */
+	dict_index_t**	index = NULL;	/*!< Index to be created */
+	dict_index_t*	fts_index = NULL;/*!< FTS Index to be created */
 	dict_table_t*	indexed_table;	/*!< Table where indexes are created */
 	merge_index_def_t* index_defs;	/*!< Index definitions */
-	mem_heap_t*     heap;		/*!< Heap for index definitions */
+	mem_heap_t*     heap = NULL;	/*!< Heap for index definitions */
 	trx_t*		trx;		/*!< Transaction */
 	ulint		num_of_idx;
 	ulint		num_created	= 0;
 	ibool		dict_locked	= FALSE;
-	ulint		new_primary;
+	ulint		new_primary	= 0;
 	int		error;
+	ulint		num_fts_index	= 0;
+	ulint		num_idx_create	= 0;
+	ibool		fts_add_doc_id	= FALSE;
+	ibool		fts_add_doc_idx	= FALSE;
 
 	DBUG_ENTER("ha_innobase::add_index");
 	ut_a(table);
@@ -700,7 +1053,7 @@ ha_innobase::add_index(
 		DBUG_RETURN(-1);
 	}
 
-	indexed_table = dict_table_get(prebuilt->table->name, FALSE);
+	indexed_table = dict_table_open_on_name(prebuilt->table->name, FALSE);
 
 	if (UNIV_UNLIKELY(!indexed_table)) {
 		DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
@@ -712,16 +1065,22 @@ ha_innobase::add_index(
 	error = innobase_check_index_keys(key_info, num_of_keys, prebuilt->table);
 
 	if (UNIV_UNLIKELY(error)) {
+		dict_table_close(prebuilt->table, FALSE);
 		DBUG_RETURN(error);
 	}
 
 	/* Check each index's column length to make sure they do not
 	exceed limit */
 	for (ulint i = 0; i < num_of_keys; i++) {
+		if (key_info[i].flags & HA_FULLTEXT) {
+			continue;
+		}
+
 		error = innobase_check_column_length(prebuilt->table,
 						     &key_info[i]);
 
 		if (error) {
+			dict_table_close(prebuilt->table, FALSE);
 			DBUG_RETURN(error);
 		}
 	}
@@ -734,6 +1093,20 @@ ha_innobase::add_index(
 	trx = innobase_trx_allocate(user_thd);
 	trx_start_if_not_started(trx);
 
+	/* We don't want this table to be evicted from the cache while we
+	are building an index on it. Another issue is that while we are
+	building the index this table could be referred to in a foreign
+	key relationship. In innobase_add_index_cleanup() we check for
+	that condition before moving it back to the LRU list. */
+
+	row_mysql_lock_data_dictionary(trx);
+
+	if (prebuilt->table->can_be_evicted) {
+		dict_table_move_from_lru_to_non_lru(prebuilt->table);
+	}
+
+	row_mysql_unlock_data_dictionary(trx);
+
 	/* Create table containing all indexes to be built in this
 	alter table add index so that they are in the correct order
 	in the table. */
@@ -741,14 +1114,34 @@ ha_innobase::add_index(
 	num_of_idx = num_of_keys;
 
 	index_defs = innobase_create_key_def(
-		trx, prebuilt->table, heap, key_info, num_of_idx);
+		trx, prebuilt->table, heap, key_info, num_of_idx,
+		&num_fts_index, &fts_add_doc_id, &fts_add_doc_idx);
+
+	if (!index_defs) {
+		error = DB_UNSUPPORTED;
+		goto error_handling;
+	}
+
+	/* Currently, support create one single FULLTEXT index in parallel at
+	a time */
+	if (num_fts_index > 1) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Only support create ONE Fulltext index"
+			" at a time\n");
+		error = DB_UNSUPPORTED;
+		goto error_handling;
+	}
 
 	new_primary = DICT_CLUSTERED & index_defs[0].ind_type;
 
-	/* Allocate memory for dictionary index definitions */
+	/* If a new FTS Doc ID column is to be added, there will be
+	one additional index to be built on the Doc ID column itself. */
+	num_idx_create = (fts_add_doc_idx) ? num_of_idx + 1 : num_of_idx;
 
+	/* Allocate memory for dictionary index definitions */
 	index = (dict_index_t**) mem_heap_alloc(
-		heap, num_of_idx * sizeof *index);
+		heap, num_idx_create * sizeof *index);
 
 	/* Flag this transaction as a dictionary operation, so that
 	the data dictionary will be locked in crash recovery. */
@@ -776,8 +1169,9 @@ ha_innobase::add_index(
 
 	if (UNIV_UNLIKELY(new_primary)) {
 		/* This transaction should be the only one
-		operating on the table. */
-		ut_a(prebuilt->table->n_mysql_handles_opened == 1);
+		operating on the table. The table get above
+		would have incremented the ref count to 2. */
+		ut_a(prebuilt->table->n_ref_count == 2);
 
 		char*	new_table_name = innobase_create_temporary_tablename(
 			heap, '1', prebuilt->table->name);
@@ -806,11 +1200,12 @@ ha_innobase::add_index(
 
 			ut_d(dict_table_check_for_dup_indexes(prebuilt->table,
 							      FALSE));
-			mem_heap_free(heap);
-			trx_general_rollback_for_mysql(trx, NULL);
 			row_mysql_unlock_data_dictionary(trx);
-			trx_free_for_mysql(trx);
-			trx_commit_for_mysql(prebuilt->trx);
+			mem_heap_free(heap);
+
+			innobase_add_index_cleanup(
+				prebuilt, trx, prebuilt->table);
+
 			DBUG_RETURN(error);
 		}
 
@@ -828,6 +1223,40 @@ ha_innobase::add_index(
 			error = trx->error_state;
 			goto error_handling;
 		}
+
+		if (index[num_created]->type & DICT_FTS) {
+			fts_index = index[num_created];
+			fts_create_index_tables(trx, fts_index);
+
+		}
+	}
+
+	/* create FTS_DOC_ID_INDEX on the Doc ID column on the table */
+	if (fts_add_doc_idx) {
+		index[num_of_idx] = innobase_create_fts_doc_id_idx(
+					       indexed_table, trx, heap);
+		/* FTS_DOC_ID_INDEX is internal defined new index */
+		num_of_idx++;
+		num_created++;
+	}
+
+	if (num_fts_index) {
+		DICT_TF2_FLAG_SET(indexed_table, DICT_TF2_FTS);
+
+		if (!indexed_table->fts
+		    || ib_vector_size(indexed_table->fts->indexes) == 0) {
+			fts_create_common_tables(trx, indexed_table,
+						 prebuilt->table->name, TRUE);
+
+			indexed_table->fts->fts_status |= TABLE_DICT_LOCKED;
+			innobase_fts_load_stopword(
+				indexed_table, trx, ha_thd());
+			indexed_table->fts->fts_status &= ~TABLE_DICT_LOCKED;
+		}
+
+		if (new_primary && prebuilt->table->fts) {
+			indexed_table->fts->doc_col = prebuilt->table->fts->doc_col;
+		}
 	}
 
 	ut_ad(error == DB_SUCCESS);
@@ -843,8 +1272,7 @@ ha_innobase::add_index(
 	row_mysql_unlock_data_dictionary(trx);
 	dict_locked = FALSE;
 
-	ut_a(trx->n_active_thrs == 0);
-	ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
+	ut_a(trx->lock.n_active_thrs == 0);
 
 	if (UNIV_UNLIKELY(new_primary)) {
 		/* A primary key is to be built.  Acquire an exclusive
@@ -867,60 +1295,66 @@ ha_innobase::add_index(
 					index, num_of_idx, table);
 
 error_handling:
+
 	/* After an error, remove all those index definitions from the
 	dictionary which were defined. */
 
+	if (!dict_locked) {
+		row_mysql_lock_data_dictionary(trx);
+		dict_locked = TRUE;
+	}
+
 	switch (error) {
 	case DB_SUCCESS:
-		ut_a(!dict_locked);
-
-		ut_d(mutex_enter(&dict_sys->mutex));
 		ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
-		ut_d(mutex_exit(&dict_sys->mutex));
-                *add = new ha_innobase_add_index(table, key_info, num_of_keys,
-                                                 indexed_table);
+
+		*add = new ha_innobase_add_index(
+			table, key_info, num_of_keys, indexed_table);
+
+		dict_table_close(prebuilt->table, dict_locked);
 		break;
 
 	case DB_TOO_BIG_RECORD:
 		my_error(HA_ERR_TO_BIG_ROW, MYF(0));
-		goto error;
+		goto error_exit;
 	case DB_PRIMARY_KEY_IS_NULL:
 		my_error(ER_PRIMARY_CANT_HAVE_NULL, MYF(0));
 		/* fall through */
 	case DB_DUPLICATE_KEY:
-error:
+		if (fts_add_doc_idx
+		    && prebuilt->trx->error_key_num == num_of_idx - 1) {
+			prebuilt->trx->error_key_num = ULINT_UNDEFINED;
+		}
+error_exit:
 		prebuilt->trx->error_info = NULL;
 		/* fall through */
 	default:
+		dict_table_close(prebuilt->table, dict_locked);
+
 		trx->error_state = DB_SUCCESS;
 
 		if (new_primary) {
 			if (indexed_table != prebuilt->table) {
+				dict_table_close(indexed_table, dict_locked);
 				row_merge_drop_table(trx, indexed_table);
 			}
 		} else {
-			if (!dict_locked) {
-				row_mysql_lock_data_dictionary(trx);
-				dict_locked = TRUE;
-			}
-
 			row_merge_drop_indexes(trx, indexed_table,
 					       index, num_created);
 		}
 	}
 
+	ut_ad(!new_primary || prebuilt->table->n_ref_count == 1);
 	trx_commit_for_mysql(trx);
+	ut_ad(dict_locked);
+	row_mysql_unlock_data_dictionary(trx);
+	trx_free_for_mysql(trx);
+	mem_heap_free(heap);
+
 	if (prebuilt->trx) {
 		trx_commit_for_mysql(prebuilt->trx);
 	}
 
-	if (dict_locked) {
-		row_mysql_unlock_data_dictionary(trx);
-	}
-
-	trx_free_for_mysql(trx);
-	mem_heap_free(heap);
-
 	/* There might be work for utility threads.*/
 	srv_active_wake_master_thread();
 
@@ -982,9 +1416,12 @@ ha_innobase::final_add_index(
 				prebuilt->table, add->indexed_table,
 				tmp_name, trx);
 
+			ut_a(prebuilt->table->n_ref_count == 1);
+
 			switch (error) {
 			case DB_TABLESPACE_ALREADY_EXISTS:
 			case DB_DUPLICATE_KEY:
+				ut_a(add->indexed_table->n_ref_count == 0);
 				innobase_convert_tablename(tmp_name);
 				my_error(HA_ERR_TABLE_EXIST, MYF(0), tmp_name);
 				err = HA_ERR_TABLE_EXIST;
@@ -1000,6 +1437,7 @@ ha_innobase::final_add_index(
 		}
 
 		if (!commit || err) {
+			dict_table_close(add->indexed_table, TRUE);
 			error = row_merge_drop_table(trx, add->indexed_table);
 			trx_commit_for_mysql(prebuilt->trx);
 		} else {
@@ -1007,7 +1445,6 @@ ha_innobase::final_add_index(
 			trx_commit_for_mysql(prebuilt->trx);
 			row_prebuilt_free(prebuilt, TRUE);
 			error = row_merge_drop_table(trx, old_table);
-			add->indexed_table->n_mysql_handles_opened++;
 			prebuilt = row_create_prebuilt(add->indexed_table,
 				0 /* XXX Do we know the mysql_row_len here?
 				Before the addition of this parameter to
@@ -1018,8 +1455,14 @@ ha_innobase::final_add_index(
 
 		err = convert_error_code_to_mysql(
 			error, prebuilt->table->flags, user_thd);
-	} else {
-		/* We created secondary indexes (!new_primary). */
+	}
+
+	if (add->indexed_table == prebuilt->table
+	    || DICT_TF2_FLAG_IS_SET(prebuilt->table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		/* We created secondary indexes (!new_primary) or create full
+		text index and added a new Doc ID column, we will need to
+		rename the secondary index on the Doc ID column to its
+		official index name.. */
 
 		if (commit) {
 			err = convert_error_code_to_mysql(
@@ -1043,13 +1486,66 @@ ha_innobase::final_add_index(
 				}
 			}
 		}
+
+		DICT_TF2_FLAG_UNSET(prebuilt->table, DICT_TF2_FTS_ADD_DOC_ID);
 	}
 
 	/* If index is successfully built, we will need to rebuild index
 	translation table. Set valid index entry count in the translation
 	table to zero. */
 	if (err == 0 && commit) {
+		ibool		new_primary;
+		dict_index_t*	index;
+		dict_index_t*	next_index;
+		ibool		new_fts = FALSE;
+		dict_index_t*	primary;
+
+		new_primary = !my_strcasecmp(
+			system_charset_info, add->key_info[0].name, "PRIMARY");
+
+		primary = dict_table_get_first_index(add->indexed_table);
+
+		if (!new_primary) {
+			new_primary = !my_strcasecmp(
+				system_charset_info, add->key_info[0].name,
+				primary->name);
+		}
+
 		share->idx_trans_tbl.index_count = 0;
+
+		if (new_primary) {
+			for (index = primary; index; index = next_index) {
+
+				next_index = dict_table_get_next_index(index);
+
+				if (index->type & DICT_FTS) {
+					fts_add_index(index,
+						      add->indexed_table);
+					new_fts = TRUE;
+				}
+			}
+		} else {
+			ulint		i;
+			for (i = 0; i < add->num_of_keys; i++) {
+				if (add->key_info[i].flags & HA_FULLTEXT) {
+					dict_index_t*	fts_index;
+
+					fts_index =
+						dict_table_get_index_on_name(
+							prebuilt->table,
+							 add->key_info[i].name);
+
+					ut_ad(fts_index);
+					fts_add_index(fts_index,
+						      prebuilt->table);
+					new_fts = TRUE;
+				}
+			}
+		}
+
+		if (new_fts) {
+			fts_optimize_add_table(prebuilt->table);
+		}
 	}
 
 	trx_commit_for_mysql(trx);
@@ -1058,6 +1554,9 @@ ha_innobase::final_add_index(
 	}
 
 	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
+
+	ut_a(fts_check_cached_index(prebuilt->table));
+
 	row_mysql_unlock_data_dictionary(trx);
 
 	trx_free_for_mysql(trx);
@@ -1068,7 +1567,6 @@ ha_innobase::final_add_index(
 	delete add;
 	DBUG_RETURN(err);
 }
-
 /*******************************************************************//**
 Prepare to drop some indexes of a table.
 @return	0 or error number */
@@ -1076,13 +1574,13 @@ UNIV_INTERN
 int
 ha_innobase::prepare_drop_index(
 /*============================*/
-	TABLE*	table,		/*!< in: Table where indexes are dropped */
+	TABLE*	in_table,	/*!< in: Table where indexes are dropped */
 	uint*	key_num,	/*!< in: Key nums to be dropped */
 	uint	num_of_keys)	/*!< in: Number of keys to be dropped */
 {
 	trx_t*		trx;
 	int		err = 0;
-	uint 		n_key;
+	uint		n_key;
 
 	DBUG_ENTER("ha_innobase::prepare_drop_index");
 	ut_ad(table);
@@ -1284,7 +1782,8 @@ UNIV_INTERN
 int
 ha_innobase::final_drop_index(
 /*==========================*/
-	TABLE*	table)		/*!< in: Table where indexes are dropped */
+	TABLE*	        iin_table)	/*!< in: Table where indexes
+					are dropped */
 {
 	dict_index_t*	index;		/*!< Index to be dropped */
 	trx_t*		trx;		/*!< Transaction */
@@ -1300,12 +1799,12 @@ ha_innobase::final_drop_index(
 	update_thd();
 
 	trx_search_latch_release_if_reserved(prebuilt->trx);
-	trx_start_if_not_started(prebuilt->trx);
+	trx_start_if_not_started_xa(prebuilt->trx);
 
 	/* Create a background transaction for the operations on
 	the data dictionary tables. */
 	trx = innobase_trx_allocate(user_thd);
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	/* Flag this transaction as a dictionary operation, so that
 	the data dictionary will be locked in crash recovery. */
@@ -1317,6 +1816,36 @@ ha_innobase::final_drop_index(
 		row_merge_lock_table(prebuilt->trx, prebuilt->table, LOCK_X),
 		prebuilt->table->flags, user_thd);
 
+	/* Delete corresponding rows from the stats table.
+	Marko advises not to edit both user tables and SYS_* tables in one
+	trx, thus we use prebuilt->trx instead of trx. Because of this the
+	drop from SYS_* and from the stats table cannot happen in one
+	transaction and eventually if a crash occurs below, between
+	trx_commit_for_mysql(trx); which drops the indexes from SYS_* and
+	trx_commit_for_mysql(prebuilt->trx);
+	then an orphaned rows will be left in the stats table. */
+	for (index = dict_table_get_first_index(prebuilt->table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (index->to_be_dropped) {
+
+			enum db_err	ret;
+			char		errstr[1024];
+
+			ret = dict_stats_delete_index_stats(
+				index, prebuilt->trx,
+				errstr, sizeof(errstr));
+
+			if (ret != DB_SUCCESS) {
+				push_warning(user_thd,
+					     Sql_condition::WARN_LEVEL_WARN,
+					     ER_LOCK_WAIT_TIMEOUT,
+					     errstr);
+			}
+		}
+	}
+
 	row_mysql_lock_data_dictionary(trx);
 	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
 
@@ -1344,7 +1873,6 @@ ha_innobase::final_drop_index(
 		next_index = dict_table_get_next_index(index);
 
 		if (index->to_be_dropped) {
-
 			row_merge_drop_index(index, prebuilt->table, trx);
 		}
 
@@ -1363,6 +1891,9 @@ ha_innobase::final_drop_index(
 
 func_exit:
 	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
+
+	ut_a(fts_check_cached_index(prebuilt->table));
+
 	trx_commit_for_mysql(trx);
 	trx_commit_for_mysql(prebuilt->trx);
 	row_mysql_unlock_data_dictionary(trx);
diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc
index abbb8b10ce7..7d89b4b175b 100644
--- a/storage/innobase/handler/i_s.cc
+++ b/storage/innobase/handler/i_s.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -24,7 +24,7 @@ Created July 18, 2007 Vasil Dimov
 *******************************************************/
 
 #include <mysqld_error.h>
-#include <sql_acl.h>                            // PROCESS_ACL
+#include <sql_acl.h>				// PROCESS_ACL
 
 #include <m_ctype.h>
 #include <hash.h>
@@ -35,17 +35,106 @@ Created July 18, 2007 Vasil Dimov
 #include <sql_plugin.h>
 #include <innodb_priv.h>
 
-extern "C" {
+#include "btr0pcur.h"	/* for file sys_tables related info. */
 #include "btr0types.h"
-#include "buf0buddy.h" /* for i_s_cmpmem */
-#include "buf0buf.h" /* for buf_pool and PAGE_ZIP_MIN_SIZE */
+#include "buf0buddy.h"	/* for i_s_cmpmem */
+#include "buf0buf.h"	/* for buf_pool */
+#include "dict0load.h"	/* for file sys_tables related info. */
 #include "dict0mem.h"
 #include "dict0types.h"
 #include "ha_prototypes.h" /* for innobase_convert_name() */
-#include "srv0start.h" /* for srv_was_started */
+#include "srv0start.h"	/* for srv_was_started */
 #include "trx0i_s.h"
-#include "trx0trx.h" /* for TRX_QUE_STATE_STR_MAX_LEN */
-}
+#include "trx0trx.h"	/* for TRX_QUE_STATE_STR_MAX_LEN */
+#include "srv0mon.h"
+#include "fut0fut.h"
+#include "pars0pars.h"
+#include "fts0types.h"
+#include "fts0opt.h"
+#include "fts0priv.h"
+#include "btr0btr.h"
+#include "page0zip.h"
+
+/** structure associates a name string with a file page type and/or buffer
+page state. */
+struct buffer_page_desc_str_struct{
+	const char*	type_str;	/*!< String explain the page
+					type/state */
+	ulint		type_value;	/*!< Page type or page state */
+};
+
+typedef struct buffer_page_desc_str_struct	buf_page_desc_str_t;
+
+/** Any states greater than FIL_PAGE_TYPE_LAST would be treated as unknown. */
+#define	I_S_PAGE_TYPE_UNKNOWN		(FIL_PAGE_TYPE_LAST + 1)
+
+/** We also define I_S_PAGE_TYPE_INDEX as the Index Page's position
+in i_s_page_type[] array */
+#define I_S_PAGE_TYPE_INDEX		1
+
+/** Name string for File Page Types */
+static buf_page_desc_str_t	i_s_page_type[] = {
+	{"ALLOCATED", FIL_PAGE_TYPE_ALLOCATED},
+	{"INDEX", FIL_PAGE_INDEX},
+	{"UNDO_LOG", FIL_PAGE_UNDO_LOG},
+	{"INODE", FIL_PAGE_INODE},
+	{"IBUF_FREE_LIST", FIL_PAGE_IBUF_FREE_LIST},
+	{"IBUF_BITMAP", FIL_PAGE_IBUF_BITMAP},
+	{"SYSTEM", FIL_PAGE_TYPE_SYS},
+	{"TRX_SYSTEM", FIL_PAGE_TYPE_TRX_SYS},
+	{"FILE_SPACE_HEADER", FIL_PAGE_TYPE_FSP_HDR},
+	{"EXTENT_DESCRIPTOR", FIL_PAGE_TYPE_XDES},
+	{"BLOB", FIL_PAGE_TYPE_BLOB},
+	{"COMPRESSED_BLOB", FIL_PAGE_TYPE_ZBLOB},
+	{"COMPRESSED_BLOB2", FIL_PAGE_TYPE_ZBLOB2},
+	{"UNKNOWN", I_S_PAGE_TYPE_UNKNOWN}
+};
+
+/* Check if we can hold all page type in a 4 bit value */
+#if I_S_PAGE_TYPE_UNKNOWN > 1<<4
+# error "i_s_page_type[] is too large"
+#endif
+
+/** This structure defines information we will fetch from pages
+currently cached in the buffer pool. It will be used to populate
+table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE */
+struct buffer_page_info_struct{
+	ulint		block_id;	/*!< Buffer Pool block ID */
+	unsigned	space_id:32;	/*!< Tablespace ID */
+	unsigned	page_num:32;	/*!< Page number/offset */
+	unsigned	access_time:32;	/*!< Time of first access */
+	unsigned	pool_id:MAX_BUFFER_POOLS_BITS;
+					/*!< Buffer Pool ID. Must be less than
+					MAX_BUFFER_POOLS */
+	unsigned	flush_type:2;	/*!< Flush type */
+	unsigned	io_fix:2;	/*!< type of pending I/O operation */
+	unsigned	fix_count:19;	/*!< Count of how manyfold this block
+					is bufferfixed */
+	unsigned	hashed:1;	/*!< Whether hash index has been
+					built on this page */
+	unsigned	is_old:1;	/*!< TRUE if the block is in the old
+					blocks in buf_pool->LRU_old */
+	unsigned	freed_page_clock:31; /*!< the value of
+					buf_pool->freed_page_clock */
+	unsigned	zip_ssize:PAGE_ZIP_SSIZE_BITS;
+					/*!< Compressed page size */
+	unsigned	page_state:BUF_PAGE_STATE_BITS; /*!< Page state */
+	unsigned	page_type:4;	/*!< Page type */
+	unsigned	num_recs:UNIV_PAGE_SIZE_SHIFT_MAX-2;
+					/*!< Number of records on Page */
+	unsigned	data_size:UNIV_PAGE_SIZE_SHIFT_MAX;
+					/*!< Sum of the sizes of the records */
+	lsn_t		newest_mod;	/*!< Log sequence number of
+					the youngest modification */
+	lsn_t		oldest_mod;	/*!< Log sequence number of
+					the oldest modification */
+	index_id_t	index_id;	/*!< Index ID if a index page */
+};
+
+typedef struct buffer_page_info_struct	buf_page_info_t;
+
+/** maximum number of buffer page info we would cache. */
+#define MAX_BUF_INFO_CACHED		10000
 
 #define OK(expr)		\
 	if ((expr) != 0) {	\
@@ -55,7 +144,7 @@ extern "C" {
 #define RETURN_IF_INNODB_NOT_STARTED(plugin_name)			\
 do {									\
 	if (!srv_was_started) {						\
-		push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,	\
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,	\
 				    ER_CANT_FIND_SYSTEM_REC,		\
 				    "InnoDB: SELECTing from "		\
 				    "INFORMATION_SCHEMA.%s but "	\
@@ -65,7 +154,8 @@ do {									\
 	}								\
 } while (0)
 
-#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 && !defined __INTEL_COMPILER
+#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 &&	\
+	!defined __INTEL_COMPILER && !defined __clang__
 #define STRUCT_FLD(name, value)	name: value
 #else
 #define STRUCT_FLD(name, value)	value
@@ -122,7 +212,7 @@ trx_i_s_common_fill_table(
 /*======================*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond);	/*!< in: condition (not used) */
+	Item*		);	/*!< in: condition (not used) */
 
 /*******************************************************************//**
 Unbind a dynamic INFORMATION_SCHEMA table.
@@ -146,16 +236,20 @@ field_store_time_t(
 	MYSQL_TIME	my_time;
 	struct tm	tm_time;
 
+	if (time) {
 #if 0
-	/* use this if you are sure that `variables' and `time_zone'
-	are always initialized */
-	thd->variables.time_zone->gmt_sec_to_TIME(
-		&my_time, (my_time_t) time);
+		/* use this if you are sure that `variables' and `time_zone'
+		are always initialized */
+		thd->variables.time_zone->gmt_sec_to_TIME(
+			&my_time, (my_time_t) time);
 #else
-	localtime_r(&time, &tm_time);
-	localtime_to_TIME(&my_time, &tm_time);
-	my_time.time_type = MYSQL_TIMESTAMP_DATETIME;
+		localtime_r(&time, &tm_time);
+		localtime_to_TIME(&my_time, &tm_time);
+		my_time.time_type = MYSQL_TIMESTAMP_DATETIME;
 #endif
+	} else {
+		memset(&my_time, 0, sizeof(my_time));
+	}
 
 	return(field->store_time(&my_time));
 }
@@ -414,6 +508,24 @@ static ST_FIELD_INFO	innodb_trx_fields_info[] =
 	 STRUCT_FLD(old_name,		""),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
+#define IDX_TRX_READ_ONLY		22
+	{STRUCT_FLD(field_name,		"trx_is_read_only"),
+	 STRUCT_FLD(field_length,	1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_AUTOCOMMIT_NON_LOCKING	23
+	{STRUCT_FLD(field_name,		"trx_autocommit_non_locking"),
+	 STRUCT_FLD(field_length,	1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
 	END_OF_ST_FIELD_INFO
 };
 
@@ -560,6 +672,15 @@ fill_innodb_trx_from_cache(
 		OK(fields[IDX_TRX_ADAPTIVE_HASH_TIMEOUT]->store(
 			   (longlong) row->trx_search_latch_timeout, true));
 
+		/* trx_is_read_only*/
+		OK(fields[IDX_TRX_READ_ONLY]->store(
+				(long) row->trx_is_read_only, true));
+
+		/* trx_is_autocommit_non_locking */
+		OK(fields[IDX_TRX_AUTOCOMMIT_NON_LOCKING]->store(
+				(long) row->trx_is_autocommit_non_locking,
+				true));
+
 		OK(schema_table_store_record(thd, table));
 	}
 
@@ -592,7 +713,7 @@ static struct st_mysql_information_schema	i_s_info =
 	MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION
 };
 
-UNIV_INTERN struct st_mysql_plugin	i_s_innodb_trx =
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_trx =
 {
 	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
 	/* int */
@@ -636,13 +757,9 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_trx =
 	/* struct st_mysql_sys_var** */
 	STRUCT_FLD(system_vars, NULL),
 
-	/* reserved for dependency checking */
-	/* void* */
-	STRUCT_FLD(__reserved1, NULL),
-
-	/* Plugin flags */
-	/* unsigned long */
-	STRUCT_FLD(flags, 0UL),
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
 };
 
 /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */
@@ -862,7 +979,7 @@ innodb_locks_init(
 	DBUG_RETURN(0);
 }
 
-UNIV_INTERN struct st_mysql_plugin	i_s_innodb_locks =
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_locks =
 {
 	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
 	/* int */
@@ -906,13 +1023,9 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_locks =
 	/* struct st_mysql_sys_var** */
 	STRUCT_FLD(system_vars, NULL),
 
-	/* reserved for dependency checking */
-	/* void* */
-	STRUCT_FLD(__reserved1, NULL),
-
-	/* Plugin flags */
-	/* unsigned long */
-	STRUCT_FLD(flags, 0UL),
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
 };
 
 /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */
@@ -1049,7 +1162,7 @@ innodb_lock_waits_init(
 	DBUG_RETURN(0);
 }
 
-UNIV_INTERN struct st_mysql_plugin	i_s_innodb_lock_waits =
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_lock_waits =
 {
 	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
 	/* int */
@@ -1093,13 +1206,9 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_lock_waits =
 	/* struct st_mysql_sys_var** */
 	STRUCT_FLD(system_vars, NULL),
 
-	/* reserved for dependency checking */
-	/* void* */
-	STRUCT_FLD(__reserved1, NULL),
-
-	/* Plugin flags */
-	/* unsigned long */
-	STRUCT_FLD(flags, 0UL),
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
 };
 
 /*******************************************************************//**
@@ -1114,7 +1223,7 @@ trx_i_s_common_fill_table(
 /*======================*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond)	/*!< in: condition (not used) */
+	Item*		)	/*!< in: condition (not used) */
 {
 	const char*		table_name;
 	int			ret;
@@ -1274,10 +1383,10 @@ i_s_cmp_fill_low(
 /*=============*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond,	/*!< in: condition (ignored) */
+	Item*		,	/*!< in: condition (ignored) */
 	ibool		reset)	/*!< in: TRUE=reset cumulated counts */
 {
-	TABLE*	table	= (TABLE *) tables->table;
+	TABLE*	table	= (TABLE*) tables->table;
 	int	status	= 0;
 
 	DBUG_ENTER("i_s_cmp_fill_low");
@@ -1290,17 +1399,17 @@ i_s_cmp_fill_low(
 
 	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
 
-	for (uint i = 0; i < PAGE_ZIP_NUM_SSIZE - 1; i++) {
+	for (uint i = 0; i < PAGE_ZIP_SSIZE_MAX; i++) {
 		page_zip_stat_t*	zip_stat = &page_zip_stat[i];
 
-		table->field[0]->store(PAGE_ZIP_MIN_SIZE << i);
+		table->field[0]->store(UNIV_ZIP_SIZE_MIN << i);
 
 		/* The cumulated counts are not protected by any
-		mutex.  Thus, some operation in page0zip.c could
+		mutex.  Thus, some operation in page0zip.cc could
 		increment a counter between the time we read it and
 		clear it.  We could introduce mutex protection, but it
 		could cause a measureable performance hit in
-		page0zip.c. */
+		page0zip.cc. */
 		table->field[1]->store(zip_stat->compressed);
 		table->field[2]->store(zip_stat->compressed_ok);
 		table->field[3]->store(
@@ -1331,7 +1440,7 @@ i_s_cmp_fill(
 /*=========*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond)	/*!< in: condition (ignored) */
+	Item*		cond)	/*!< in: condition (ignored) */
 {
 	return(i_s_cmp_fill_low(thd, tables, cond, FALSE));
 }
@@ -1345,7 +1454,7 @@ i_s_cmp_reset_fill(
 /*===============*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond)	/*!< in: condition (ignored) */
+	Item*		cond)	/*!< in: condition (ignored) */
 {
 	return(i_s_cmp_fill_low(thd, tables, cond, TRUE));
 }
@@ -1386,7 +1495,7 @@ i_s_cmp_reset_init(
 	DBUG_RETURN(0);
 }
 
-UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmp =
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_cmp =
 {
 	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
 	/* int */
@@ -1430,16 +1539,12 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmp =
 	/* struct st_mysql_sys_var** */
 	STRUCT_FLD(system_vars, NULL),
 
-	/* reserved for dependency checking */
-	/* void* */
-	STRUCT_FLD(__reserved1, NULL),
-
-	/* Plugin flags */
-	/* unsigned long */
-	STRUCT_FLD(flags, 0UL),
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
 };
 
-UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmp_reset =
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_cmp_reset =
 {
 	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
 	/* int */
@@ -1484,13 +1589,9 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmp_reset =
 	/* struct st_mysql_sys_var** */
 	STRUCT_FLD(system_vars, NULL),
 
-	/* reserved for dependency checking */
-	/* void* */
-	STRUCT_FLD(__reserved1, NULL),
-
-	/* Plugin flags */
-	/* unsigned long */
-	STRUCT_FLD(flags, 0UL),
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
 };
 
 /* Fields of the dynamic table information_schema.innodb_cmpmem. */
@@ -1542,7 +1643,7 @@ static ST_FIELD_INFO	i_s_cmpmem_fields_info[] =
 	 STRUCT_FLD(value,		0),
 	 STRUCT_FLD(field_flags,	0),
 	 STRUCT_FLD(old_name,		"Total Duration of Relocations,"
-		    			" in Seconds"),
+					" in Seconds"),
 	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
 
 	END_OF_ST_FIELD_INFO
@@ -1558,11 +1659,11 @@ i_s_cmpmem_fill_low(
 /*================*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond,	/*!< in: condition (ignored) */
+	Item*		,	/*!< in: condition (ignored) */
 	ibool		reset)	/*!< in: TRUE=reset cumulated counts */
 {
 	int		status = 0;
-	TABLE*	table	= (TABLE *) tables->table;
+	TABLE*	table	= (TABLE*) tables->table;
 
 	DBUG_ENTER("i_s_cmpmem_fill_low");
 
@@ -1630,7 +1731,7 @@ i_s_cmpmem_fill(
 /*============*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond)	/*!< in: condition (ignored) */
+	Item*		cond)	/*!< in: condition (ignored) */
 {
 	return(i_s_cmpmem_fill_low(thd, tables, cond, FALSE));
 }
@@ -1644,7 +1745,7 @@ i_s_cmpmem_reset_fill(
 /*==================*/
 	THD*		thd,	/*!< in: thread */
 	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
-	COND*		cond)	/*!< in: condition (ignored) */
+	Item*		cond)	/*!< in: condition (ignored) */
 {
 	return(i_s_cmpmem_fill_low(thd, tables, cond, TRUE));
 }
@@ -1685,7 +1786,7 @@ i_s_cmpmem_reset_init(
 	DBUG_RETURN(0);
 }
 
-UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmpmem =
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_cmpmem =
 {
 	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
 	/* int */
@@ -1729,16 +1830,12 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmpmem =
 	/* struct st_mysql_sys_var** */
 	STRUCT_FLD(system_vars, NULL),
 
-	/* reserved for dependency checking */
-	/* void* */
-	STRUCT_FLD(__reserved1, NULL),
-
-	/* Plugin flags */
-	/* unsigned long */
-	STRUCT_FLD(flags, 0UL),
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
 };
 
-UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmpmem_reset =
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_cmpmem_reset =
 {
 	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
 	/* int */
@@ -1783,13 +1880,3545 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmpmem_reset =
 	/* struct st_mysql_sys_var** */
 	STRUCT_FLD(system_vars, NULL),
 
-	/* reserved for dependency checking */
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_metrics */
+static ST_FIELD_INFO	innodb_metrics_fields_info[] =
+{
+#define	METRIC_NAME		0
+	{STRUCT_FLD(field_name,		"NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_SUBSYS		1
+	{STRUCT_FLD(field_name,		"SUBSYSTEM"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_VALUE_START	2
+	{STRUCT_FLD(field_name,		"COUNT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_MAX_VALUE_START	3
+	{STRUCT_FLD(field_name,		"MAX_COUNT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_MIN_VALUE_START	4
+	{STRUCT_FLD(field_name,		"MIN_COUNT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_AVG_VALUE_START	5
+	{STRUCT_FLD(field_name,		"AVG_COUNT"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_FLOAT),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_VALUE_RESET	6
+	{STRUCT_FLD(field_name,		"COUNT_RESET"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_MAX_VALUE_RESET	7
+	{STRUCT_FLD(field_name,		"MAX_COUNT_RESET"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_MIN_VALUE_RESET	8
+	{STRUCT_FLD(field_name,		"MIN_COUNT_RESET"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_AVG_VALUE_RESET	9
+	{STRUCT_FLD(field_name,		"AVG_COUNT_RESET"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_FLOAT),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_START_TIME	10
+	{STRUCT_FLD(field_name,		"TIME_ENABLED"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_DATETIME),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_STOP_TIME	11
+	{STRUCT_FLD(field_name,		"TIME_DISABLED"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_DATETIME),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_TIME_ELAPSED	12
+	{STRUCT_FLD(field_name,		"TIME_ELAPSED"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_RESET_TIME	13
+	{STRUCT_FLD(field_name,		"TIME_RESET"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_DATETIME),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_STATUS		14
+	{STRUCT_FLD(field_name,		"STATUS"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_TYPE		15
+	{STRUCT_FLD(field_name,		"TYPE"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	METRIC_DESC		16
+	{STRUCT_FLD(field_name,		"COMMENT"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Fill the information schema metrics table.
+@return	0 on success */
+static
+int
+i_s_metrics_fill(
+/*=============*/
+	THD*		thd,		/*!< in: thread */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	int		count;
+	Field**		fields;
+	double		time_diff = 0;
+	monitor_info_t*	monitor_info;
+	mon_type_t	min_val;
+	mon_type_t	max_val;
+
+	DBUG_ENTER("i_s_metrics_fill");
+	fields = table_to_fill->field;
+
+	for (count = 0; count < NUM_MONITOR; count++) {
+		monitor_info = srv_mon_get_info((monitor_id_t) count);
+
+		/* A good place to sanity check the Monitor ID */
+		ut_a(count == monitor_info->monitor_id);
+
+		/* If the item refers to a Module, nothing to fill,
+		continue. */
+		if ((monitor_info->monitor_type & MONITOR_MODULE)
+		    || (monitor_info->monitor_type & MONITOR_HIDDEN)) {
+			continue;
+		}
+
+		/* If this is an existing "status variable", and
+		its corresponding counter is still on, we need
+		to calculate the result from its corresponding
+		counter. */
+		if (monitor_info->monitor_type & MONITOR_EXISTING
+		    && MONITOR_IS_ON(count)) {
+			srv_mon_process_existing_counter((monitor_id_t) count,
+							 MONITOR_GET_VALUE);
+		}
+
+		/* Fill in counter's basic information */
+		OK(field_store_string(fields[METRIC_NAME],
+				      monitor_info->monitor_name));
+
+		OK(field_store_string(fields[METRIC_SUBSYS],
+				      monitor_info->monitor_module));
+
+		OK(field_store_string(fields[METRIC_DESC],
+				      monitor_info->monitor_desc));
+
+		/* Fill in counter values */
+		OK(fields[METRIC_VALUE_RESET]->store(
+			MONITOR_VALUE(count), FALSE));
+
+		OK(fields[METRIC_VALUE_START]->store(
+			MONITOR_VALUE_SINCE_START(count), FALSE));
+
+		/* If the max value is MAX_RESERVED, counter max
+		value has not been updated. Set the column value
+		to NULL. */
+		if (MONITOR_MAX_VALUE(count) == MAX_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MAX_VALUE_RESET]->set_null();
+		} else {
+			OK(fields[METRIC_MAX_VALUE_RESET]->store(
+				MONITOR_MAX_VALUE(count), FALSE));
+			fields[METRIC_MAX_VALUE_RESET]->set_notnull();
+		}
+
+		/* If the min value is MAX_RESERVED, counter min
+		value has not been updated. Set the column value
+		to NULL. */
+		if (MONITOR_MIN_VALUE(count) == MIN_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MIN_VALUE_RESET]->set_null();
+		} else {
+			OK(fields[METRIC_MIN_VALUE_RESET]->store(
+				MONITOR_MIN_VALUE(count), FALSE));
+			fields[METRIC_MIN_VALUE_RESET]->set_notnull();
+		}
+
+		/* Calculate the max value since counter started */
+		max_val = srv_mon_calc_max_since_start((monitor_id_t) count);
+
+		if (max_val == MAX_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MAX_VALUE_START]->set_null();
+		} else {
+			OK(fields[METRIC_MAX_VALUE_START]->store(
+				max_val, FALSE));
+			fields[METRIC_MAX_VALUE_START]->set_notnull();
+		}
+
+		/* Calculate the min value since counter started */
+		min_val = srv_mon_calc_min_since_start((monitor_id_t) count);
+
+		if (min_val == MIN_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MIN_VALUE_START]->set_null();
+		} else {
+			OK(fields[METRIC_MIN_VALUE_START]->store(
+				min_val, FALSE));
+
+			fields[METRIC_MIN_VALUE_START]->set_notnull();
+		}
+
+		/* If monitor has been enabled (no matter it is disabled
+		or not now), fill METRIC_START_TIME and METRIC_TIME_ELAPSED
+		field */
+		if (MONITOR_FIELD(count, mon_start_time)) {
+			OK(field_store_time_t(fields[METRIC_START_TIME],
+				(time_t)MONITOR_FIELD(count, mon_start_time)));
+			fields[METRIC_START_TIME]->set_notnull();
+
+			/* If monitor is enabled, the TIME_ELAPSED is the
+			time difference between current and time when monitor
+			is enabled. Otherwise, it is the time difference
+			between time when monitor is enabled and time
+			when it is disabled */
+			if (MONITOR_IS_ON(count)) {
+				time_diff = difftime(time(NULL),
+					MONITOR_FIELD(count, mon_start_time));
+			} else {
+				time_diff =  difftime(
+					MONITOR_FIELD(count, mon_stop_time),
+					MONITOR_FIELD(count, mon_start_time));
+			}
+
+			OK(fields[METRIC_TIME_ELAPSED]->store(
+				time_diff));
+			fields[METRIC_TIME_ELAPSED]->set_notnull();
+		} else {
+			fields[METRIC_START_TIME]->set_null();
+			fields[METRIC_TIME_ELAPSED]->set_null();
+			time_diff = 0;
+		}
+
+		/* Unless MONITOR__NO_AVERAGE is marked, we will need
+		to calculate the average value. If this is a monitor set
+		owner marked by MONITOR_SET_OWNER, divide
+		the value by another counter (number of calls) designated
+		by monitor_info->monitor_related_id.
+		Otherwise average the counter value by the time between the
+		time that the counter is enabled and time it is disabled
+		or time it is sampled. */
+		if (!(monitor_info->monitor_type & MONITOR_NO_AVERAGE)
+		    && (monitor_info->monitor_type & MONITOR_SET_OWNER)
+		    && monitor_info->monitor_related_id) {
+			mon_type_t	value_start
+				 = MONITOR_VALUE_SINCE_START(
+					monitor_info->monitor_related_id);
+
+			if (value_start) {
+				OK(fields[METRIC_AVG_VALUE_START]->store(
+					MONITOR_VALUE_SINCE_START(count)
+					/ value_start, FALSE));
+
+				fields[METRIC_AVG_VALUE_START]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_START]->set_null();
+			}
+
+			if (MONITOR_VALUE(monitor_info->monitor_related_id)) {
+				OK(fields[METRIC_AVG_VALUE_RESET]->store(
+					MONITOR_VALUE(count)
+					/ MONITOR_VALUE(
+					monitor_info->monitor_related_id),
+					FALSE));
+			} else {
+				fields[METRIC_AVG_VALUE_RESET]->set_null();
+			}
+		} else if (!(monitor_info->monitor_type & MONITOR_NO_AVERAGE)
+			   && !(monitor_info->monitor_type
+				& MONITOR_DISPLAY_CURRENT)) {
+			if (time_diff) {
+				OK(fields[METRIC_AVG_VALUE_START]->store(
+					(double) MONITOR_VALUE_SINCE_START(
+						count) / time_diff));
+				fields[METRIC_AVG_VALUE_START]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_START]->set_null();
+			}
+
+			if (MONITOR_FIELD(count, mon_reset_time)) {
+				/* calculate the time difference since last
+				reset */
+				if (MONITOR_IS_ON(count)) {
+					time_diff = difftime(
+						time(NULL), MONITOR_FIELD(
+							count, mon_reset_time));
+				} else {
+					time_diff =  difftime(
+					MONITOR_FIELD(count, mon_stop_time),
+					MONITOR_FIELD(count, mon_reset_time));
+				}
+			} else {
+				time_diff = 0;
+			}
+
+			if (time_diff) {
+				OK(fields[METRIC_AVG_VALUE_RESET]->store(
+					(double )MONITOR_VALUE(count)
+					/ time_diff));
+				fields[METRIC_AVG_VALUE_RESET]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_RESET]->set_null();
+			}
+		} else {
+			fields[METRIC_AVG_VALUE_START]->set_null();
+			fields[METRIC_AVG_VALUE_RESET]->set_null();
+		}
+
+
+		if (MONITOR_IS_ON(count)) {
+			/* If monitor is on, the stop time will set to NULL */
+			fields[METRIC_STOP_TIME]->set_null();
+
+			/* Display latest Monitor Reset Time only if Monitor
+			counter is on. */
+			if (MONITOR_FIELD(count, mon_reset_time)) {
+				OK(field_store_time_t(
+					fields[METRIC_RESET_TIME],
+					(time_t)MONITOR_FIELD(
+						count, mon_reset_time)));
+				fields[METRIC_RESET_TIME]->set_notnull();
+			} else {
+				fields[METRIC_RESET_TIME]->set_null();
+			}
+
+			/* Display the monitor status as "enabled" */
+			OK(field_store_string(fields[METRIC_STATUS],
+					      "enabled"));
+		} else {
+			if (MONITOR_FIELD(count, mon_stop_time)) {
+				OK(field_store_time_t(fields[METRIC_STOP_TIME],
+				(time_t)MONITOR_FIELD(count, mon_stop_time)));
+				fields[METRIC_STOP_TIME]->set_notnull();
+			} else {
+				fields[METRIC_STOP_TIME]->set_null();
+			}
+
+			fields[METRIC_RESET_TIME]->set_null();
+
+			OK(field_store_string(fields[METRIC_STATUS],
+					      "disabled"));
+		}
+
+		if (monitor_info->monitor_type & MONITOR_DISPLAY_CURRENT) {
+			OK(field_store_string(fields[METRIC_TYPE],
+					      "value"));
+		} else if (monitor_info->monitor_type & MONITOR_EXISTING) {
+			OK(field_store_string(fields[METRIC_TYPE],
+					      "status_counter"));
+		} else if (monitor_info->monitor_type & MONITOR_SET_OWNER) {
+			OK(field_store_string(fields[METRIC_TYPE],
+					      "set_owner"));
+		} else if ( monitor_info->monitor_type & MONITOR_SET_MEMBER) {
+			OK(field_store_string(fields[METRIC_TYPE],
+					      "set_member"));
+		} else {
+			OK(field_store_string(fields[METRIC_TYPE],
+					      "counter"));
+		}
+
+		OK(schema_table_store_record(thd, table_to_fill));
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Function to fill information schema metrics tables.
+@return	0 on success */
+static
+int
+i_s_metrics_fill_table(
+/*===================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	DBUG_ENTER("i_s_metrics_fill_table");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	i_s_metrics_fill(thd, tables->table);
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_metrics
+@return	0 on success */
+static
+int
+innodb_metrics_init(
+/*================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_metrics_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = innodb_metrics_fields_info;
+	schema->fill_table = i_s_metrics_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_metrics =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_METRICS"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB Metrics Info"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_metrics_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_ft_default_stopword */
+static ST_FIELD_INFO	i_s_stopword_fields_info[] =
+{
+#define STOPWORD_VALUE	0
+	{STRUCT_FLD(field_name,		"value"),
+	 STRUCT_FLD(field_length,	TRX_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_ft_default_stopword.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_stopword_fill(
+/*==============*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	Field**	fields;
+	ulint	i = 0;
+	TABLE*	table = (TABLE*) tables->table;
+
+	DBUG_ENTER("i_s_stopword_fill");
+
+	fields = table->field;
+
+	/* Fill with server default stopword list in array
+	fts_default_stopword */
+	while (fts_default_stopword[i]) {
+		OK(field_store_string(fields[STOPWORD_VALUE],
+				      fts_default_stopword[i]));
+
+		OK(schema_table_store_record(thd, table));
+		i++;
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_ft_default_stopword.
+@return	0 on success */
+static
+int
+i_s_stopword_init(
+/*==============*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_stopword_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_stopword_fields_info;
+	schema->fill_table = i_s_stopword_fill;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_ft_default_stopword =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_stopword_fields_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_DEFAULT_STOPWORD"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "Default stopword list for InnDB Full Text Search"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_stopword_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED and
+INFORMATION_SCHEMA.INNODB_FT_INSERTED */
+static ST_FIELD_INFO	i_s_fts_doc_fields_info[] =
+{
+#define	I_S_FTS_DOC_ID			0
+	{STRUCT_FLD(field_name,		"DOC_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED or
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_generic_fill(
+/*=========================*/
+	THD*		thd,		/*!< in: thread */
+	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
+	ibool		being_deleted)	/*!< in: BEING_DELTED table */
+{
+	Field**			fields;
+	TABLE*			table = (TABLE*) tables->table;
+	trx_t*			trx;
+	fts_table_t		fts_table;
+	fts_doc_ids_t*		deleted;
+	dict_table_t*		user_table;
+
+	DBUG_ENTER("i_s_fts_deleted_generic_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	deleted = fts_doc_ids_create();
+
+	user_table = dict_table_open_on_name_no_stats(
+			fts_internal_tbl_name, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "Select for FTS DELETE TABLE";
+
+	FTS_INIT_FTS_TABLE(&fts_table,
+			   (being_deleted) ? "BEING_DELETED" : "DELETED",
+			   FTS_COMMON_TABLE, user_table);
+
+	fts_table_fetch_doc_ids(trx, &fts_table, deleted);
+
+	fields = table->field;
+
+	for (ulint j = 0; j < ib_vector_size(deleted->doc_ids); ++j) {
+		doc_id_t	doc_id;
+
+		doc_id = *(doc_id_t*) ib_vector_get_const(deleted->doc_ids, j);
+
+		OK(fields[I_S_FTS_DOC_ID]->store((longlong) doc_id, true));
+
+		OK(schema_table_store_record(thd, table));
+	}
+
+	trx_free_for_background(trx);
+
+	fts_doc_ids_free(deleted);
+
+	dict_table_close(user_table, FALSE);
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_fill(
+/*=================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	DBUG_ENTER("i_s_fts_deleted_fill");
+
+	DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, FALSE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
+@return	0 on success */
+static
+int
+i_s_fts_deleted_init(
+/*=================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_fts_deleted_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_fts_doc_fields_info;
+	schema->fill_table = i_s_fts_deleted_fill;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_ft_deleted =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_fts_doc_fields_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_DELETED"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS DELETED TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_deleted_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_being_deleted_fill(
+/*=======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	DBUG_ENTER("i_s_fts_being_deleted_fill");
+
+	DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return	0 on success */
+static
+int
+i_s_fts_being_deleted_init(
+/*=======================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_fts_deleted_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_fts_doc_fields_info;
+	schema->fill_table = i_s_fts_being_deleted_fill;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_ft_being_deleted =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_fts_doc_fields_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_BEING_DELETED"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS BEING DELETED TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_being_deleted_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INSERTED.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_inserted_fill(
+/*==================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	Field**			fields;
+	TABLE*			table = (TABLE*) tables->table;
+	trx_t*			trx;
+	fts_table_t		fts_table;
+	fts_doc_ids_t*		inserted;
+	dict_table_t*		user_table;
+
+	DBUG_ENTER("i_s_fts_inserted_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	user_table = dict_table_open_on_name_no_stats(
+			fts_internal_tbl_name, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	inserted = fts_doc_ids_create();
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "Select for FTS ADDED Table";
+
+	FTS_INIT_FTS_TABLE(&fts_table, "ADDED", FTS_COMMON_TABLE, user_table);
+
+	fts_table_fetch_doc_ids(trx, &fts_table, inserted);
+
+	fields = table->field;
+
+	for (ulint j = 0; j < ib_vector_size(inserted->doc_ids); ++j) {
+		doc_id_t	doc_id;
+
+		doc_id = *(doc_id_t*) ib_vector_get_const(inserted->doc_ids, j);
+
+		OK(fields[I_S_FTS_DOC_ID]->store((longlong) doc_id, true));
+
+		OK(schema_table_store_record(thd, table));
+	}
+
+	trx_free_for_background(trx);
+
+	fts_doc_ids_free(inserted);
+
+	dict_table_close(user_table, FALSE);
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INSERTED
+@return	0 on success */
+static
+int
+i_s_fts_inserted_init(
+/*==================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_fts_inserted_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_fts_doc_fields_info;
+	schema->fill_table = i_s_fts_inserted_fill;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_ft_inserted =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_fts_doc_fields_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_INSERTED"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS INSERTED TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_inserted_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED and
+INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE */
+static ST_FIELD_INFO	i_s_fts_index_fields_info[] =
+{
+#define	I_S_FTS_WORD			0
+	{STRUCT_FLD(field_name,		"WORD"),
+	 STRUCT_FLD(field_length,	FTS_MAX_WORD_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	I_S_FTS_FIRST_DOC_ID		1
+	{STRUCT_FLD(field_name,		"FIRST_DOC_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	I_S_FTS_LAST_DOC_ID		2
+	{STRUCT_FLD(field_name,		"LAST_DOC_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	I_S_FTS_DOC_COUNT		3
+	{STRUCT_FLD(field_name,		"DOC_COUNT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	I_S_FTS_ILIST_DOC_ID		4
+	{STRUCT_FLD(field_name,		"DOC_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	I_S_FTS_ILIST_DOC_POS		5
+	{STRUCT_FLD(field_name,		"POSITION"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Go through the Doc Node and its ilist, fill the dynamic table
+INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED for one FTS index on the table.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill_one_index(
+/*===============================*/
+	fts_index_cache_t*	index_cache,	/*!< in: FTS index cache */
+	THD*			thd,		/*!< in: thread */
+	TABLE_LIST*		tables)		/*!< in/out: tables to fill */
+{
+	TABLE*			table = (TABLE*) tables->table;
+	Field**			fields;
+	const ib_rbt_node_t*	rbt_node;
+
+	DBUG_ENTER("i_s_fts_index_cache_fill_one_index");
+
+	fields = table->field;
+
+	/* Go through each word in the index cache */
+	for (rbt_node = rbt_first(index_cache->words);
+	     rbt_node;
+	     rbt_node = rbt_next(index_cache->words, rbt_node)) {
+		doc_id_t	doc_id = 0;
+
+		fts_tokenizer_word_t* word;
+
+		word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+		/* Decrypt the ilist, and display Dod ID and word position */
+		for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+			fts_node_t*	node;
+			byte*		ptr;
+			ulint		decoded = 0;
+
+			node = static_cast<fts_node_t*> (ib_vector_get(
+				word->nodes, i));
+
+			ptr = node->ilist;
+
+			while (decoded < node->ilist_size) {
+				ulint	pos = fts_decode_vlc(&ptr);
+
+				doc_id += pos;
+
+				/* Get position info */
+				while (*ptr) {
+					pos = fts_decode_vlc(&ptr);
+
+					OK(field_store_string(
+						fields[I_S_FTS_WORD],
+						reinterpret_cast<const char*>
+						(word->text.f_str)));
+
+					OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+						(longlong) node->first_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+						(longlong) node->last_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_DOC_COUNT]->store(
+						node->doc_count));
+
+					OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+						(longlong) doc_id, true));
+
+					OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+						pos));
+
+					OK(schema_table_store_record(
+						thd, table));
+				}
+
+				++ptr;
+
+				decoded = ptr - (byte*) node->ilist;
+			}
+		}
+	}
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill(
+/*=====================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	dict_table_t*		user_table;
+	fts_cache_t*		cache;
+
+	DBUG_ENTER("i_s_fts_index_cache_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	user_table = dict_table_open_on_name_no_stats(
+			fts_internal_tbl_name, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	cache = user_table->fts->cache;
+
+	ut_a(cache);
+
+	for (ulint i = 0; i < ib_vector_size(cache->indexes); i++) {
+		fts_index_cache_t*      index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*> (
+			ib_vector_get(cache->indexes, i));
+
+		i_s_fts_index_cache_fill_one_index(index_cache, thd, tables);
+	}
+
+	dict_table_close(user_table, FALSE);
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE
+@return	0 on success */
+static
+int
+i_s_fts_index_cache_init(
+/*=====================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_fts_index_cache_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_fts_index_fields_info;
+	schema->fill_table = i_s_fts_index_cache_fill;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_ft_index_cache =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_fts_index_fields_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_INDEX_CACHE"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS INDEX CACHED"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_index_cache_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/*******************************************************************//**
+Go through a FTS index auxiliary table, fetch its rows and fill
+FTS word cache structure.
+@return	DB_SUCCESS on success, otherwise error code */
+static
+ulint
+i_s_fts_index_table_fill_selected(
+/*==============================*/
+	dict_index_t*		index,		/*!< in: FTS index */
+	ib_vector_t*		words,		/*!< in/out: vector to hold
+						fetched words */
+	ulint			selected)	/*!< in: selected FTS index */
+{
+	pars_info_t*		info;
+	fts_table_t		fts_table;
+	trx_t*			trx;
+	que_t*			graph;
+	ulint			error;
+	fts_fetch_t		fetch;
+
+	info = pars_info_create();
+
+	fetch.read_arg = words;
+	fetch.read_record = fts_optimize_index_fetch_node;
+
+	trx = trx_allocate_for_background();
+
+	trx->op_info = "fetching FTS index nodes";
+
+	pars_info_bind_function(info, "my_func", fetch.read_record, &fetch);
+
+	FTS_INIT_INDEX_TABLE(&fts_table, fts_get_suffix(selected),
+			     FTS_INDEX_TABLE, index);
+
+	graph = fts_parse_sql(
+		&fts_table, info,
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR c IS"
+		" SELECT word, doc_count, first_doc_id, last_doc_id, "
+		"ilist\n"
+		" FROM %s;\n"
+		"BEGIN\n"
+		"\n"
+		"OPEN c;\n"
+		"WHILE 1 = 1 LOOP\n"
+		"  FETCH c INTO my_func();\n"
+		"  IF c % NOTFOUND THEN\n"
+		"    EXIT;\n"
+		"  END IF;\n"
+		"END LOOP;\n"
+		"CLOSE c;");
+
+	for(;;) {
+		error = fts_eval_sql(trx, graph);
+
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+
+			break;
+		} else {
+			fts_sql_rollback(trx);
+
+			ut_print_timestamp(stderr);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				fprintf(stderr, "  InnoDB: Warning: "
+					"lock wait timeout reading "
+					"FTS index.  Retrying!\n");
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				fprintf(stderr, "  InnoDB: Error: %lu "
+				"while reading FTS index.\n", error);
+				break;
+			}
+		}
+	}
+
+	mutex_enter(&dict_sys->mutex);
+	que_graph_free(graph);
+	mutex_exit(&dict_sys->mutex);
+
+	trx_free_for_background(trx);
+
+	return(error);
+}
+
+/*******************************************************************//**
+Go through a FTS index and its auxiliary tables, fetch rows in each table
+and fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_fill_one_index(
+/*===============================*/
+	dict_index_t*		index,		/*!< in: FTS index */
+	THD*			thd,		/*!< in: thread */
+	TABLE_LIST*		tables)		/*!< in/out: tables to fill */
+{
+	TABLE*			table = (TABLE*) tables->table;
+	Field**			fields;
+	ib_vector_t*		words;
+	mem_heap_t*		heap;
+	ulint			num_row_fill;
+
+	DBUG_ENTER("i_s_fts_index_cache_fill_one_index");
+
+	heap = mem_heap_create(1024);
+
+	words = ib_vector_create(ib_heap_allocator_create(heap),
+				 sizeof(fts_word_t), 256);
+
+	fields = table->field;
+
+	/* Iterate through each auxiliary table as described in
+	fts_index_selector */
+	for (ulint selected = 0; fts_index_selector[selected].value;
+	     selected++) {
+		i_s_fts_index_table_fill_selected(index, words, selected);
+	}
+
+	num_row_fill = ut_min(ib_vector_size(words), 500000);
+
+	/* Go through each word in the index cache */
+	for (ulint i = 0; i < num_row_fill; i++) {
+		fts_word_t*	word;
+
+		word = (fts_word_t*) ib_vector_get(words, i);
+
+		word->text.f_str[word->text.f_len] = 0;
+
+		/* Decrypt the ilist, and display Dod ID and word position */
+		for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+			fts_node_t*	node;
+			byte*		ptr;
+			ulint		decoded = 0;
+			doc_id_t	doc_id = 0;
+
+			node = static_cast<fts_node_t*> (ib_vector_get(
+				word->nodes, i));
+
+			ptr = node->ilist;
+
+			while (decoded < node->ilist_size) {
+				ulint	pos = fts_decode_vlc(&ptr);
+
+				doc_id += pos;
+
+				/* Get position info */
+				while (*ptr) {
+					pos = fts_decode_vlc(&ptr);
+
+					OK(field_store_string(
+						fields[I_S_FTS_WORD],
+						reinterpret_cast<const char*>
+						(word->text.f_str)));
+
+					OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+						(longlong) node->first_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+						(longlong) node->last_doc_id,
+						true));
+
+					OK(fields[I_S_FTS_DOC_COUNT]->store(
+						node->doc_count));
+
+					OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+						(longlong) doc_id, true));
+
+					OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+						pos));
+
+					OK(schema_table_store_record(
+						thd, table));
+				}
+
+				++ptr;
+
+				decoded = ptr - (byte*) node->ilist;
+			}
+		}
+	}
+
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_fill(
+/*=====================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	dict_table_t*		user_table;
+	dict_index_t*		index;
+
+	DBUG_ENTER("i_s_fts_index_table_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	user_table = dict_table_open_on_name_no_stats(
+			fts_internal_tbl_name, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	for (index = dict_table_get_first_index(user_table);
+	     index; index = dict_table_get_next_index(index)) {
+		if (index->type & DICT_FTS) {
+			i_s_fts_index_table_fill_one_index(index, thd, tables);
+		}
+	}
+
+	dict_table_close(user_table, FALSE);
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE
+@return	0 on success */
+static
+int
+i_s_fts_index_table_init(
+/*=====================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_fts_index_table_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_fts_index_fields_info;
+	schema->fill_table = i_s_fts_index_table_fill;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_ft_index_table =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_fts_index_fields_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_INDEX_TABLE"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS INDEX TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_index_table_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG */
+static ST_FIELD_INFO	i_s_fts_config_fields_info[] =
+{
+#define	FTS_CONFIG_KEY			0
+	{STRUCT_FLD(field_name,		"KEY"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	FTS_CONFIG_VALUE		1
+	{STRUCT_FLD(field_name,		"VALUE"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+static const char* fts_config_key[] = {
+	FTS_OPTIMIZE_LIMIT_IN_SECS,
+	FTS_SYNCED_DOC_ID,
+	FTS_LAST_OPTIMIZED_WORD,
+	FTS_TOTAL_DELETED_COUNT,
+	FTS_TOTAL_WORD_COUNT,
+	FTS_OPTIMIZE_START_TIME,
+	FTS_OPTIMIZE_END_TIME,
+	FTS_STOPWORD_TABLE_NAME,
+	FTS_USE_STOPWORD,
+	FTS_TABLE_STATE,
+        NULL
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG
+@return	0 on success, 1 on failure */
+static
+int
+i_s_fts_config_fill(
+/*================*/
+	THD*		thd,		/*!< in: thread */
+	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	Field**			fields;
+	TABLE*			table = (TABLE*) tables->table;
+	trx_t*			trx;
+	fts_table_t		fts_table;
+	dict_table_t*		user_table;
+	ulint			i = 0;
+	dict_index_t*		index = NULL;
+	unsigned char		str[FTS_MAX_CONFIG_VALUE_LEN + 1];
+
+	DBUG_ENTER("i_s_fts_config_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	if (!fts_internal_tbl_name) {
+		DBUG_RETURN(0);
+	}
+
+	fields = table->field;
+
+	user_table = dict_table_open_on_name_no_stats(
+			fts_internal_tbl_name, FALSE, DICT_ERR_IGNORE_NONE);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "Select for FTS DELETE TABLE";
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, user_table);
+
+	if (!ib_vector_is_empty(user_table->fts->indexes)) {
+		index = (dict_index_t*) ib_vector_getp_const(
+				user_table->fts->indexes, 0);
+	}
+
+	while (fts_config_key[i]) {
+		fts_string_t	value;
+		char*		key_name;
+		ulint		allocated = FALSE;
+
+		value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+
+		value.f_str = str;
+
+		if (strcmp(fts_config_key[i], FTS_TOTAL_WORD_COUNT) == 0
+		    && index) {
+			key_name = fts_config_create_index_param_name(
+					fts_config_key[i], index);
+			allocated = TRUE;
+		} else {
+			key_name = (char*) fts_config_key[i];
+		}
+
+		fts_config_get_value(trx, &fts_table, key_name, &value);
+
+		if (allocated) {
+			ut_free(key_name);
+		}
+
+		OK(field_store_string(
+                        fields[FTS_CONFIG_KEY], fts_config_key[i]));
+
+		OK(field_store_string(
+                        fields[FTS_CONFIG_VALUE], (const char*) value.f_str));
+
+		OK(schema_table_store_record(thd, table));
+
+		i++;
+	}
+
+	fts_sql_commit(trx);
+
+	trx_free_for_background(trx);
+
+	dict_table_close(user_table, FALSE);
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG
+@return	0 on success */
+static
+int
+i_s_fts_config_init(
+/*=================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_fts_config_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = i_s_fts_config_fields_info;
+	schema->fill_table = i_s_fts_config_fill;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_ft_config =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_fts_config_fields_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_FT_CONFIG"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "INNODB AUXILIARY FTS CONFIG TABLE"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_fts_config_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INNODB_BUFFER_POOL_STATS. */
+static ST_FIELD_INFO	i_s_innodb_buffer_stats_fields_info[] =
+{
+#define IDX_BUF_STATS_POOL_ID		0
+	{STRUCT_FLD(field_name,		"POOL_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_POOL_SIZE		1
+	{STRUCT_FLD(field_name,		"POOL_SIZE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_FREE_BUFFERS	2
+	{STRUCT_FLD(field_name,		"FREE_BUFFERS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_LRU_LEN		3
+	{STRUCT_FLD(field_name,		"DATABASE_PAGES"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_OLD_LRU_LEN	4
+	{STRUCT_FLD(field_name,		"OLD_DATABASE_PAGES"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_FLUSH_LIST_LEN	5
+	{STRUCT_FLD(field_name,		"MODIFIED_DATABASE_PAGES"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PENDING_ZIP	6
+	{STRUCT_FLD(field_name,		"PENDING_DECOMPRESS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PENDING_READ	7
+	{STRUCT_FLD(field_name,		"PENDING_READS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_FLUSH_LRU		8
+	{STRUCT_FLD(field_name,		"PENDING_FLUSH_LRU"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_FLUSH_LIST	9
+	{STRUCT_FLD(field_name,		"PENDING_FLUSH_LIST"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_YOUNG	10
+	{STRUCT_FLD(field_name,		"PAGES_MADE_YOUNG"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_NOT_YOUNG	11
+	{STRUCT_FLD(field_name,		"PAGES_NOT_MADE_YOUNG"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	IDX_BUF_STATS_PAGE_YOUNG_RATE	12
+	{STRUCT_FLD(field_name,		"PAGES_MADE_YOUNG_RATE"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_FLOAT),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE 13
+	{STRUCT_FLD(field_name,		"PAGES_MADE_NOT_YOUNG_RATE"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_FLOAT),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_READ		14
+	{STRUCT_FLD(field_name,		"NUMBER_PAGES_READ"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_CREATED	15
+	{STRUCT_FLD(field_name,		"NUMBER_PAGES_CREATED"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_WRITTEN	16
+	{STRUCT_FLD(field_name,		"NUMBER_PAGES_WRITTEN"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	IDX_BUF_STATS_PAGE_READ_RATE	17
+	{STRUCT_FLD(field_name,		"PAGES_READ_RATE"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_FLOAT),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	IDX_BUF_STATS_PAGE_CREATE_RATE	18
+	{STRUCT_FLD(field_name,		"PAGES_CREATE_RATE"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_FLOAT),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	IDX_BUF_STATS_PAGE_WRITTEN_RATE	19
+	{STRUCT_FLD(field_name,		"PAGES_WRITTEN_RATE"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_FLOAT),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_GET		20
+	{STRUCT_FLD(field_name,		"NUMBER_PAGES_GET"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_HIT_RATE		21
+	{STRUCT_FLD(field_name,		"HIT_RATE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_MADE_YOUNG_PCT	22
+	{STRUCT_FLD(field_name,		"YOUNG_MAKE_PER_THOUSAND_GETS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_NOT_MADE_YOUNG_PCT 23
+	{STRUCT_FLD(field_name,		"NOT_YOUNG_MAKE_PER_THOUSAND_GETS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_READ_AHREAD	24
+	{STRUCT_FLD(field_name,		"NUMBER_PAGES_READ_AHEAD"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_READ_AHEAD_EVICTED 25
+	{STRUCT_FLD(field_name,		"NUMBER_READ_AHEAD_EVICTED"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	IDX_BUF_STATS_READ_AHEAD_RATE	26
+	{STRUCT_FLD(field_name,		"READ_AHEAD_RATE"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_FLOAT),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define	IDX_BUF_STATS_READ_AHEAD_EVICT_RATE 27
+	{STRUCT_FLD(field_name,		"READ_AHEAD_EVICTED_RATE"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_FLOAT),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_LRU_IO_SUM	28
+	{STRUCT_FLD(field_name,		"LRU_IO_TOTAL"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_LRU_IO_CUR	29
+	{STRUCT_FLD(field_name,		"LRU_IO_CURRENT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_UNZIP_SUM		30
+	{STRUCT_FLD(field_name,		"UNCOMPRESS_TOTAL"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_UNZIP_CUR		31
+	{STRUCT_FLD(field_name,		"UNCOMPRESS_CURRENT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill Information Schema table INNODB_BUFFER_POOL_STATS for a particular
+buffer pool
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_stats_fill(
+/*==================*/
+	THD*			thd,		/*!< in: thread */
+	TABLE_LIST*		tables,		/*!< in/out: tables to fill */
+	const buf_pool_info_t*	info)		/*!< in: buffer pool
+						information */
+{
+	TABLE*			table;
+	Field**			fields;
+
+	DBUG_ENTER("i_s_innodb_stats_fill");
+
+	table = tables->table;
+
+	fields = table->field;
+
+	OK(fields[IDX_BUF_STATS_POOL_ID]->store(info->pool_unique_id));
+
+	OK(fields[IDX_BUF_STATS_POOL_SIZE]->store(info->pool_size));
+
+	OK(fields[IDX_BUF_STATS_LRU_LEN]->store(info->lru_len));
+
+	OK(fields[IDX_BUF_STATS_OLD_LRU_LEN]->store(info->old_lru_len));
+
+	OK(fields[IDX_BUF_STATS_FREE_BUFFERS]->store(info->free_list_len));
+
+	OK(fields[IDX_BUF_STATS_FLUSH_LIST_LEN]->store(
+		info->flush_list_len));
+
+	OK(fields[IDX_BUF_STATS_PENDING_ZIP]->store(info->n_pend_unzip));
+
+	OK(fields[IDX_BUF_STATS_PENDING_READ]->store(info->n_pend_reads));
+
+	OK(fields[IDX_BUF_STATS_FLUSH_LRU]->store(info->n_pending_flush_lru));
+
+	OK(fields[IDX_BUF_STATS_FLUSH_LIST]->store(info->n_pending_flush_list));
+
+	OK(fields[IDX_BUF_STATS_PAGE_YOUNG]->store(info->n_pages_made_young));
+
+	OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG]->store(
+		info->n_pages_not_made_young));
+
+	OK(fields[IDX_BUF_STATS_PAGE_YOUNG_RATE]->store(
+		info->page_made_young_rate));
+
+	OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE]->store(
+		info->page_not_made_young_rate));
+
+	OK(fields[IDX_BUF_STATS_PAGE_READ]->store(info->n_pages_read));
+
+	OK(fields[IDX_BUF_STATS_PAGE_CREATED]->store(info->n_pages_created));
+
+	OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store(info->n_pages_written));
+
+	OK(fields[IDX_BUF_STATS_PAGE_READ_RATE]->store(info->pages_read_rate));
+
+	OK(fields[IDX_BUF_STATS_PAGE_CREATED]->store(info->pages_created_rate));
+
+	OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store(info->pages_written_rate));
+
+	if (info->n_page_get_delta) {
+		OK(fields[IDX_BUF_STATS_HIT_RATE]->store(
+			1000 - (1000 * info->page_read_delta
+				/ info->n_page_get_delta)));
+
+		OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(
+			1000 * info->young_making_delta
+			/ info->n_page_get_delta));
+
+		OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store(
+			1000 * info->not_young_making_delta
+			/ info->n_page_get_delta));
+	} else {
+		OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0));
+		OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(0));
+		OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store(0));
+	}
+
+	OK(fields[IDX_BUF_STATS_READ_AHREAD]->store(info->n_ra_pages_read));
+
+	OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICTED]->store(
+		info->n_ra_pages_evicted));
+
+	OK(fields[IDX_BUF_STATS_READ_AHEAD_RATE]->store(
+		info->pages_readahead_rate));
+
+	OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICT_RATE]->store(
+		info->pages_evicted_rate));
+
+	OK(fields[IDX_BUF_STATS_LRU_IO_SUM]->store(info->io_sum));
+
+	OK(fields[IDX_BUF_STATS_LRU_IO_CUR]->store(info->io_cur));
+
+	OK(fields[IDX_BUF_STATS_UNZIP_SUM]->store(info->unzip_sum));
+
+	OK(fields[IDX_BUF_STATS_UNZIP_CUR]->store( info->unzip_cur));
+
+	DBUG_RETURN(schema_table_store_record(thd, table));
+}
+
+/*******************************************************************//**
+This is the function that loops through each buffer pool and fetch buffer
+pool stats to information schema  table: I_S_INNODB_BUFFER_POOL_STATS
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_stats_fill_table(
+/*===============================*/
+	THD*		thd,		/*!< in: thread */
+	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
+	Item*		)		/*!< in: condition (ignored) */
+{
+	int			status	= 0;
+	buf_pool_info_t*	pool_info;
+
+	DBUG_ENTER("i_s_innodb_buffer_fill_general");
+
+	/* Only allow the PROCESS privilege holder to access the stats */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	pool_info = (buf_pool_info_t*) mem_zalloc(
+		srv_buf_pool_instances *  sizeof *pool_info);
+
+	/* Walk through each buffer pool */
+	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*		buf_pool;
+
+		buf_pool = buf_pool_from_array(i);
+
+		/* Fetch individual buffer pool info */
+		buf_stats_get_pool_info(buf_pool, i, pool_info);
+
+		status = i_s_innodb_stats_fill(thd, tables, &pool_info[i]);
+
+		/* If something goes wrong, break and return */
+		if (status) {
+			break;
+		}
+	}
+
+	mem_free(pool_info);
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_pool_stats_init(
+/*==============================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("i_s_innodb_buffer_pool_stats_init");
+
+	schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
+
+	schema->fields_info = i_s_innodb_buffer_stats_fields_info;
+	schema->fill_table = i_s_innodb_buffer_stats_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_buffer_stats =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_BUFFER_POOL_STATS"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB Buffer Pool Statistics Information "),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_innodb_buffer_pool_stats_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INNODB_BUFFER_POOL_PAGE. */
+static ST_FIELD_INFO	i_s_innodb_buffer_page_fields_info[] =
+{
+#define IDX_BUFFER_POOL_ID		0
+	{STRUCT_FLD(field_name,		"POOL_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_BLOCK_ID		1
+	{STRUCT_FLD(field_name,		"BLOCK_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_SPACE		2
+	{STRUCT_FLD(field_name,		"SPACE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_NUM		3
+	{STRUCT_FLD(field_name,		"PAGE_NUMBER"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_TYPE		4
+	{STRUCT_FLD(field_name,		"PAGE_TYPE"),
+	 STRUCT_FLD(field_length,	64),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_FLUSH_TYPE	5
+	{STRUCT_FLD(field_name,		"FLUSH_TYPE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_FIX_COUNT	6
+	{STRUCT_FLD(field_name,		"FIX_COUNT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_HASHED		7
+	{STRUCT_FLD(field_name,		"IS_HASHED"),
+	 STRUCT_FLD(field_length,	3),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_NEWEST_MOD	8
+	{STRUCT_FLD(field_name,		"NEWEST_MODIFICATION"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_OLDEST_MOD	9
+	{STRUCT_FLD(field_name,		"OLDEST_MODIFICATION"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_ACCESS_TIME	10
+	{STRUCT_FLD(field_name,		"ACCESS_TIME"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_TABLE_NAME	11
+	{STRUCT_FLD(field_name,		"TABLE_NAME"),
+	 STRUCT_FLD(field_length,	1024),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_INDEX_NAME	12
+	{STRUCT_FLD(field_name,		"INDEX_NAME"),
+	 STRUCT_FLD(field_length,	1024),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_NUM_RECS	13
+	{STRUCT_FLD(field_name,		"NUMBER_RECORDS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_DATA_SIZE	14
+	{STRUCT_FLD(field_name,		"DATA_SIZE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_ZIP_SIZE	15
+	{STRUCT_FLD(field_name,		"COMPRESSED_SIZE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_STATE		16
+	{STRUCT_FLD(field_name,		"PAGE_STATE"),
+	 STRUCT_FLD(field_length,	64),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_IO_FIX		17
+	{STRUCT_FLD(field_name,		"IO_FIX"),
+	 STRUCT_FLD(field_length,	64),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_IS_OLD		18
+	{STRUCT_FLD(field_name,		"IS_OLD"),
+	 STRUCT_FLD(field_length,	3),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_FREE_CLOCK	19
+	{STRUCT_FLD(field_name,		"FREE_PAGE_CLOCK"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill Information Schema table INNODB_BUFFER_PAGE with information
+cached in the buf_page_info_t array
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_page_fill(
+/*========================*/
+	THD*			thd,		/*!< in: thread */
+	TABLE_LIST*		tables,		/*!< in/out: tables to fill */
+	const buf_page_info_t*	info_array,	/*!< in: array cached page
+						info */
+	ulint			num_page,	/*!< in: number of page info
+						 cached */
+	mem_heap_t*		heap)		/*!< in: temp heap memory */
+{
+	TABLE*			table;
+	Field**			fields;
+
+	DBUG_ENTER("i_s_innodb_buffer_page_fill");
+
+	table = tables->table;
+
+	fields = table->field;
+
+	/* Iterate through the cached array and fill the I_S table rows */
+	for (ulint i = 0; i < num_page; i++) {
+		const buf_page_info_t*	page_info;
+		const char*		table_name;
+		const char*		index_name;
+		const char*		state_str;
+		enum buf_page_state	state;
+
+		page_info = info_array + i;
+
+		table_name = NULL;
+		index_name = NULL;
+		state_str = NULL;
+
+		OK(fields[IDX_BUFFER_POOL_ID]->store(page_info->pool_id));
+
+		OK(fields[IDX_BUFFER_BLOCK_ID]->store(page_info->block_id));
+
+		OK(fields[IDX_BUFFER_PAGE_SPACE]->store(page_info->space_id));
+
+		OK(fields[IDX_BUFFER_PAGE_NUM]->store(page_info->page_num));
+
+		OK(field_store_string(
+			fields[IDX_BUFFER_PAGE_TYPE],
+			i_s_page_type[page_info->page_type].type_str));
+
+		OK(fields[IDX_BUFFER_PAGE_FLUSH_TYPE]->store(
+			page_info->flush_type));
+
+		OK(fields[IDX_BUFFER_PAGE_FIX_COUNT]->store(
+			page_info->fix_count));
+
+		if (page_info->hashed) {
+			OK(field_store_string(
+				fields[IDX_BUFFER_PAGE_HASHED], "YES"));
+		} else {
+			OK(field_store_string(
+				fields[IDX_BUFFER_PAGE_HASHED], "NO"));
+		}
+
+		OK(fields[IDX_BUFFER_PAGE_NEWEST_MOD]->store(
+			(longlong) page_info->newest_mod, true));
+
+		OK(fields[IDX_BUFFER_PAGE_OLDEST_MOD]->store(
+			(longlong) page_info->oldest_mod, true));
+
+		OK(fields[IDX_BUFFER_PAGE_ACCESS_TIME]->store(
+			page_info->access_time));
+
+		/* If this is an index page, fetch the index name
+		and table name */
+		if (page_info->page_type == I_S_PAGE_TYPE_INDEX) {
+			const dict_index_t*	index;
+
+			mutex_enter(&dict_sys->mutex);
+			index = dict_index_get_if_in_cache_low(
+				page_info->index_id);
+
+			/* Copy the index/table name under mutex. We
+			do not want to hold the InnoDB mutex while
+			filling the IS table */
+			if (index) {
+				const char*	name_ptr = index->name;
+
+				if (name_ptr[0] == TEMP_INDEX_PREFIX) {
+					name_ptr++;
+				}
+
+				index_name = mem_heap_strdup(heap, name_ptr);
+
+				table_name = mem_heap_strdup(heap,
+							     index->table_name);
+
+			}
+
+			mutex_exit(&dict_sys->mutex);
+		}
+
+		OK(field_store_string(
+			fields[IDX_BUFFER_PAGE_TABLE_NAME], table_name));
+
+		OK(field_store_string(
+			fields[IDX_BUFFER_PAGE_INDEX_NAME], index_name));
+
+		OK(fields[IDX_BUFFER_PAGE_NUM_RECS]->store(
+			page_info->num_recs));
+
+		OK(fields[IDX_BUFFER_PAGE_DATA_SIZE]->store(
+			page_info->data_size));
+
+		OK(fields[IDX_BUFFER_PAGE_ZIP_SIZE]->store(
+			page_info->zip_ssize
+			? (UNIV_ZIP_SIZE_MIN >> 1) << page_info->zip_ssize
+			: 0));
+
+#if BUF_PAGE_STATE_BITS > 3
+# error "BUF_PAGE_STATE_BITS > 3, please ensure that all 1<<BUF_PAGE_STATE_BITS values are checked for"
+#endif
+		state = static_cast<enum buf_page_state>(page_info->page_state);
+
+		switch (state) {
+		/* First three states are for compression pages and
+		are not states we would get as we scan pages through
+		buffer blocks */
+		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_ZIP_PAGE:
+		case BUF_BLOCK_ZIP_DIRTY:
+			state_str = NULL;
+			break;
+		case BUF_BLOCK_NOT_USED:
+			state_str = "NOT_USED";
+			break;
+		case BUF_BLOCK_READY_FOR_USE:
+			state_str = "READY_FOR_USE";
+			break;
+		case BUF_BLOCK_FILE_PAGE:
+			state_str = "FILE_PAGE";
+			break;
+		case BUF_BLOCK_MEMORY:
+			state_str = "MEMORY";
+			break;
+		case BUF_BLOCK_REMOVE_HASH:
+			state_str = "REMOVE_HASH";
+			break;
+		};
+
+		OK(field_store_string(fields[IDX_BUFFER_PAGE_STATE],
+				      state_str));
+
+		switch (page_info->io_fix) {
+		case BUF_IO_NONE:
+			OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+					      "IO_NONE"));
+			break;
+		case BUF_IO_READ:
+			OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+					      "IO_READ"));
+			break;
+		case BUF_IO_WRITE:
+			OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+					      "IO_WRITE"));
+			break;
+		case BUF_IO_PIN:
+			OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+					      "IO_PIN"));
+			break;
+		}
+
+		OK(field_store_string(fields[IDX_BUFFER_PAGE_IS_OLD],
+				      (page_info->is_old) ? "YES" : "NO"));
+
+		OK(fields[IDX_BUFFER_PAGE_FREE_CLOCK]->store(
+			page_info->freed_page_clock));
+
+		if (schema_table_store_record(thd, table)) {
+			DBUG_RETURN(1);
+		}
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Set appropriate page type to a buf_page_info_t structure */
+static
+void
+i_s_innodb_set_page_type(
+/*=====================*/
+	buf_page_info_t*page_info,	/*!< in/out: structure to fill with
+					scanned info */
+	ulint		page_type,	/*!< in: page type */
+	const byte*	frame)		/*!< in: buffer frame */
+{
+	if (page_type == FIL_PAGE_INDEX) {
+		const page_t*	page = (const page_t*) frame;
+
+		/* FIL_PAGE_INDEX is a bit special, its value
+		is defined as 17855, so we cannot use FIL_PAGE_INDEX
+		to index into i_s_page_type[] array, its array index
+		in the i_s_page_type[] array is I_S_PAGE_TYPE_INDEX
+		(1) */
+		page_info->page_type = I_S_PAGE_TYPE_INDEX;
+
+		page_info->index_id = btr_page_get_index_id(page);
+
+		page_info->data_size = (ulint)(page_header_get_field(
+			page, PAGE_HEAP_TOP) - (page_is_comp(page)
+						? PAGE_NEW_SUPREMUM_END
+						: PAGE_OLD_SUPREMUM_END)
+			- page_header_get_field(page, PAGE_GARBAGE));
+
+		page_info->num_recs = page_get_n_recs(page);
+	} else if (page_type >= I_S_PAGE_TYPE_UNKNOWN) {
+		/* Encountered an unknown page type */
+		page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+	} else {
+		/* Make sure we get the righ index into the
+		i_s_page_type[] array */
+		ut_a(page_type == i_s_page_type[page_type].type_value);
+
+		page_info->page_type = page_type;
+	}
+
+	if (page_info->page_type == FIL_PAGE_TYPE_ZBLOB
+	    || page_info->page_type == FIL_PAGE_TYPE_ZBLOB2) {
+		page_info->page_num = mach_read_from_4(
+			frame + FIL_PAGE_OFFSET);
+		page_info->space_id = mach_read_from_4(
+			frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	}
+}
+/*******************************************************************//**
+Scans pages in the buffer cache, and collect their general information
+into the buf_page_info_t array which is zero-filled. So any fields
+that are not initialized in the function will default to 0 */
+static
+void
+i_s_innodb_buffer_page_get_info(
+/*============================*/
+	const buf_page_t*bpage,		/*!< in: buffer pool page to scan */
+	ulint		pool_id,	/*!< in: buffer pool id */
+	ulint		pos,		/*!< in: buffer block position in
+					buffer pool or in the LRU list */
+	buf_page_info_t*page_info)	/*!< in: zero filled info structure;
+					out: structure filled with scanned
+					info */
+{
+	ut_ad(pool_id < MAX_BUFFER_POOLS);
+
+	page_info->pool_id = pool_id;
+
+	page_info->block_id = pos;
+
+	page_info->page_state = buf_page_get_state(bpage);
+
+	/* Only fetch information for buffers that map to a tablespace,
+	that is, buffer page with state BUF_BLOCK_ZIP_PAGE,
+	BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_FILE_PAGE */
+	if (buf_page_in_file(bpage)) {
+		const byte*	frame;
+		ulint		page_type;
+
+		page_info->space_id = buf_page_get_space(bpage);
+
+		page_info->page_num = buf_page_get_page_no(bpage);
+
+		page_info->flush_type = bpage->flush_type;
+
+		page_info->fix_count = bpage->buf_fix_count;
+
+		page_info->newest_mod = bpage->newest_modification;
+
+		page_info->oldest_mod = bpage->oldest_modification;
+
+		page_info->access_time = bpage->access_time;
+
+		page_info->zip_ssize = bpage->zip.ssize;
+
+		page_info->io_fix = bpage->io_fix;
+
+		page_info->is_old = bpage->old;
+
+		page_info->freed_page_clock = bpage->freed_page_clock;
+
+		if (page_info->page_state == BUF_BLOCK_FILE_PAGE) {
+			const buf_block_t*block;
+
+			block = reinterpret_cast<const buf_block_t*>(bpage);
+			frame = block->frame;
+			page_info->hashed = (block->index != NULL);
+		} else {
+			ut_ad(page_info->zip_ssize);
+			frame = bpage->zip.data;
+		}
+
+		page_type = fil_page_get_type(frame);
+
+		i_s_innodb_set_page_type(page_info, page_type, frame);
+	} else {
+		page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+	}
+}
+
+/*******************************************************************//**
+This is the function that goes through each block of the buffer pool
+and fetch information to information schema tables: INNODB_BUFFER_PAGE.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_fill_buffer_pool(
+/*========================*/
+	THD*			thd,		/*!< in: thread */
+	TABLE_LIST*		tables,		/*!< in/out: tables to fill */
+	buf_pool_t*		buf_pool,	/*!< in: buffer pool to scan */
+	const ulint		pool_id)	/*!< in: buffer pool id */
+{
+	int			status	= 0;
+	mem_heap_t*		heap;
+
+	DBUG_ENTER("i_s_innodb_fill_buffer_pool");
+
+	heap = mem_heap_create(10000);
+
+	/* Go through each chunk of buffer pool. Currently, we only
+	have one single chunk for each buffer pool */
+	for (ulint n = 0; n < buf_pool->n_chunks; n++) {
+		const buf_block_t*	block;
+		ulint			n_blocks;
+		buf_page_info_t*	info_buffer;
+		ulint			num_page;
+		ulint			mem_size;
+		ulint			chunk_size;
+		ulint			num_to_process = 0;
+		ulint			block_id = 0;
+
+		/* Get buffer block of the nth chunk */
+		block = buf_get_nth_chunk_block(buf_pool, n, &chunk_size);
+		num_page = 0;
+
+		while (chunk_size > 0) {
+			/* we cache maximum MAX_BUF_INFO_CACHED number of
+			buffer page info */
+			num_to_process = ut_min(chunk_size,
+						MAX_BUF_INFO_CACHED);
+
+			mem_size = num_to_process * sizeof(buf_page_info_t);
+
+			/* For each chunk, we'll pre-allocate information
+			structures to cache the page information read from
+			the buffer pool. Doing so before obtain any mutex */
+			info_buffer = (buf_page_info_t*) mem_heap_zalloc(
+				heap, mem_size);
+
+			/* Obtain appropriate mutexes. Since this is diagnostic
+			buffer pool info printout, we are not required to
+			preserve the overall consistency, so we can
+			release mutex periodically */
+			buf_pool_mutex_enter(buf_pool);
+
+			/* GO through each block in the chunk */
+			for (n_blocks = num_to_process; n_blocks--; block++) {
+				i_s_innodb_buffer_page_get_info(
+					&block->page, pool_id, block_id,
+					info_buffer + num_page);
+				block_id++;
+				num_page++;
+			}
+
+			buf_pool_mutex_exit(buf_pool);
+
+			/* Fill in information schema table with information
+			just collected from the buffer chunk scan */
+			status = i_s_innodb_buffer_page_fill(
+				thd, tables, info_buffer,
+				num_page, heap);
+
+			/* If something goes wrong, break and return */
+			if (status) {
+				break;
+			}
+
+			mem_heap_empty(heap);
+			chunk_size -= num_to_process;
+			num_page = 0;
+		}
+	}
+
+	mem_heap_free(heap);
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Fill page information for pages in InnoDB buffer pool to the
+dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_page_fill_table(
+/*==============================*/
+	THD*		thd,		/*!< in: thread */
+	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
+	Item*		)		/*!< in: condition (ignored) */
+{
+	int	status	= 0;
+
+	DBUG_ENTER("i_s_innodb_buffer_page_fill_table");
+
+	/* deny access to user without PROCESS privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	/* Walk through each buffer pool */
+	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool;
+
+		buf_pool = buf_pool_from_array(i);
+
+		/* Fetch information from pages in this buffer pool,
+		and fill the corresponding I_S table */
+		status = i_s_innodb_fill_buffer_pool(thd, tables, buf_pool, i);
+
+		/* If something wrong, break and return */
+		if (status) {
+			break;
+		}
+	}
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_page_init(
+/*========================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("i_s_innodb_buffer_page_init");
+
+	schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
+
+	schema->fields_info = i_s_innodb_buffer_page_fields_info;
+	schema->fill_table = i_s_innodb_buffer_page_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_buffer_page =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_BUFFER_PAGE"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB Buffer Page Information"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_innodb_buffer_page_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+static ST_FIELD_INFO	i_s_innodb_buf_page_lru_fields_info[] =
+{
+#define IDX_BUF_LRU_POOL_ID		0
+	{STRUCT_FLD(field_name,		"POOL_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_POS			1
+	{STRUCT_FLD(field_name,		"LRU_POSITION"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_SPACE		2
+	{STRUCT_FLD(field_name,		"SPACE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_NUM		3
+	{STRUCT_FLD(field_name,		"PAGE_NUMBER"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_TYPE		4
+	{STRUCT_FLD(field_name,		"PAGE_TYPE"),
+	 STRUCT_FLD(field_length,	64),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_FLUSH_TYPE	5
+	{STRUCT_FLD(field_name,		"FLUSH_TYPE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_FIX_COUNT	6
+	{STRUCT_FLD(field_name,		"FIX_COUNT"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_HASHED		7
+	{STRUCT_FLD(field_name,		"IS_HASHED"),
+	 STRUCT_FLD(field_length,	3),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_NEWEST_MOD	8
+	{STRUCT_FLD(field_name,		"NEWEST_MODIFICATION"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_OLDEST_MOD	9
+	{STRUCT_FLD(field_name,		"OLDEST_MODIFICATION"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_ACCESS_TIME	10
+	{STRUCT_FLD(field_name,		"ACCESS_TIME"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_TABLE_NAME	11
+	{STRUCT_FLD(field_name,		"TABLE_NAME"),
+	 STRUCT_FLD(field_length,	1024),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_INDEX_NAME	12
+	{STRUCT_FLD(field_name,		"INDEX_NAME"),
+	 STRUCT_FLD(field_length,	1024),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_NUM_RECS	13
+	{STRUCT_FLD(field_name,		"NUMBER_RECORDS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_DATA_SIZE	14
+	{STRUCT_FLD(field_name,		"DATA_SIZE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_ZIP_SIZE	15
+	{STRUCT_FLD(field_name,		"COMPRESSED_SIZE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_STATE		16
+	{STRUCT_FLD(field_name,		"COMPRESSED"),
+	 STRUCT_FLD(field_length,	3),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_IO_FIX		17
+	{STRUCT_FLD(field_name,		"IO_FIX"),
+	 STRUCT_FLD(field_length,	64),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_IS_OLD		18
+	{STRUCT_FLD(field_name,		"IS_OLD"),
+	 STRUCT_FLD(field_length,	3),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_FREE_CLOCK	19
+	{STRUCT_FLD(field_name,		"FREE_PAGE_CLOCK"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill Information Schema table INNODB_BUFFER_PAGE_LRU with information
+cached in the buf_page_info_t array
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_buf_page_lru_fill(
+/*=========================*/
+	THD*			thd,		/*!< in: thread */
+	TABLE_LIST*		tables,		/*!< in/out: tables to fill */
+	const buf_page_info_t*	info_array,	/*!< in: array cached page
+						info */
+	ulint			num_page)	/*!< in: number of page info
+						 cached */
+{
+	TABLE*			table;
+	Field**			fields;
+	mem_heap_t*		heap;
+
+	DBUG_ENTER("i_s_innodb_buf_page_lru_fill");
+
+	table = tables->table;
+
+	fields = table->field;
+
+	heap = mem_heap_create(1000);
+
+	/* Iterate through the cached array and fill the I_S table rows */
+	for (ulint i = 0; i < num_page; i++) {
+		const buf_page_info_t*	page_info;
+		const char*		table_name;
+		const char*		index_name;
+		const char*		state_str;
+		enum buf_page_state	state;
+
+		table_name = NULL;
+		index_name = NULL;
+		state_str = NULL;
+
+		page_info = info_array + i;
+
+		OK(fields[IDX_BUF_LRU_POOL_ID]->store(page_info->pool_id));
+
+		OK(fields[IDX_BUF_LRU_POS]->store(page_info->block_id));
+
+		OK(fields[IDX_BUF_LRU_PAGE_SPACE]->store(page_info->space_id));
+
+		OK(fields[IDX_BUF_LRU_PAGE_NUM]->store(page_info->page_num));
+
+		OK(field_store_string(
+			fields[IDX_BUF_LRU_PAGE_TYPE],
+			i_s_page_type[page_info->page_type].type_str));
+
+		OK(fields[IDX_BUF_LRU_PAGE_FLUSH_TYPE]->store(
+			page_info->flush_type));
+
+		OK(fields[IDX_BUF_LRU_PAGE_FIX_COUNT]->store(
+			page_info->fix_count));
+
+		if (page_info->hashed) {
+			OK(field_store_string(
+				fields[IDX_BUF_LRU_PAGE_HASHED], "YES"));
+		} else {
+			OK(field_store_string(
+				fields[IDX_BUF_LRU_PAGE_HASHED], "NO"));
+		}
+
+		OK(fields[IDX_BUF_LRU_PAGE_NEWEST_MOD]->store(
+			page_info->newest_mod, true));
+
+		OK(fields[IDX_BUF_LRU_PAGE_OLDEST_MOD]->store(
+			page_info->oldest_mod, true));
+
+		OK(fields[IDX_BUF_LRU_PAGE_ACCESS_TIME]->store(
+			page_info->access_time));
+
+		/* If this is an index page, fetch the index name
+		and table name */
+		if (page_info->page_type == I_S_PAGE_TYPE_INDEX) {
+			const dict_index_t*	index;
+
+			mutex_enter(&dict_sys->mutex);
+			index = dict_index_get_if_in_cache_low(
+				page_info->index_id);
+
+			/* Copy the index/table name under mutex. We
+			do not want to hold the InnoDB mutex while
+			filling the IS table */
+			if (index) {
+				const char*	name_ptr = index->name;
+
+				if (name_ptr[0] == TEMP_INDEX_PREFIX) {
+					name_ptr++;
+				}
+
+				index_name = mem_heap_strdup(heap, name_ptr);
+
+				table_name = mem_heap_strdup(heap,
+							     index->table_name);
+			}
+
+			mutex_exit(&dict_sys->mutex);
+		}
+
+		OK(field_store_string(
+			fields[IDX_BUF_LRU_PAGE_TABLE_NAME], table_name));
+
+		OK(field_store_string(
+			fields[IDX_BUF_LRU_PAGE_INDEX_NAME], index_name));
+		OK(fields[IDX_BUF_LRU_PAGE_NUM_RECS]->store(
+			page_info->num_recs));
+
+		OK(fields[IDX_BUF_LRU_PAGE_DATA_SIZE]->store(
+			page_info->data_size));
+
+		OK(fields[IDX_BUF_LRU_PAGE_ZIP_SIZE]->store(
+			page_info->zip_ssize ?
+				 512 << page_info->zip_ssize : 0));
+
+		state = static_cast<enum buf_page_state>(page_info->page_state);
+
+		switch (state) {
+		/* Compressed page */
+		case BUF_BLOCK_ZIP_PAGE:
+		case BUF_BLOCK_ZIP_DIRTY:
+			state_str = "YES";
+			break;
+		/* Uncompressed page */
+		case BUF_BLOCK_FILE_PAGE:
+			state_str = "NO";
+			break;
+		/* We should not see following states */
+		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_READY_FOR_USE:
+		case BUF_BLOCK_NOT_USED:
+		case BUF_BLOCK_MEMORY:
+		case BUF_BLOCK_REMOVE_HASH:
+			state_str = NULL;
+			break;
+		};
+
+		OK(field_store_string(fields[IDX_BUF_LRU_PAGE_STATE],
+				      state_str));
+
+		switch (page_info->io_fix) {
+		case BUF_IO_NONE:
+			OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IO_FIX],
+					      "IO_NONE"));
+			break;
+		case BUF_IO_READ:
+			OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IO_FIX],
+					      "IO_READ"));
+			break;
+		case BUF_IO_WRITE:
+			OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IO_FIX],
+					      "IO_WRITE"));
+			break;
+		}
+
+		OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IS_OLD],
+				      (page_info->is_old) ? "YES" : "NO"));
+
+		OK(fields[IDX_BUF_LRU_PAGE_FREE_CLOCK]->store(
+			page_info->freed_page_clock));
+
+		if (schema_table_store_record(thd, table)) {
+			mem_heap_free(heap);
+			DBUG_RETURN(1);
+		}
+
+		mem_heap_empty(heap);
+	}
+
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+This is the function that goes through buffer pool's LRU list
+and fetch information to INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_fill_buffer_lru(
+/*=======================*/
+	THD*			thd,		/*!< in: thread */
+	TABLE_LIST*		tables,		/*!< in/out: tables to fill */
+	buf_pool_t*		buf_pool,	/*!< in: buffer pool to scan */
+	const ulint		pool_id)	/*!< in: buffer pool id */
+{
+	int			status = 0;
+	buf_page_info_t*	info_buffer;
+	ulint			lru_pos = 0;
+	const buf_page_t*	bpage;
+	ulint			lru_len;
+
+	DBUG_ENTER("i_s_innodb_fill_buffer_lru");
+
+	/* Obtain buf_pool mutex before allocate info_buffer, since
+	UT_LIST_GET_LEN(buf_pool->LRU) could change */
+	buf_pool_mutex_enter(buf_pool);
+
+	lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+	/* Print error message if malloc fail */
+	info_buffer = (buf_page_info_t*) my_malloc(
+		lru_len * sizeof *info_buffer, MYF(MY_WME));
+
+	if (!info_buffer) {
+		status = 1;
+		goto exit;
+	}
+
+	memset(info_buffer, 0, lru_len * sizeof *info_buffer);
+
+	/* Walk through Pool's LRU list and print the buffer page
+	information */
+	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+	while (bpage != NULL) {
+		/* Use the same function that collect buffer info for
+		INNODB_BUFFER_PAGE to get buffer page info */
+		i_s_innodb_buffer_page_get_info(bpage, pool_id, lru_pos,
+						(info_buffer + lru_pos));
+
+		bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+		lru_pos++;
+	}
+
+	ut_ad(lru_pos == lru_len);
+	ut_ad(lru_pos == UT_LIST_GET_LEN(buf_pool->LRU));
+
+exit:
+	buf_pool_mutex_exit(buf_pool);
+
+	if (info_buffer) {
+		status = i_s_innodb_buf_page_lru_fill(
+			thd, tables, info_buffer, lru_len);
+
+		my_free(info_buffer);
+	}
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Fill page information for pages in InnoDB buffer pool to the
+dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_buf_page_lru_fill_table(
+/*===============================*/
+	THD*		thd,		/*!< in: thread */
+	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
+	Item*		)		/*!< in: condition (ignored) */
+{
+	int	status	= 0;
+
+	DBUG_ENTER("i_s_innodb_buf_page_lru_fill_table");
+
+	/* deny access to any users that do not hold PROCESS_ACL */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	/* Walk through each buffer pool */
+	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool;
+
+		buf_pool = buf_pool_from_array(i);
+
+		/* Fetch information from pages in this buffer pool's LRU list,
+		and fill the corresponding I_S table */
+		status = i_s_innodb_fill_buffer_lru(thd, tables, buf_pool, i);
+
+		/* If something wrong, break and return */
+		if (status) {
+			break;
+		}
+	}
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_page_lru_init(
+/*============================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("i_s_innodb_buffer_page_lru_init");
+
+	schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
+
+	schema->fields_info = i_s_innodb_buf_page_lru_fields_info;
+	schema->fill_table = i_s_innodb_buf_page_lru_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_buffer_page_lru =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
 	/* void* */
-	STRUCT_FLD(__reserved1, NULL),
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_BUFFER_PAGE_LRU"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB Buffer Page in LRU"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
 
-	/* Plugin flags */
-	/* unsigned long */
-	STRUCT_FLD(flags, 0UL),
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_innodb_buffer_page_lru_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
 };
 
 /*******************************************************************//**
@@ -1807,3 +5436,1637 @@ i_s_common_deinit(
 
 	DBUG_RETURN(0);
 }
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLES */
+static ST_FIELD_INFO	innodb_sys_tables_fields_info[] =
+{
+#define SYS_TABLE_ID		0
+	{STRUCT_FLD(field_name,		"TABLE_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLE_NAME		1
+	{STRUCT_FLD(field_name,		"NAME"),
+	 STRUCT_FLD(field_length,	MAX_FULL_NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLE_FLAG		2
+	{STRUCT_FLD(field_name,		"FLAG"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLE_NUM_COLUMN	3
+	{STRUCT_FLD(field_name,		"N_COLS"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLE_SPACE		4
+	{STRUCT_FLD(field_name,		"SPACE"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Populate information_schema.innodb_sys_tables table with information
+from SYS_TABLES.
+@return	0 on success */
+static
+int
+i_s_dict_fill_sys_tables(
+/*=====================*/
+	THD*		thd,		/*!< in: thread */
+	dict_table_t*	table,		/*!< in: table */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	Field**		fields;
+
+	DBUG_ENTER("i_s_dict_fill_sys_tables");
+
+	fields = table_to_fill->field;
+
+	OK(fields[SYS_TABLE_ID]->store(longlong(table->id), TRUE));
+
+	OK(field_store_string(fields[SYS_TABLE_NAME], table->name));
+
+	OK(fields[SYS_TABLE_FLAG]->store(table->flags));
+
+	OK(fields[SYS_TABLE_NUM_COLUMN]->store(table->n_cols));
+
+	OK(fields[SYS_TABLE_SPACE]->store(table->space));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in SYS_TABLES table, and fill the
+information_schema.innodb_sys_tables table with related table information
+@return 0 on success */
+static
+int
+i_s_sys_tables_fill_table(
+/*======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_sys_tables_fill_table");
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	mutex_enter(&(dict_sys->mutex));
+	mtr_start(&mtr);
+
+	rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+
+	while (rec) {
+		const char*	err_msg;
+		dict_table_t*	table_rec;
+
+		/* Create and populate a dict_table_t structure with
+		information from SYS_TABLES row */
+		err_msg = dict_process_sys_tables_rec_and_mtr_commit(
+			heap, rec, &table_rec,
+			DICT_TABLE_LOAD_FROM_RECORD, &mtr);
+
+		mutex_exit(&dict_sys->mutex);
+
+		if (!err_msg) {
+			i_s_dict_fill_sys_tables(thd, table_rec, tables->table);
+		} else {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
+		}
+
+		/* Since dict_process_sys_tables_rec_and_mtr_commit()
+		is called with DICT_TABLE_LOAD_FROM_RECORD, the table_rec
+		is created in dict_process_sys_tables_rec(), we will
+		need to free it */
+		if (table_rec) {
+			dict_mem_table_free(table_rec);
+		}
+
+		mem_heap_empty(heap);
+
+		/* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tables
+@return 0 on success */
+static
+int
+innodb_sys_tables_init(
+/*===================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_sys_tables_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = innodb_sys_tables_fields_info;
+	schema->fill_table = i_s_sys_tables_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_tables =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_SYS_TABLES"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB SYS_TABLES"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_sys_tables_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLESTATS */
+static ST_FIELD_INFO	innodb_sys_tablestats_fields_info[] =
+{
+#define SYS_TABLESTATS_ID		0
+	{STRUCT_FLD(field_name,		"TABLE_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_NAME		1
+	{STRUCT_FLD(field_name,		"NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_INIT		2
+	{STRUCT_FLD(field_name,		"STATS_INITIALIZED"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_NROW		3
+	{STRUCT_FLD(field_name,		"NUM_ROWS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_CLUST_SIZE	4
+	{STRUCT_FLD(field_name,		"CLUST_INDEX_SIZE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_INDEX_SIZE	5
+	{STRUCT_FLD(field_name,		"OTHER_INDEX_SIZE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_MODIFIED		6
+	{STRUCT_FLD(field_name,		"MODIFIED_COUNTER"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_AUTONINC		7
+	{STRUCT_FLD(field_name,		"AUTOINC"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_TABLE_REF_COUNT	8
+	{STRUCT_FLD(field_name,		"REF_COUNT"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Populate information_schema.innodb_sys_tablestats table with information
+from SYS_TABLES.
+@return	0 on success */
+static
+int
+i_s_dict_fill_sys_tablestats(
+/*=========================*/
+	THD*		thd,		/*!< in: thread */
+	dict_table_t*	table,		/*!< in: table */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	Field**		fields;
+
+	DBUG_ENTER("i_s_dict_fill_sys_tablestats");
+
+	fields = table_to_fill->field;
+
+	OK(fields[SYS_TABLESTATS_ID]->store(longlong(table->id), TRUE));
+
+	OK(field_store_string(fields[SYS_TABLESTATS_NAME], table->name));
+
+	if (table->stat_initialized) {
+		OK(field_store_string(fields[SYS_TABLESTATS_INIT],
+				      "Initialized"));
+	} else {
+		OK(field_store_string(fields[SYS_TABLESTATS_INIT],
+				      "Uninitialized"));
+	}
+
+	OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows, TRUE));
+
+	OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(
+		table->stat_clustered_index_size));
+
+	OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(
+		table->stat_sum_of_other_index_sizes));
+
+	OK(fields[SYS_TABLESTATS_MODIFIED]->store(
+		table->stat_modified_counter));
+
+	OK(fields[SYS_TABLESTATS_AUTONINC]->store(table->autoinc, TRUE));
+
+	OK(fields[SYS_TABLESTATS_TABLE_REF_COUNT]->store(
+		table->n_ref_count));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Function to go through each record in SYS_TABLES table, and fill the
+information_schema.innodb_sys_tablestats table with table statistics
+related information
+@return 0 on success */
+static
+int
+i_s_sys_tables_fill_table_stats(
+/*============================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_sys_tables_fill_table_stats");
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
+
+	rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+
+	while (rec) {
+		const char*	err_msg;
+		dict_table_t*	table_rec;
+
+		/* Fetch the dict_table_t structure corresponding to
+		this SYS_TABLES record */
+		err_msg = dict_process_sys_tables_rec_and_mtr_commit(
+			heap, rec, &table_rec,
+			DICT_TABLE_LOAD_FROM_CACHE, &mtr);
+
+		mutex_exit(&dict_sys->mutex);
+
+		if (!err_msg) {
+			i_s_dict_fill_sys_tablestats(thd, table_rec,
+						     tables->table);
+		} else {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
+		}
+
+		mem_heap_empty(heap);
+
+		/* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tablestats
+@return 0 on success */
+static
+int
+innodb_sys_tablestats_init(
+/*=======================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_sys_tablestats_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = innodb_sys_tablestats_fields_info;
+	schema->fill_table = i_s_sys_tables_fill_table_stats;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_tablestats =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_SYS_TABLESTATS"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB SYS_TABLESTATS"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_sys_tablestats_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_INDEXES */
+static ST_FIELD_INFO	innodb_sysindex_fields_info[] =
+{
+#define SYS_INDEX_ID		0
+	{STRUCT_FLD(field_name,		"INDEX_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_NAME		1
+	{STRUCT_FLD(field_name,		"NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_TABLE_ID	2
+	{STRUCT_FLD(field_name,		"TABLE_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_TYPE		3
+	{STRUCT_FLD(field_name,		"TYPE"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_NUM_FIELDS	4
+	{STRUCT_FLD(field_name,		"N_FIELDS"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_PAGE_NO	5
+	{STRUCT_FLD(field_name,		"PAGE_NO"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_SPACE		6
+	{STRUCT_FLD(field_name,		"SPACE"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Function to populate the information_schema.innodb_sys_indexes table with
+collected index information
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_indexes(
+/*======================*/
+	THD*		thd,		/*!< in: thread */
+	table_id_t	table_id,	/*!< in: table id */
+	dict_index_t*	index,		/*!< in: populated dict_index_t
+					struct with index info */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	Field**		fields;
+	const char*	name_ptr = index->name;
+
+	DBUG_ENTER("i_s_dict_fill_sys_indexes");
+
+	fields = table_to_fill->field;
+
+	if (name_ptr[0] == TEMP_INDEX_PREFIX) {
+		name_ptr++;
+	}
+
+	OK(field_store_string(fields[SYS_INDEX_NAME], name_ptr));
+
+	OK(fields[SYS_INDEX_ID]->store(longlong(index->id), TRUE));
+
+	OK(fields[SYS_INDEX_TABLE_ID]->store(longlong(table_id), TRUE));
+
+	OK(fields[SYS_INDEX_TYPE]->store(index->type));
+
+	OK(fields[SYS_INDEX_NUM_FIELDS]->store(index->n_fields));
+
+	/* FIL_NULL is ULINT32_UNDEFINED */
+	if (index->page == FIL_NULL) {
+		OK(fields[SYS_INDEX_PAGE_NO]->store(-1));
+	} else {
+		OK(fields[SYS_INDEX_PAGE_NO]->store(index->page));
+	}
+
+	OK(fields[SYS_INDEX_SPACE]->store(index->space));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in SYS_INDEXES table, and fill the
+information_schema.innodb_sys_indexes table with related index information
+@return 0 on success */
+static
+int
+i_s_sys_indexes_fill_table(
+/*=======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t		pcur;
+	const rec_t*		rec;
+	mem_heap_t*		heap;
+	mtr_t			mtr;
+
+	DBUG_ENTER("i_s_sys_indexes_fill_table");
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
+
+	/* Start scan the SYS_INDEXES table */
+	rec = dict_startscan_system(&pcur, &mtr, SYS_INDEXES);
+
+	/* Process each record in the table */
+	while (rec) {
+		const char*	err_msg;
+		table_id_t	table_id;
+		dict_index_t	index_rec;
+
+		/* Populate a dict_index_t structure with information from
+		a SYS_INDEXES row */
+		err_msg = dict_process_sys_indexes_rec(heap, rec, &index_rec,
+						       &table_id);
+
+		mtr_commit(&mtr);
+		mutex_exit(&dict_sys->mutex);
+
+		if (!err_msg) {
+			i_s_dict_fill_sys_indexes(thd, table_id, &index_rec,
+						 tables->table);
+		} else {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
+		}
+
+		mem_heap_empty(heap);
+
+		/* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_indexes
+@return 0 on success */
+static
+int
+innodb_sys_indexes_init(
+/*====================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_sys_indexes_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = innodb_sysindex_fields_info;
+	schema->fill_table = i_s_sys_indexes_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_indexes =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_SYS_INDEXES"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB SYS_INDEXES"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_sys_indexes_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_COLUMNS */
+static ST_FIELD_INFO	innodb_sys_columns_fields_info[] =
+{
+#define SYS_COLUMN_TABLE_ID		0
+	{STRUCT_FLD(field_name,		"TABLE_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_COLUMN_NAME		1
+	{STRUCT_FLD(field_name,		"NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_COLUMN_POSITION	2
+	{STRUCT_FLD(field_name,		"POS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_COLUMN_MTYPE		3
+	{STRUCT_FLD(field_name,		"MTYPE"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_COLUMN__PRTYPE	4
+	{STRUCT_FLD(field_name,		"PRTYPE"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_COLUMN_COLUMN_LEN	5
+	{STRUCT_FLD(field_name,		"LEN"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Function to populate the information_schema.innodb_sys_columns with
+related column information
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_columns(
+/*======================*/
+	THD*		thd,		/*!< in: thread */
+	table_id_t	table_id,	/*!< in: table ID */
+	const char*	col_name,	/*!< in: column name */
+	dict_col_t*	column,		/*!< in: dict_col_t struct holding
+					more column information */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	Field**		fields;
+
+	DBUG_ENTER("i_s_dict_fill_sys_columns");
+
+	fields = table_to_fill->field;
+
+	OK(fields[SYS_COLUMN_TABLE_ID]->store(longlong(table_id), TRUE));
+
+	OK(field_store_string(fields[SYS_COLUMN_NAME], col_name));
+
+	OK(fields[SYS_COLUMN_POSITION]->store(column->ind));
+
+	OK(fields[SYS_COLUMN_MTYPE]->store(column->mtype));
+
+	OK(fields[SYS_COLUMN__PRTYPE]->store(column->prtype));
+
+	OK(fields[SYS_COLUMN_COLUMN_LEN]->store(column->len));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to fill information_schema.innodb_sys_columns with information
+collected by scanning SYS_COLUMNS table.
+@return 0 on success */
+static
+int
+i_s_sys_columns_fill_table(
+/*=======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	const char*	col_name;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_sys_columns_fill_table");
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
+
+	rec = dict_startscan_system(&pcur, &mtr, SYS_COLUMNS);
+
+	while (rec) {
+		const char*	err_msg;
+		dict_col_t	column_rec;
+		table_id_t	table_id;
+
+		/* populate a dict_col_t structure with information from
+		a SYS_COLUMNS row */
+		err_msg = dict_process_sys_columns_rec(heap, rec, &column_rec,
+						       &table_id, &col_name);
+
+		mtr_commit(&mtr);
+		mutex_exit(&dict_sys->mutex);
+
+		if (!err_msg) {
+			i_s_dict_fill_sys_columns(thd, table_id, col_name,
+						 &column_rec,
+						 tables->table);
+		} else {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
+		}
+
+		mem_heap_empty(heap);
+
+		/* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_columns
+@return 0 on success */
+static
+int
+innodb_sys_columns_init(
+/*====================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_sys_columns_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = innodb_sys_columns_fields_info;
+	schema->fill_table = i_s_sys_columns_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_columns =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_SYS_COLUMNS"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB SYS_COLUMNS"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_sys_columns_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_fields */
+static ST_FIELD_INFO	innodb_sys_fields_fields_info[] =
+{
+#define SYS_FIELD_INDEX_ID	0
+	{STRUCT_FLD(field_name,		"INDEX_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_FIELD_NAME		1
+	{STRUCT_FLD(field_name,		"NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_FIELD_POS		2
+	{STRUCT_FLD(field_name,		"POS"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Function to fill information_schema.innodb_sys_fields with information
+collected by scanning SYS_FIELDS table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_fields(
+/*=====================*/
+	THD*		thd,		/*!< in: thread */
+	index_id_t	index_id,	/*!< in: index id for the field */
+	dict_field_t*	field,		/*!< in: table */
+	ulint		pos,		/*!< in: Field position */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	Field**		fields;
+
+	DBUG_ENTER("i_s_dict_fill_sys_fields");
+
+	fields = table_to_fill->field;
+
+	OK(fields[SYS_FIELD_INDEX_ID]->store(longlong(index_id), TRUE));
+
+	OK(field_store_string(fields[SYS_FIELD_NAME], field->name));
+
+	OK(fields[SYS_FIELD_POS]->store(pos));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in SYS_FIELDS table, and fill the
+information_schema.innodb_sys_fields table with related index field
+information
+@return 0 on success */
+static
+int
+i_s_sys_fields_fill_table(
+/*======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mem_heap_t*	heap;
+	index_id_t	last_id;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_sys_fields_fill_table");
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
+
+	/* will save last index id so that we know whether we move to
+	the next index. This is used to calculate prefix length */
+	last_id = 0;
+
+	rec = dict_startscan_system(&pcur, &mtr, SYS_FIELDS);
+
+	while (rec) {
+		ulint		pos;
+		const char*	err_msg;
+		index_id_t	index_id;
+		dict_field_t	field_rec;
+
+		/* Populate a dict_field_t structure with information from
+		a SYS_FIELDS row */
+		err_msg = dict_process_sys_fields_rec(heap, rec, &field_rec,
+						      &pos, &index_id, last_id);
+
+		mtr_commit(&mtr);
+		mutex_exit(&dict_sys->mutex);
+
+		if (!err_msg) {
+			i_s_dict_fill_sys_fields(thd, index_id, &field_rec,
+						 pos, tables->table);
+			last_id = index_id;
+		} else {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
+		}
+
+		mem_heap_empty(heap);
+
+		/* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_fields
+@return 0 on success */
+static
+int
+innodb_sys_fields_init(
+/*===================*/
+	void*   p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_sys_field_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = innodb_sys_fields_fields_info;
+	schema->fill_table = i_s_sys_fields_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_fields =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_SYS_FIELDS"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB SYS_FIELDS"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_sys_fields_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign */
+static ST_FIELD_INFO	innodb_sys_foreign_fields_info[] =
+{
+#define SYS_FOREIGN_ID		0
+	{STRUCT_FLD(field_name,		"ID"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_FOR_NAME	1
+	{STRUCT_FLD(field_name,		"FOR_NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_REF_NAME	2
+	{STRUCT_FLD(field_name,		"REF_NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_NUM_COL	3
+	{STRUCT_FLD(field_name,		"N_COLS"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_TYPE	4
+	{STRUCT_FLD(field_name,		"TYPE"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Function to fill information_schema.innodb_sys_foreign with information
+collected by scanning SYS_FOREIGN table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_foreign(
+/*======================*/
+	THD*		thd,		/*!< in: thread */
+	dict_foreign_t*	foreign,	/*!< in: table */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	Field**		fields;
+
+	DBUG_ENTER("i_s_dict_fill_sys_foreign");
+
+	fields = table_to_fill->field;
+
+	OK(field_store_string(fields[SYS_FOREIGN_ID], foreign->id));
+
+	OK(field_store_string(fields[SYS_FOREIGN_FOR_NAME],
+			      foreign->foreign_table_name));
+
+	OK(field_store_string(fields[SYS_FOREIGN_REF_NAME],
+			      foreign->referenced_table_name));
+
+	OK(fields[SYS_FOREIGN_NUM_COL]->store(foreign->n_fields));
+
+	OK(fields[SYS_FOREIGN_TYPE]->store(foreign->type));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.innodb_sys_foreign table. Loop
+through each record in SYS_FOREIGN, and extract the foreign key
+information.
+@return 0 on success */
+static
+int
+i_s_sys_foreign_fill_table(
+/*=======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_sys_foreign_fill_table");
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
+
+	rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN);
+
+	while (rec) {
+		const char*	err_msg;
+		dict_foreign_t	foreign_rec;
+
+		/* Populate a dict_foreign_t structure with information from
+		a SYS_FOREIGN row */
+		err_msg = dict_process_sys_foreign_rec(heap, rec, &foreign_rec);
+
+		mtr_commit(&mtr);
+		mutex_exit(&dict_sys->mutex);
+
+		if (!err_msg) {
+			i_s_dict_fill_sys_foreign(thd, &foreign_rec,
+						 tables->table);
+		} else {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
+		}
+
+		mem_heap_empty(heap);
+
+		/* Get the next record */
+		mtr_start(&mtr);
+		mutex_enter(&dict_sys->mutex);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign
+@return 0 on success */
+static
+int
+innodb_sys_foreign_init(
+/*====================*/
+	void*   p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_sys_foreign_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = innodb_sys_foreign_fields_info;
+	schema->fill_table = i_s_sys_foreign_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_foreign =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_SYS_FOREIGN"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB SYS_FOREIGN"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_sys_foreign_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign_cols */
+static ST_FIELD_INFO	innodb_sys_foreign_cols_fields_info[] =
+{
+#define SYS_FOREIGN_COL_ID		0
+	{STRUCT_FLD(field_name,		"ID"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_COL_FOR_NAME	1
+	{STRUCT_FLD(field_name,		"FOR_COL_NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_COL_REF_NAME	2
+	{STRUCT_FLD(field_name,		"REF_COL_NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_COL_POS		3
+	{STRUCT_FLD(field_name,		"POS"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Function to fill information_schema.innodb_sys_foreign_cols with information
+collected by scanning SYS_FOREIGN_COLS table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_foreign_cols(
+/*==========================*/
+	THD*		thd,		/*!< in: thread */
+	const char*	name,		/*!< in: foreign key constraint name */
+	const char*	for_col_name,	/*!< in: referencing column name*/
+	const char*	ref_col_name,	/*!< in: referenced column
+					name */
+	ulint		pos,		/*!< in: column position */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	Field**		fields;
+
+	DBUG_ENTER("i_s_dict_fill_sys_foreign_cols");
+
+	fields = table_to_fill->field;
+
+	OK(field_store_string(fields[SYS_FOREIGN_COL_ID], name));
+
+	OK(field_store_string(fields[SYS_FOREIGN_COL_FOR_NAME], for_col_name));
+
+	OK(field_store_string(fields[SYS_FOREIGN_COL_REF_NAME], ref_col_name));
+
+	OK(fields[SYS_FOREIGN_COL_POS]->store(pos));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.innodb_sys_foreign_cols table. Loop
+through each record in SYS_FOREIGN_COLS, and extract the foreign key column
+information and fill the INFORMATION_SCHEMA.innodb_sys_foreign_cols table.
+@return 0 on success */
+static
+int
+i_s_sys_foreign_cols_fill_table(
+/*============================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_sys_foreign_cols_fill_table");
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
+
+	rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN_COLS);
+
+	while (rec) {
+		const char*	err_msg;
+		const char*	name;
+		const char*	for_col_name;
+		const char*	ref_col_name;
+		ulint		pos;
+
+		/* Extract necessary information from a SYS_FOREIGN_COLS row */
+		err_msg = dict_process_sys_foreign_col_rec(
+			heap, rec, &name, &for_col_name, &ref_col_name, &pos);
+
+		mtr_commit(&mtr);
+		mutex_exit(&dict_sys->mutex);
+
+		if (!err_msg) {
+			i_s_dict_fill_sys_foreign_cols(
+				thd, name, for_col_name, ref_col_name, pos,
+				tables->table);
+		} else {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
+		}
+
+		mem_heap_empty(heap);
+
+		/* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign_cols
+@return 0 on success */
+static
+int
+innodb_sys_foreign_cols_init(
+/*========================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_sys_foreign_cols_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = innodb_sys_foreign_cols_fields_info;
+	schema->fill_table = i_s_sys_foreign_cols_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_sys_foreign_cols =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_SYS_FOREIGN_COLS"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB SYS_FOREIGN_COLS"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_sys_foreign_cols_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+        /* Maria extension */
+	STRUCT_FLD(version_info, INNODB_VERSION_STR),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
diff --git a/storage/innobase/handler/i_s.h b/storage/innobase/handler/i_s.h
index dc0deef119b..7fc7b091795 100644
--- a/storage/innobase/handler/i_s.h
+++ b/storage/innobase/handler/i_s.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -28,12 +28,30 @@ Created July 18, 2007 Vasil Dimov
 
 const char plugin_author[] = "Oracle Corporation";
 
-extern struct st_mysql_plugin	i_s_innodb_trx;
-extern struct st_mysql_plugin	i_s_innodb_locks;
-extern struct st_mysql_plugin	i_s_innodb_lock_waits;
-extern struct st_mysql_plugin	i_s_innodb_cmp;
-extern struct st_mysql_plugin	i_s_innodb_cmp_reset;
-extern struct st_mysql_plugin	i_s_innodb_cmpmem;
-extern struct st_mysql_plugin	i_s_innodb_cmpmem_reset;
+extern struct st_maria_plugin	i_s_innodb_trx;
+extern struct st_maria_plugin	i_s_innodb_locks;
+extern struct st_maria_plugin	i_s_innodb_lock_waits;
+extern struct st_maria_plugin	i_s_innodb_cmp;
+extern struct st_maria_plugin	i_s_innodb_cmp_reset;
+extern struct st_maria_plugin	i_s_innodb_cmpmem;
+extern struct st_maria_plugin	i_s_innodb_cmpmem_reset;
+extern struct st_maria_plugin   i_s_innodb_metrics;
+extern struct st_maria_plugin	i_s_innodb_ft_default_stopword;
+extern struct st_maria_plugin	i_s_innodb_ft_inserted;
+extern struct st_maria_plugin	i_s_innodb_ft_deleted;
+extern struct st_maria_plugin	i_s_innodb_ft_being_deleted;
+extern struct st_maria_plugin	i_s_innodb_ft_index_cache;
+extern struct st_maria_plugin	i_s_innodb_ft_index_table;
+extern struct st_maria_plugin	i_s_innodb_ft_config;
+extern struct st_maria_plugin	i_s_innodb_buffer_page;
+extern struct st_maria_plugin	i_s_innodb_buffer_page_lru;
+extern struct st_maria_plugin	i_s_innodb_buffer_stats;
+extern struct st_maria_plugin	i_s_innodb_sys_tables;
+extern struct st_maria_plugin	i_s_innodb_sys_tablestats;
+extern struct st_maria_plugin	i_s_innodb_sys_indexes;
+extern struct st_maria_plugin	i_s_innodb_sys_columns;
+extern struct st_maria_plugin	i_s_innodb_sys_fields;
+extern struct st_maria_plugin	i_s_innodb_sys_foreign;
+extern struct st_maria_plugin	i_s_innodb_sys_foreign_cols;
 
 #endif /* i_s_h */
diff --git a/storage/innobase/ibuf/ibuf0ibuf.c b/storage/innobase/ibuf/ibuf0ibuf.cc
index e534e165c0c..4f615da5809 100644
--- a/storage/innobase/ibuf/ibuf0ibuf.c
+++ b/storage/innobase/ibuf/ibuf0ibuf.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file ibuf/ibuf0ibuf.c
+@file ibuf/ibuf0ibuf.cc
 Insert buffer
 
 Created 7/19/1997 Heikki Tuuri
@@ -183,9 +183,6 @@ level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e.,
 it uses synchronous aio, it can access any pages, as long as it obeys the
 access order rules. */
 
-/** Buffer pool size per the maximum insert buffer size */
-#define IBUF_POOL_SIZE_PER_MAX_SIZE	2
-
 /** Table name for the insert buffer. */
 #define IBUF_TABLE_NAME		"SYS_IBUF_TABLE"
 
@@ -200,9 +197,6 @@ UNIV_INTERN uint	ibuf_debug;
 /** The insert buffer control structure */
 UNIV_INTERN ibuf_t*	ibuf			= NULL;
 
-/** Counter for ibuf_should_try() */
-UNIV_INTERN ulint	ibuf_flush_count	= 0;
-
 #ifdef UNIV_PFS_MUTEX
 UNIV_INTERN mysql_pfs_key_t	ibuf_pessimistic_insert_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	ibuf_mutex_key;
@@ -515,16 +509,15 @@ ibuf_init_at_db_start(void)
 	page_t*		header_page;
 	ulint		error;
 
-	ibuf = mem_alloc(sizeof(ibuf_t));
-
-	memset(ibuf, 0, sizeof(*ibuf));
+	ibuf = static_cast<ibuf_t*>(mem_zalloc(sizeof(ibuf_t)));
 
-	/* Note that also a pessimistic delete can sometimes make a B-tree
-	grow in size, as the references on the upper levels of the tree can
-	change */
-
-	ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE
-		/ IBUF_POOL_SIZE_PER_MAX_SIZE;
+	/* At startup we intialize ibuf to have a maximum of
+	CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the
+	buffer pool size. Once ibuf struct is initialized this
+	value is updated with the user supplied size by calling
+	ibuf_max_size_update(). */
+	ibuf->max_size = ((buf_pool_get_curr_size() / UNIV_PAGE_SIZE)
+			  * CHANGE_BUFFER_DEFAULT_SIZE) / 100;
 
 	mutex_create(ibuf_pessimistic_insert_mutex_key,
 		     &ibuf_pessimistic_insert_mutex,
@@ -572,13 +565,13 @@ ibuf_init_at_db_start(void)
 	heap = mem_heap_create(450);
 
 	/* Use old-style record format for the insert buffer. */
-	table = dict_mem_table_create(IBUF_TABLE_NAME, IBUF_SPACE_ID, 1, 0);
+	table = dict_mem_table_create(IBUF_TABLE_NAME, IBUF_SPACE_ID, 1, 0, 0);
 
 	dict_mem_table_add_col(table, heap, "DUMMY_COLUMN", DATA_BINARY, 0, 0);
 
 	table->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID;
 
-	dict_table_add_to_cache(table, heap);
+	dict_table_add_to_cache(table, FALSE, heap);
 	mem_heap_free(heap);
 
 	index = dict_mem_index_create(
@@ -595,6 +588,24 @@ ibuf_init_at_db_start(void)
 
 	ibuf->index = dict_table_get_first_index(table);
 }
+
+/*********************************************************************//**
+Updates the max_size value for ibuf. */
+UNIV_INTERN
+void
+ibuf_max_size_update(
+/*=================*/
+	ulint	new_val)	/*!< in: new value in terms of
+				percentage of the buffer pool size */
+{
+	ulint	new_size = ((buf_pool_get_curr_size() / UNIV_PAGE_SIZE)
+			    * new_val) / 100;
+	mutex_enter(&ibuf_mutex);
+	ibuf->max_size = new_size;
+	mutex_exit(&ibuf_mutex);
+}
+
+
 #endif /* !UNIV_HOTBACKUP */
 /*********************************************************************//**
 Initializes an ibuf bitmap page. */
@@ -1244,17 +1255,9 @@ ibuf_rec_get_page_no_func(
 
 	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
 
-	if (len == 1) {
-		/* This is of the >= 4.1.x record format */
-		ut_a(trx_sys_multiple_tablespace_format);
-
-		field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
-	} else {
-		ut_a(trx_doublewrite_must_reset_space_ids);
-		ut_a(!trx_sys_multiple_tablespace_format);
+	ut_a(len == 1);
 
-		field = rec_get_nth_field_old(rec, 0, &len);
-	}
+	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
 
 	ut_a(len == 4);
 
@@ -1290,20 +1293,13 @@ ibuf_rec_get_space_func(
 
 	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
 
-	if (len == 1) {
-		/* This is of the >= 4.1.x record format */
-
-		ut_a(trx_sys_multiple_tablespace_format);
-		field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
-		ut_a(len == 4);
+	ut_a(len == 1);
 
-		return(mach_read_from_4(field));
-	}
+	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
 
-	ut_a(trx_doublewrite_must_reset_space_ids);
-	ut_a(!trx_sys_multiple_tablespace_format);
+	ut_a(len == 4);
 
-	return(0);
+	return(mach_read_from_4(field));
 }
 
 #ifdef UNIV_DEBUG
@@ -1360,7 +1356,7 @@ ibuf_rec_get_info_func(
 		break;
 
 	case IBUF_REC_INFO_SIZE:
-		op_local = (ibuf_op_t)types[IBUF_REC_OFFSET_TYPE];
+		op_local = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
 		comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT;
 		counter_local = mach_read_from_2(
 			types + IBUF_REC_OFFSET_COUNTER);
@@ -1529,7 +1525,7 @@ ibuf_dummy_index_create(
 
 	table = dict_mem_table_create("IBUF_DUMMY",
 				      DICT_HDR_SPACE, n,
-				      comp ? DICT_TF_COMPACT : 0);
+				      comp ? DICT_TF_COMPACT : 0, 0);
 
 	index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY",
 				      DICT_HDR_SPACE, 0, n);
@@ -1573,58 +1569,6 @@ ibuf_dummy_index_free(
 	dict_mem_table_free(table);
 }
 
-/*********************************************************************//**
-Builds the entry to insert into a non-clustered index when we have the
-corresponding record in an ibuf index.
-
-NOTE that as we copy pointers to fields in ibuf_rec, the caller must
-hold a latch to the ibuf_rec page as long as the entry is used!
-
-@return own: entry to insert to a non-clustered index */
-UNIV_INLINE
-dtuple_t*
-ibuf_build_entry_pre_4_1_x(
-/*=======================*/
-	const rec_t*	ibuf_rec,	/*!< in: record in an insert buffer */
-	mem_heap_t*	heap,		/*!< in: heap where built */
-	dict_index_t**	pindex)		/*!< out, own: dummy index that
-					describes the entry */
-{
-	ulint		i;
-	ulint		len;
-	const byte*	types;
-	dtuple_t*	tuple;
-	ulint		n_fields;
-
-	ut_a(trx_doublewrite_must_reset_space_ids);
-	ut_a(!trx_sys_multiple_tablespace_format);
-
-	n_fields = rec_get_n_fields_old(ibuf_rec) - 2;
-	tuple = dtuple_create(heap, n_fields);
-	types = rec_get_nth_field_old(ibuf_rec, 1, &len);
-
-	ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
-
-	for (i = 0; i < n_fields; i++) {
-		const byte*	data;
-		dfield_t*	field;
-
-		field = dtuple_get_nth_field(tuple, i);
-
-		data = rec_get_nth_field_old(ibuf_rec, i + 2, &len);
-
-		dfield_set_data(field, data, len);
-
-		dtype_read_for_order_and_null_size(
-			dfield_get_type(field),
-			types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE);
-	}
-
-	*pindex = ibuf_dummy_index_create(n_fields, FALSE);
-
-	return(tuple);
-}
-
 #ifdef UNIV_DEBUG
 # define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex)	\
 	ibuf_build_entry_from_ibuf_rec_func(mtr,ibuf_rec,heap,pindex)
@@ -1678,15 +1622,7 @@ ibuf_build_entry_from_ibuf_rec_func(
 
 	data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
 
-	if (len > 1) {
-		/* This a < 4.1.x format record */
-
-		return(ibuf_build_entry_pre_4_1_x(ibuf_rec, heap, pindex));
-	}
-
-	/* This a >= 4.1.x format record */
-
-	ut_a(trx_sys_multiple_tablespace_format);
+	ut_a(len == 1);
 	ut_a(*data == 0);
 	ut_a(rec_get_n_fields_old(ibuf_rec) > IBUF_REC_FIELD_USER);
 
@@ -1742,8 +1678,6 @@ ibuf_rec_get_size(
 	const rec_t*	rec,			/*!< in: ibuf record */
 	const byte*	types,			/*!< in: fields */
 	ulint		n_fields,		/*!< in: number of fields */
-	ibool		pre_4_1,		/*!< in: TRUE=pre-4.1 format,
-						FALSE=newer */
 	ulint		comp)			/*!< in: 0=ROW_FORMAT=REDUNDANT,
 						nonzero=ROW_FORMAT=COMPACT */
 {
@@ -1752,13 +1686,8 @@ ibuf_rec_get_size(
 	ulint	types_offset;
 	ulint	size = 0;
 
-	if (pre_4_1) {
-		field_offset = 2;
-		types_offset = DATA_ORDER_NULL_TYPE_BUF_SIZE;
-	} else {
-		field_offset = IBUF_REC_FIELD_USER;
-		types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
-	}
+	field_offset = IBUF_REC_FIELD_USER;
+	types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
 
 	for (i = 0; i < n_fields; i++) {
 		ulint		len;
@@ -1768,10 +1697,6 @@ ibuf_rec_get_size(
 
 		if (len != UNIV_SQL_NULL) {
 			size += len;
-		} else if (pre_4_1) {
-			dtype_read_for_order_and_null_size(&dtype, types);
-
-			size += dtype_get_sql_null_size(&dtype, comp);
 		} else {
 			dtype_new_read_for_order_and_null_size(&dtype, types);
 
@@ -1809,8 +1734,9 @@ ibuf_rec_get_volume_func(
 	const byte*	types;
 	ulint		n_fields;
 	ulint		data_size;
-	ibool		pre_4_1;
 	ulint		comp;
+	ibuf_op_t	op;
+	ulint		info_len;
 
 	ut_ad(mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX)
 	      || mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_S_FIX));
@@ -1818,64 +1744,44 @@ ibuf_rec_get_volume_func(
 	ut_ad(rec_get_n_fields_old(ibuf_rec) > 2);
 
 	data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
-	pre_4_1 = (len > 1);
-
-	if (pre_4_1) {
-		/* < 4.1.x format record */
-
-		ut_a(trx_doublewrite_must_reset_space_ids);
-		ut_a(!trx_sys_multiple_tablespace_format);
-
-		n_fields = rec_get_n_fields_old(ibuf_rec) - 2;
-
-		types = rec_get_nth_field_old(ibuf_rec, 1, &len);
-
-		ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
-		comp = 0;
-	} else {
-		/* >= 4.1.x format record */
-		ibuf_op_t	op;
-		ulint		info_len;
-
-		ut_a(trx_sys_multiple_tablespace_format);
-		ut_a(*data == 0);
-
-		types = rec_get_nth_field_old(
-			ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
+	ut_a(len == 1);
+	ut_a(*data == 0);
 
-		ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL);
+	types = rec_get_nth_field_old(
+		ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
 
-		if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) {
-			/* Delete-marking a record doesn't take any
-			additional space, and while deleting a record
-			actually frees up space, we have to play it safe and
-			pretend it takes no additional space (the record
-			might not exist, etc.).  */
+	ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL);
 
-			return(0);
-		} else if (comp) {
-			dtuple_t*	entry;
-			ulint		volume;
-			dict_index_t*	dummy_index;
-			mem_heap_t*	heap = mem_heap_create(500);
+	if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) {
+		/* Delete-marking a record doesn't take any
+		additional space, and while deleting a record
+		actually frees up space, we have to play it safe and
+		pretend it takes no additional space (the record
+		might not exist, etc.).  */
 
-			entry = ibuf_build_entry_from_ibuf_rec(
-				mtr, ibuf_rec, heap, &dummy_index);
+		return(0);
+	} else if (comp) {
+		dtuple_t*	entry;
+		ulint		volume;
+		dict_index_t*	dummy_index;
+		mem_heap_t*	heap = mem_heap_create(500);
 
-			volume = rec_get_converted_size(dummy_index, entry, 0);
+		entry = ibuf_build_entry_from_ibuf_rec(mtr, ibuf_rec,
+			heap, &dummy_index);
 
-			ibuf_dummy_index_free(dummy_index);
-			mem_heap_free(heap);
+		volume = rec_get_converted_size(dummy_index, entry, 0);
 
-			return(volume + page_dir_calc_reserved_space(1));
-		}
+		ibuf_dummy_index_free(dummy_index);
+		mem_heap_free(heap);
 
-		types += info_len;
-		n_fields = rec_get_n_fields_old(ibuf_rec)
-			- IBUF_REC_FIELD_USER;
+		return(volume + page_dir_calc_reserved_space(1));
 	}
 
-	data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, pre_4_1, comp);
+	types += info_len;
+	n_fields = rec_get_n_fields_old(ibuf_rec)
+		- IBUF_REC_FIELD_USER;
+
+	data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, comp);
 
 	return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0)
 	       + page_dir_calc_reserved_space(1));
@@ -1933,7 +1839,7 @@ ibuf_entry_build(
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
 
-	buf = mem_heap_alloc(heap, 4);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	mach_write_to_4(buf, space);
 
@@ -1943,7 +1849,7 @@ ibuf_entry_build(
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
 
-	buf = mem_heap_alloc(heap, 1);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
 
 	/* We set the marker byte zero */
 
@@ -1955,7 +1861,7 @@ ibuf_entry_build(
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
 
-	buf = mem_heap_alloc(heap, 4);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	mach_write_to_4(buf, page_no);
 
@@ -1970,8 +1876,10 @@ ibuf_entry_build(
 		i = IBUF_REC_INFO_SIZE;
 	}
 
-	ti = type_info = mem_heap_alloc(heap, i + n_fields
-					* DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+	ti = type_info = static_cast<byte*>(
+		mem_heap_alloc(
+			heap,
+			i + n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE));
 
 	switch (i) {
 	default:
@@ -2047,7 +1955,7 @@ ibuf_entry_build(
 
 /*********************************************************************//**
 Builds a search tuple used to search buffered inserts for an index page.
-This is for < 4.1.x format records
+This is for >= 4.1.x format records.
 @return	own: search tuple */
 static
 dtuple_t*
@@ -2061,52 +1969,13 @@ ibuf_search_tuple_build(
 	dfield_t*	field;
 	byte*		buf;
 
-	ut_a(space == 0);
-	ut_a(trx_doublewrite_must_reset_space_ids);
-	ut_a(!trx_sys_multiple_tablespace_format);
-
-	tuple = dtuple_create(heap, 1);
-
-	/* Store the page number in tuple */
-
-	field = dtuple_get_nth_field(tuple, 0);
-
-	buf = mem_heap_alloc(heap, 4);
-
-	mach_write_to_4(buf, page_no);
-
-	dfield_set_data(field, buf, 4);
-
-	dtuple_set_types_binary(tuple, 1);
-
-	return(tuple);
-}
-
-/*********************************************************************//**
-Builds a search tuple used to search buffered inserts for an index page.
-This is for >= 4.1.x format records.
-@return	own: search tuple */
-static
-dtuple_t*
-ibuf_new_search_tuple_build(
-/*========================*/
-	ulint		space,	/*!< in: space id */
-	ulint		page_no,/*!< in: index page number */
-	mem_heap_t*	heap)	/*!< in: heap into which to build */
-{
-	dtuple_t*	tuple;
-	dfield_t*	field;
-	byte*		buf;
-
-	ut_a(trx_sys_multiple_tablespace_format);
-
 	tuple = dtuple_create(heap, IBUF_REC_FIELD_METADATA);
 
 	/* Store the space id in tuple */
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
 
-	buf = mem_heap_alloc(heap, 4);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	mach_write_to_4(buf, space);
 
@@ -2116,7 +1985,7 @@ ibuf_new_search_tuple_build(
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
 
-	buf = mem_heap_alloc(heap, 1);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
 
 	mach_write_to_1(buf, 0);
 
@@ -2126,7 +1995,7 @@ ibuf_new_search_tuple_build(
 
 	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
 
-	buf = mem_heap_alloc(heap, 4);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
 
 	mach_write_to_4(buf, page_no);
 
@@ -2194,7 +2063,7 @@ ibuf_add_free_page(void)
 	/* Acquire the fsp latch before the ibuf header, obeying the latching
 	order */
 	mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	header_page = ibuf_header_page_get(&mtr);
 
@@ -2276,7 +2145,7 @@ ibuf_remove_free_page(void)
 	/* Acquire the fsp latch before the ibuf header, obeying the latching
 	order */
 	mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr);
-	zip_size = dict_table_flags_to_zip_size(flags);
+	zip_size = fsp_flags_get_zip_size(flags);
 
 	header_page = ibuf_header_page_get(&mtr);
 
@@ -2550,7 +2419,8 @@ ibuf_get_merge_page_nos_func(
 			smallest possible secondary index leaf page
 			(and that only after DROP INDEX). */
 			ut_ad(rec_page_no
-			      > IBUF_TREE_ROOT_PAGE_NO - (rec_space_id != 0));
+			      > (ulint) IBUF_TREE_ROOT_PAGE_NO
+			      - (rec_space_id != 0));
 		}
 
 #ifdef UNIV_IBUF_DEBUG
@@ -2715,22 +2585,42 @@ will be merged from ibuf trees to the pages read, 0 if ibuf is
 empty */
 UNIV_INTERN
 ulint
-ibuf_contract_for_n_pages(
-/*======================*/
-	ibool	sync,	/*!< in: TRUE if the caller wants to wait for the
-			issued read with the highest tablespace address
-			to complete */
-	ulint	n_pages)/*!< in: try to read at least this many pages to
-			the buffer pool and merge the ibuf contents to
-			them */
+ibuf_contract_in_background(
+/*========================*/
+	ibool	full)	/*!< in: TRUE if the caller wants to do a full
+			contract based on PCT_IO(100). If FALSE then
+			the size of contract batch is determined based
+			on the current size of the ibuf tree. */
 {
 	ulint	sum_bytes	= 0;
 	ulint	sum_pages	= 0;
 	ulint	n_bytes;
 	ulint	n_pag2;
+	ulint	n_pages;
+
+	if (full) {
+		/* Caller has requested a full batch */
+		n_pages = PCT_IO(100);
+	} else {
+		/* By default we do a batch of 5% of the io_capacity */
+		n_pages = PCT_IO(5);
+
+		mutex_enter(&ibuf_mutex);
+
+		/* If the ibuf->size is more than half the max_size
+		then we make more agreesive contraction.
+		+1 is to avoid division by zero. */
+		if (ibuf->size > ibuf->max_size / 2) {
+			ulint diff = ibuf->size - ibuf->max_size / 2;
+			n_pages += PCT_IO((diff * 100)
+					   / (ibuf->max_size + 1));
+		}
+
+		mutex_exit(&ibuf_mutex);
+	}
 
 	while (sum_pages < n_pages) {
-		n_bytes = ibuf_contract_ext(&n_pag2, sync);
+		n_bytes = ibuf_contract_ext(&n_pag2, FALSE);
 
 		if (n_bytes == 0) {
 			return(sum_bytes);
@@ -2805,8 +2695,7 @@ ibuf_get_volume_buffered_hash(
 
 	len = ibuf_rec_get_size(
 		rec, types,
-		rec_get_n_fields_old(rec) - IBUF_REC_FIELD_USER,
-		FALSE, comp);
+		rec_get_n_fields_old(rec) - IBUF_REC_FIELD_USER, comp);
 	fold = ut_fold_binary(data, len);
 
 	hash += (fold / (CHAR_BIT * sizeof *hash)) % size;
@@ -2866,7 +2755,6 @@ ibuf_get_volume_buffered_count_func(
 	operations.  All pre-4.1 records should have been merged
 	when the database was started up. */
 	ut_a(len == 1);
-	ut_ad(trx_sys_multiple_tablespace_format);
 
 	types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
 
@@ -2880,7 +2768,7 @@ ibuf_get_volume_buffered_count_func(
 		because deletes cannot be buffered if there are
 		old-style inserts buffered for the page. */
 
-		len = ibuf_rec_get_size(rec, types, n_fields, FALSE, 0);
+		len = ibuf_rec_get_size(rec, types, n_fields, 0);
 
 		return(len
 		       + rec_get_converted_extra_size(len, n_fields, 0)
@@ -2985,8 +2873,6 @@ ibuf_get_volume_buffered(
 	/* bitmap of buffered recs */
 	ulint		hash_bitmap[128 / sizeof(ulint)];
 
-	ut_a(trx_sys_multiple_tablespace_format);
-
 	ut_ad((pcur->latch_mode == BTR_MODIFY_PREV)
 	      || (pcur->latch_mode == BTR_MODIFY_TREE));
 
@@ -3230,18 +3116,11 @@ ibuf_get_entry_counter_low_func(
 
 	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
 
-	if (UNIV_UNLIKELY(len != 1)) {
-		/* pre-4.1 format */
-		ut_a(trx_doublewrite_must_reset_space_ids);
-		ut_a(!trx_sys_multiple_tablespace_format);
-
-		return(ULINT_UNDEFINED);
-	}
-
-	ut_a(trx_sys_multiple_tablespace_format);
+	ut_a(len == 1);
 
 	/* Check the tablespace identifier. */
 	field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
+
 	ut_a(len == 4);
 
 	if (mach_read_from_4(field) != space) {
@@ -3382,17 +3261,17 @@ ibuf_insert_low(
 	ut_ad(!no_counter || op == IBUF_OP_INSERT);
 	ut_a(op < IBUF_OP_COUNT);
 
-	ut_a(trx_sys_multiple_tablespace_format);
-
 	do_merge = FALSE;
 
 	/* Perform dirty reads of ibuf->size and ibuf->max_size, to
-	reduce ibuf_mutex contention. ibuf->max_size remains constant
-	after ibuf_init_at_db_start(), but ibuf->size should be
-	protected by ibuf_mutex. Given that ibuf->size fits in a
-	machine word, this should be OK; at worst we are doing some
-	excessive ibuf_contract() or occasionally skipping a
-	ibuf_contract(). */
+	reduce ibuf_mutex contention. Given that ibuf->max_size and
+	ibuf->size fit in a machine word, this should be OK; at worst
+	we are doing some excessive ibuf_contract() or occasionally
+	skipping an ibuf_contract(). */
+	if (ibuf->max_size == 0) {
+		return(DB_STRONG_FAIL);
+	}
+
 	if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
 		/* Insert buffer is now too big, contract it but do not try
 		to insert */
@@ -3511,7 +3390,8 @@ fail_exit:
 	if (buf_page_peek(space, page_no)
 	    || lock_rec_expl_exist_on_page(space, page_no)) {
 
-		goto bitmap_fail;
+		ibuf_mtr_commit(&bitmap_mtr);
+		goto fail_exit;
 	}
 
 	if (op == IBUF_OP_INSERT) {
@@ -3547,7 +3427,6 @@ fail_exit:
 		dfield_t*	field;
 
 		if (counter == ULINT_UNDEFINED) {
-bitmap_fail:
 			ibuf_mtr_commit(&bitmap_mtr);
 			goto fail_exit;
 		}
@@ -3681,7 +3560,6 @@ ibuf_insert(
 	this function, so that we will have a consistent view of it. */
 	ibuf_use_t	use		= ibuf_use;
 
-	ut_a(trx_sys_multiple_tablespace_format);
 	ut_ad(dtuple_check_typed(entry));
 	ut_ad(ut_is_2pow(zip_size));
 
@@ -3756,12 +3634,8 @@ check_watch:
 
 	{
 		buf_page_t*	bpage;
-		ulint		fold = buf_page_address_fold(space, page_no);
 		buf_pool_t*	buf_pool = buf_pool_get(space, page_no);
-
-		buf_pool_mutex_enter(buf_pool);
-		bpage = buf_page_hash_get_low(buf_pool, space, page_no, fold);
-		buf_pool_mutex_exit(buf_pool);
+		bpage = buf_page_hash_get(buf_pool, space, page_no);
 
 		if (UNIV_LIKELY_NULL(bpage)) {
 			/* A buffer pool watch has been set or the
@@ -3801,7 +3675,7 @@ skip_watch:
 		return(TRUE);
 
 	} else {
-		ut_a(err == DB_STRONG_FAIL);
+		ut_a(err == DB_STRONG_FAIL || err == DB_TOO_BIG_RECORD);
 
 		return(FALSE);
 	}
@@ -4128,7 +4002,7 @@ ibuf_delete(
 		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
 		ulint*		offsets	= offsets_;
 		mem_heap_t*	heap = NULL;
-		ulint		max_ins_size;
+		ulint		max_ins_size = 0;
 
 		rec_offs_init(offsets_);
 
@@ -4428,7 +4302,7 @@ ibuf_merge_or_delete_for_page(
 		function. When the counter is > 0, that prevents tablespace
 		from being dropped. */
 
-		tablespace_being_deleted = fil_inc_pending_ops(space);
+		tablespace_being_deleted = fil_inc_pending_ibuf_merges(space);
 
 		if (UNIV_UNLIKELY(tablespace_being_deleted)) {
 			/* Do not try to read the bitmap page from space;
@@ -4454,7 +4328,7 @@ ibuf_merge_or_delete_for_page(
 				/* No inserts buffered for this page */
 
 				if (!tablespace_being_deleted) {
-					fil_decr_pending_ops(space);
+					fil_decr_pending_ibuf_merges(space);
 				}
 
 				return;
@@ -4469,13 +4343,7 @@ ibuf_merge_or_delete_for_page(
 
 	heap = mem_heap_create(512);
 
-	if (UNIV_UNLIKELY(!trx_sys_multiple_tablespace_format)) {
-		ut_a(trx_doublewrite_must_reset_space_ids);
-		search_tuple = ibuf_search_tuple_build(space, page_no, heap);
-	} else {
-		search_tuple = ibuf_new_search_tuple_build(space, page_no,
-							   heap);
-	}
+	search_tuple = ibuf_search_tuple_build(space, page_no, heap);
 
 	if (block) {
 		/* Move the ownership of the x-latch on the page to this OS
@@ -4753,7 +4621,7 @@ reset_bit:
 
 	if (update_ibuf_bitmap && !tablespace_being_deleted) {
 
-		fil_decr_pending_ops(space);
+		fil_decr_pending_ibuf_merges(space);
 	}
 
 #ifdef UNIV_IBUF_COUNT_DEBUG
@@ -4787,7 +4655,7 @@ ibuf_delete_for_discarded_space(
 	/* Use page number 0 to build the search tuple so that we get the
 	cursor positioned at the first entry for this space id */
 
-	search_tuple = ibuf_new_search_tuple_build(space, 0, heap);
+	search_tuple = ibuf_search_tuple_build(space, 0, heap);
 
 	memset(dops, 0, sizeof(dops));
 loop:
diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic
index 55bdb289b21..6f7a66b12ac 100644
--- a/storage/innobase/include/btr0btr.ic
+++ b/storage/innobase/include/btr0btr.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -81,7 +81,7 @@ btr_page_set_index_id(
 	index_id_t	id,	/*!< in: index id */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), id);
 		page_zip_write_header(page_zip,
 				      page + (PAGE_HEADER + PAGE_INDEX_ID),
@@ -156,7 +156,7 @@ btr_page_set_level(
 	ut_ad(page && mtr);
 	ut_ad(level <= BTR_MAX_NODE_LEVEL);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_2(page + (PAGE_HEADER + PAGE_LEVEL), level);
 		page_zip_write_header(page_zip,
 				      page + (PAGE_HEADER + PAGE_LEVEL),
@@ -199,7 +199,7 @@ btr_page_set_next(
 {
 	ut_ad(page && mtr);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_4(page + FIL_PAGE_NEXT, next);
 		page_zip_write_header(page_zip, page + FIL_PAGE_NEXT, 4, mtr);
 	} else {
@@ -236,7 +236,7 @@ btr_page_set_prev(
 {
 	ut_ad(page && mtr);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_4(page + FIL_PAGE_PREV, prev);
 		page_zip_write_header(page_zip, page + FIL_PAGE_PREV, 4, mtr);
 	} else {
@@ -272,7 +272,7 @@ btr_node_ptr_get_child_page_no(
 
 	page_no = mach_read_from_4(field);
 
-	if (UNIV_UNLIKELY(page_no == 0)) {
+	if (page_no == 0) {
 		fprintf(stderr,
 			"InnoDB: a nonsensical page number 0"
 			" in a node ptr record at offset %lu\n",
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index cbc6103c2ee..354b6982a13 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -57,9 +57,6 @@ page_cur_t*
 btr_cur_get_page_cur(
 /*=================*/
 	const btr_cur_t*	cursor);/*!< in: tree cursor */
-#else /* UNIV_DEBUG */
-# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
-#endif /* UNIV_DEBUG */
 /*********************************************************//**
 Returns the buffer block on which the tree cursor is positioned.
 @return	pointer to buffer block */
@@ -67,7 +64,7 @@ UNIV_INLINE
 buf_block_t*
 btr_cur_get_block(
 /*==============*/
-	btr_cur_t*	cursor);/*!< in: tree cursor */
+	const btr_cur_t*	cursor);/*!< in: tree cursor */
 /*********************************************************//**
 Returns the record pointer of a tree cursor.
 @return	pointer to record */
@@ -75,7 +72,12 @@ UNIV_INLINE
 rec_t*
 btr_cur_get_rec(
 /*============*/
-	btr_cur_t*	cursor);/*!< in: tree cursor */
+	const btr_cur_t*	cursor);/*!< in: tree cursor */
+#else /* UNIV_DEBUG */
+# define btr_cur_get_page_cur(cursor)	(&(cursor)->page_cur)
+# define btr_cur_get_block(cursor)	((cursor)->page_cur.block)
+# define btr_cur_get_rec(cursor)	((cursor)->page_cur.rec)
+#endif /* UNIV_DEBUG */
 /*********************************************************//**
 Returns the compressed page on which the tree cursor is positioned.
 @return	pointer to compressed page, or NULL if the page is not compressed */
@@ -101,12 +103,9 @@ btr_cur_get_page(
 	btr_cur_t*	cursor);/*!< in: tree cursor */
 /*********************************************************//**
 Returns the index of a cursor.
+@param cursor	b-tree cursor
 @return	index */
-UNIV_INLINE
-dict_index_t*
-btr_cur_get_index(
-/*==============*/
-	btr_cur_t*	cursor);/*!< in: B-tree cursor */
+#define btr_cur_get_index(cursor) ((cursor)->index)
 /*********************************************************//**
 Positions a tree cursor at a given record. */
 UNIV_INLINE
@@ -474,7 +473,8 @@ btr_estimate_n_rows_in_range(
 /*******************************************************************//**
 Estimates the number of different key values in a given index, for
 each n-column prefix of the index where n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals.
+The estimates are stored in the array index->stat_n_diff_key_vals[] and
+the number of pages that were sampled is saved in index->stat_n_sample_sizes[].
 If innodb_stats_method is nulls_ignored, we also record the number of
 non-null values for each prefix and stored the estimates in
 array index->stat_n_non_null_key_vals. */
@@ -595,6 +595,23 @@ btr_copy_externally_stored_field_prefix(
 				a lock or a page latch */
 	ulint		local_len);/*!< in: length of data, in bytes */
 /*******************************************************************//**
+Copies an externally stored field of a record to mem heap.  The
+clustered index record must be protected by a lock or a page latch.
+@return the whole field copied to heap */
+UNIV_INTERN
+byte*
+btr_copy_externally_stored_field(
+/*=============================*/
+	ulint*		len,	/*!< out: length of the whole field */
+	const byte*	data,	/*!< in: 'internally' stored part of the
+				field containing also the reference to
+				the external part; must be protected by
+				a lock or a page latch */
+	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
+				zero for uncompressed BLOBs */
+	ulint		local_len,/*!< in: length of data */
+	mem_heap_t*	heap);	/*!< in: mem heap */
+/*******************************************************************//**
 Copies an externally stored field of a record to mem heap.
 @return	the field copied to heap, or NULL if the field is incomplete */
 UNIV_INTERN
diff --git a/storage/innobase/include/btr0cur.ic b/storage/innobase/include/btr0cur.ic
index e31f77c77eb..540417e3062 100644
--- a/storage/innobase/include/btr0cur.ic
+++ b/storage/innobase/include/btr0cur.ic
@@ -38,7 +38,7 @@ btr_cur_get_page_cur(
 {
 	return(&((btr_cur_t*) cursor)->page_cur);
 }
-#endif /* UNIV_DEBUG */
+
 /*********************************************************//**
 Returns the buffer block on which the tree cursor is positioned.
 @return	pointer to buffer block */
@@ -46,7 +46,7 @@ UNIV_INLINE
 buf_block_t*
 btr_cur_get_block(
 /*==============*/
-	btr_cur_t*	cursor)	/*!< in: tree cursor */
+	const btr_cur_t*	cursor)	/*!< in: tree cursor */
 {
 	return(page_cur_get_block(btr_cur_get_page_cur(cursor)));
 }
@@ -58,10 +58,11 @@ UNIV_INLINE
 rec_t*
 btr_cur_get_rec(
 /*============*/
-	btr_cur_t*	cursor)	/*!< in: tree cursor */
+	const btr_cur_t*	cursor)	/*!< in: tree cursor */
 {
-	return(page_cur_get_rec(&(cursor->page_cur)));
+	return(page_cur_get_rec(btr_cur_get_page_cur(cursor)));
 }
+#endif /* UNIV_DEBUG */
 
 /*********************************************************//**
 Returns the compressed page on which the tree cursor is positioned.
@@ -99,18 +100,6 @@ btr_cur_get_page(
 }
 
 /*********************************************************//**
-Returns the index of a cursor.
-@return	index */
-UNIV_INLINE
-dict_index_t*
-btr_cur_get_index(
-/*==============*/
-	btr_cur_t*	cursor)	/*!< in: B-tree cursor */
-{
-	return(cursor->index);
-}
-
-/*********************************************************//**
 Positions a tree cursor at a given record. */
 UNIV_INLINE
 void
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
index 2ebd70a6f23..a8eaac4690b 100644
--- a/storage/innobase/include/btr0pcur.h
+++ b/storage/innobase/include/btr0pcur.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -92,9 +92,10 @@ Initializes and opens a persistent cursor to an index tree. It should be
 closed with btr_pcur_close. */
 UNIV_INLINE
 void
-btr_pcur_open_func(
-/*===============*/
+btr_pcur_open_low(
+/*==============*/
 	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: level in the btree */
 	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
 	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
 				NOTE that if the search is made using a unique
@@ -108,7 +109,7 @@ btr_pcur_open_func(
 	ulint		line,	/*!< in: line where called */
 	mtr_t*		mtr);	/*!< in: mtr */
 #define btr_pcur_open(i,t,md,l,c,m)				\
-	btr_pcur_open_func(i,t,md,l,c,__FILE__,__LINE__,m)
+	btr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,m)
 /**************************************************************//**
 Opens an persistent cursor to an index tree without initializing the
 cursor. */
diff --git a/storage/innobase/include/btr0pcur.ic b/storage/innobase/include/btr0pcur.ic
index 054ce753c7d..a27033c4a7c 100644
--- a/storage/innobase/include/btr0pcur.ic
+++ b/storage/innobase/include/btr0pcur.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -406,9 +406,10 @@ Initializes and opens a persistent cursor to an index tree. It should be
 closed with btr_pcur_close. */
 UNIV_INLINE
 void
-btr_pcur_open_func(
-/*===============*/
+btr_pcur_open_low(
+/*==============*/
 	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: level in the btree */
 	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
 	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
 				NOTE that if the search is made using a unique
@@ -435,7 +436,7 @@ btr_pcur_open_func(
 
 	btr_cursor = btr_pcur_get_btr_cur(cursor);
 
-	btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+	btr_cur_search_to_nth_level(index, level, tuple, mode, latch_mode,
 				    btr_cursor, 0, file, line, mtr);
 	cursor->pos_state = BTR_PCUR_IS_POSITIONED;
 
diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h
index 1f920471f7d..5316c3efd39 100644
--- a/storage/innobase/include/btr0sea.h
+++ b/storage/innobase/include/btr0sea.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/btr0sea.ic b/storage/innobase/include/btr0sea.ic
index beadeeb8d02..49ba0fd3f0b 100644
--- a/storage/innobase/include/btr0sea.ic
+++ b/storage/innobase/include/btr0sea.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
index 5adc858b931..62b7868b419 100644
--- a/storage/innobase/include/btr0types.h
+++ b/storage/innobase/include/btr0types.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h
index 2d7d6146092..fab9a4b828b 100644
--- a/storage/innobase/include/buf0buddy.h
+++ b/storage/innobase/include/buf0buddy.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -47,7 +47,7 @@ buf_buddy_alloc(
 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
 					the page resides */
 	ulint		size,		/*!< in: compressed page size
-					(between PAGE_ZIP_MIN_SIZE and
+					(between UNIV_ZIP_SIZE_MIN and
 					UNIV_PAGE_SIZE) */
 	ibool*		lru)		/*!< in: pointer to a variable
 					that will be assigned TRUE if
diff --git a/storage/innobase/include/buf0buddy.ic b/storage/innobase/include/buf0buddy.ic
index b8281f7341a..be2f950162d 100644
--- a/storage/innobase/include/buf0buddy.ic
+++ b/storage/innobase/include/buf0buddy.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -77,7 +77,7 @@ buf_buddy_get_slot(
 	ulint	i;
 	ulint	s;
 
-	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
 
 	for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
 	}
@@ -99,7 +99,7 @@ buf_buddy_alloc(
 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
 					the page resides */
 	ulint		size,		/*!< in: compressed page size
-					(between PAGE_ZIP_MIN_SIZE and
+					(between UNIV_ZIP_SIZE_MIN and
 					UNIV_PAGE_SIZE) */
 	ibool*		lru)		/*!< in: pointer to a variable
 					that will be assigned TRUE if
@@ -109,7 +109,7 @@ buf_buddy_alloc(
 {
 	ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(ut_is_2pow(size));
-	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
 	ut_ad(size <= UNIV_PAGE_SIZE);
 
 	return((byte*) buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size),
@@ -131,7 +131,7 @@ buf_buddy_free(
 {
 	ut_ad(buf_pool_mutex_own(buf_pool));
 	ut_ad(ut_is_2pow(size));
-	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
 	ut_ad(size <= UNIV_PAGE_SIZE);
 
 	buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size));
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index d9e6801eb86..08e61c08004 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -36,6 +36,7 @@ Created 11/5/1995 Heikki Tuuri
 #ifndef UNIV_HOTBACKUP
 #include "ut0rbt.h"
 #include "os0proc.h"
+#include "log0log.h"
 
 /** @name Modes for buf_page_get_gen */
 /* @{ */
@@ -68,11 +69,18 @@ Created 11/5/1995 Heikki Tuuri
 					position of the block. */
 /* @} */
 
-#define MAX_BUFFER_POOLS 64		/*!< The maximum number of buffer
+#define MAX_BUFFER_POOLS_BITS	6	/*!< Number of bits to representing
+					a buffer pool ID */
+
+#define MAX_BUFFER_POOLS 	(1 << MAX_BUFFER_POOLS_BITS)
+					/*!< The maximum number of buffer
 					pools that can be defined */
 
-#define BUF_POOL_WATCH_SIZE 1		/*!< Maximum number of concurrent
+#define BUF_POOL_WATCH_SIZE		(srv_n_purge_threads + 1)
+					/*!< Maximum number of concurrent
 					buffer pool watches */
+#define MAX_PAGE_HASH_LOCKS	1024	/*!< The maximum number of
+					page_hash locks */
 
 extern	buf_pool_t*	buf_pool_ptr;	/*!< The buffer pools
 					of the database */
@@ -137,10 +145,12 @@ struct buf_pool_info_struct{
 	ulint	n_pend_reads;		/*!< buf_pool->n_pend_reads, pages
 					pending read */
 	ulint	n_pending_flush_lru;	/*!< Pages pending flush in LRU */
+	ulint	n_pending_flush_single_page;/*!< Pages pending to be
+					flushed as part of single page
+					flushes issued by various user
+					threads */
 	ulint	n_pending_flush_list;	/*!< Pages pending flush in FLUSH
 					LIST */
-	ulint	n_pending_flush_single_page;/*!< Pages pending flush in
-					BUF_FLUSH_SINGLE_PAGE list */
 	ulint	n_pages_made_young;	/*!< number of pages made young */
 	ulint	n_pages_not_made_young;	/*!< number of pages not made young */
 	ulint	n_pages_read;		/*!< buf_pool->n_pages_read */
@@ -267,9 +277,10 @@ Gets the smallest oldest_modification lsn for any page in the pool. Returns
 zero if all modified pages have been flushed to disk.
 @return	oldest modification in pool, zero if none */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 buf_pool_get_oldest_modification(void);
 /*==================================*/
+
 /********************************************************************//**
 Allocates a buf_page_t descriptor. This function must succeed. In case
 of failure we assert in this function. */
@@ -341,8 +352,7 @@ buf_page_optimistic_get(
 /*====================*/
 	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
 	buf_block_t*	block,	/*!< in: guessed block */
-	ib_uint64_t	modify_clock,/*!< in: modify clock value if mode is
-				..._GUESS_ON_CLOCK */
+	ib_uint64_t	modify_clock,/*!< in: modify clock value */
 	const char*	file,	/*!< in: file name */
 	ulint		line,	/*!< in: line where called */
 	mtr_t*		mtr);	/*!< in: mini-transaction */
@@ -364,7 +374,7 @@ buf_page_get_known_nowait(
 /*******************************************************************//**
 Given a tablespace id and page number tries to get that page. If the
 page is not in the buffer pool it is not loaded and NULL is returned.
-Suitable for using when holding the kernel mutex. */
+Suitable for using when holding the lock_sys_t::mutex. */
 UNIV_INTERN
 const buf_block_t*
 buf_page_try_get_func(
@@ -376,7 +386,7 @@ buf_page_try_get_func(
 	mtr_t*		mtr);	/*!< in: mini-transaction */
 
 /** Tries to get a page. If the page is not in the buffer pool it is
-not loaded.  Suitable for using when holding the kernel mutex.
+not loaded.  Suitable for using when holding the lock_sys_t::mutex.
 @param space_id	in: tablespace id
 @param page_no	in: page number
 @param mtr	in: mini-transaction
@@ -489,15 +499,6 @@ buf_page_peek(
 /*==========*/
 	ulint	space,	/*!< in: space id */
 	ulint	offset);/*!< in: page number */
-/********************************************************************//**
-Resets the check_index_page_at_flush field of a page if found in the buffer
-pool. */
-UNIV_INTERN
-void
-buf_reset_check_index_page_at_flush(
-/*================================*/
-	ulint	space,	/*!< in: space id */
-	ulint	offset);/*!< in: page number */
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
 /********************************************************************//**
 Sets file_page_was_freed TRUE if the page is found in the buffer pool.
@@ -570,7 +571,7 @@ Gets the youngest modification log sequence number for a frame.
 Returns zero if not file page or no modification occurred yet.
 @return	newest modification to page */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 buf_page_get_newest_modification(
 /*=============================*/
 	const buf_page_t*	bpage);	/*!< in: block containing the
@@ -622,29 +623,6 @@ buf_block_buf_fix_inc_func(
 # define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
 #endif /* UNIV_SYNC_DEBUG */
 /********************************************************************//**
-Calculates a page checksum which is stored to the page when it is written
-to a file. Note that we must be careful to calculate the same value
-on 32-bit and 64-bit architectures.
-@return	checksum */
-UNIV_INTERN
-ulint
-buf_calc_page_new_checksum(
-/*=======================*/
-	const byte*	page);	/*!< in: buffer page */
-/********************************************************************//**
-In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
-looked at the first few bytes of the page. This calculates that old
-checksum.
-NOTE: we must first store the new formula checksum to
-FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
-because this takes that field as an input!
-@return	checksum */
-UNIV_INTERN
-ulint
-buf_calc_page_old_checksum(
-/*=======================*/
-	const byte*	 page);	/*!< in: buffer page */
-/********************************************************************//**
 Checks if a page is corrupt.
 @return	TRUE if corrupted */
 UNIV_INTERN
@@ -687,6 +665,17 @@ buf_pool_contains_zip(
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
 	const void*	data);		/*!< in: pointer to compressed page */
 #endif /* UNIV_DEBUG */
+
+/***********************************************************************
+FIXME_FTS: Gets the frame the pointer is pointing to. */
+UNIV_INLINE
+buf_frame_t*
+buf_frame_align(
+/*============*/
+                        /* out: pointer to frame */
+        byte*   ptr);   /* in: pointer to a frame */
+
+
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /*********************************************************************//**
 Validates the buffer pool data structure.
@@ -724,7 +713,7 @@ buf_page_print(
 	ulint		flags)		/*!< in: 0 or
 					BUF_PAGE_PRINT_NO_CRASH or
 					BUF_PAGE_PRINT_NO_FULL */
-	__attribute__((nonnull));
+	UNIV_COLD __attribute__((nonnull));
 /********************************************************************//**
 Decompress a block.
 @return	TRUE if successful */
@@ -745,12 +734,12 @@ buf_get_latched_pages_number(void);
 /*==============================*/
 #endif /* UNIV_DEBUG */
 /*********************************************************************//**
-Returns the number of pending buf pool ios.
-@return	number of pending I/O operations */
+Returns the number of pending buf pool read ios.
+@return	number of pending read I/O operations */
 UNIV_INTERN
 ulint
-buf_get_n_pending_ios(void);
-/*=======================*/
+buf_get_n_pending_read_ios(void);
+/*============================*/
 /*********************************************************************//**
 Prints info of the buffer i/o. */
 UNIV_INTERN
@@ -758,6 +747,18 @@ void
 buf_print_io(
 /*=========*/
 	FILE*	file);	/*!< in: file where to print */
+/*******************************************************************//**
+Collect buffer pool stats information for a buffer pool. Also
+record aggregated stats if there are more than one buffer pool
+in the server */
+UNIV_INTERN
+void
+buf_stats_get_pool_info(
+/*====================*/
+	buf_pool_t*		buf_pool,	/*!< in: buffer pool */
+	ulint			pool_id,	/*!< in: buffer pool ID */
+	buf_pool_info_t*	all_pool_info);	/*!< in/out: buffer pool info
+						to fill */
 /*********************************************************************//**
 Returns the ratio in percents of modified pages in the buffer pool /
 database pages in the buffer pool.
@@ -792,8 +793,8 @@ pool.
 @return	number of pending i/o operations */
 UNIV_INTERN
 ulint
-buf_pool_check_num_pending_io(void);
-/*===============================*/
+buf_pool_check_no_pending_io(void);
+/*==============================*/
 /*********************************************************************//**
 Invalidates the file pages in the buffer pool when an archive recovery is
 completed. All the file pages buffered must be in a replaceable state when
@@ -1095,7 +1096,7 @@ buf_block_get_zip_size(
 Gets the compressed page descriptor corresponding to an uncompressed page
 if applicable. */
 #define buf_block_get_page_zip(block) \
-	(UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL)
+	((block)->page.zip.data ? &(block)->page.zip : NULL)
 #ifndef UNIV_HOTBACKUP
 /*******************************************************************//**
 Gets the block to whose frame the pointer is pointing to.
@@ -1229,35 +1230,83 @@ UNIV_INLINE
 buf_page_t*
 buf_page_hash_get_low(
 /*==================*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	ulint		space,		/*!< in: space id */
-	ulint		offset,		/*!< in: offset of the page
-					within space */
-	ulint		fold);		/*!< in: buf_page_address_fold(
-					space, offset) */
+	buf_pool_t*	buf_pool,/*!< buffer pool instance */
+	ulint		space,	/*!< in: space id */
+	ulint		offset,	/*!< in: offset of the page within space */
+	ulint		fold);	/*!< in: buf_page_address_fold(space, offset) */
 /******************************************************************//**
 Returns the control block of a file page, NULL if not found.
-@return	block, NULL if not found or not a real control block */
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode. Otherwise,
+mode value is ignored. It is up to the caller to release the
+lock. If the block is found and the lock is NULL then the page_hash
+lock is released by this function.
+@return	block, NULL if not found */
 UNIV_INLINE
 buf_page_t*
-buf_page_hash_get(
-/*==============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+buf_page_hash_get_locked(
+/*=====================*/
+					/*!< out: pointer to the bpage,
+					or NULL; if NULL, hash_lock
+					is also NULL. */
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	ulint		space,		/*!< in: space id */
-	ulint		offset);	/*!< in: offset of the page
-					within space */
+	ulint		offset,		/*!< in: page number */
+	rw_lock_t**	lock,		/*!< in/out: lock of the page
+					hash acquired if bpage is
+					found. NULL otherwise. If NULL
+					is passed then the hash_lock
+					is released by this function */
+	ulint		lock_mode);	/*!< in: RW_LOCK_EX or
+					RW_LOCK_SHARED. Ignored if
+					lock == NULL */
 /******************************************************************//**
-Returns the control block of a file page, NULL if not found
-or an uncompressed page frame does not exist.
+Returns the control block of a file page, NULL if not found.
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode. Otherwise,
+mode value is ignored. It is up to the caller to release the
+lock. If the block is found and the lock is NULL then the page_hash
+lock is released by this function.
 @return	block, NULL if not found */
 UNIV_INLINE
 buf_block_t*
-buf_block_hash_get(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+buf_block_hash_get_locked(
+/*=====================*/
+					/*!< out: pointer to the bpage,
+					or NULL; if NULL, hash_lock
+					is also NULL. */
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	ulint		space,		/*!< in: space id */
-	ulint		offset);	/*!< in: offset of the page
-					within space */
+	ulint		offset,		/*!< in: page number */
+	rw_lock_t**	lock,		/*!< in/out: lock of the page
+					hash acquired if bpage is
+					found. NULL otherwise. If NULL
+					is passed then the hash_lock
+					is released by this function */
+	ulint		lock_mode);	/*!< in: RW_LOCK_EX or
+					RW_LOCK_SHARED. Ignored if
+					lock == NULL */
+/* There are four different ways we can try to get a bpage or block
+from the page hash:
+1) Caller already holds the appropriate page hash lock: in the case call
+buf_page_hash_get_low() function.
+2) Caller wants to hold page hash lock in x-mode
+3) Caller wants to hold page hash lock in s-mode
+4) Caller doesn't want to hold page hash lock */
+#define buf_page_hash_get_s_locked(b, s, o, l)			\
+	buf_page_hash_get_locked(b, s, o, l, RW_LOCK_SHARED)
+#define buf_page_hash_get_x_locked(b, s, o, l)			\
+	buf_page_hash_get_locked(b, s, o, l, RW_LOCK_EX)
+#define buf_page_hash_get(b, s, o)				\
+	buf_page_hash_get_locked(b, s, o, NULL, 0)
+
+#define buf_block_hash_get_s_locked(b, s, o, l)			\
+	buf_block_hash_get_locked(b, s, o, l, RW_LOCK_SHARED)
+#define buf_block_hash_get_x_locked(b, s, o, l)			\
+	buf_block_hash_get_locked(b, s, o, l, RW_LOCK_EX)
+#define buf_block_hash_get(b, s, o)				\
+	buf_block_hash_get_locked(b, s, o, NULL, 0)
+
 /*********************************************************************//**
 Gets the current length of the free list of buffer blocks.
 @return	length of the free list */
@@ -1324,12 +1373,25 @@ void
 buf_get_total_stat(
 /*===============*/
 	buf_pool_stat_t*tot_stat);	/*!< out: buffer pool stats */
+/*********************************************************************//**
+Get the nth chunk's buffer block in the specified buffer pool.
+@return the nth chunk's buffer block. */
+UNIV_INLINE
+buf_block_t*
+buf_get_nth_chunk_block(
+/*====================*/
+	const buf_pool_t* buf_pool,	/*!< in: buffer pool instance */
+	ulint		n,		/*!< in: nth chunk in the buffer pool */
+	ulint*		chunk_size);	/*!< in: chunk size */
 
 #endif /* !UNIV_HOTBACKUP */
 
 /** The common buffer control block structure
 for compressed and uncompressed frames */
 
+/** Number of bits used for buffer page states. */
+#define BUF_PAGE_STATE_BITS	3
+
 struct buf_page_struct{
 	/** @name General fields
 	None of these bit-fields must be modified without holding
@@ -1344,13 +1406,23 @@ struct buf_page_struct{
 	unsigned	offset:32;	/*!< page number; also protected
 					by buf_pool->mutex. */
 
-	unsigned	state:3;	/*!< state of the control block; also
+	unsigned	state:BUF_PAGE_STATE_BITS;
+					/*!< state of the control block; also
 					protected by buf_pool->mutex.
 					State transitions from
 					BUF_BLOCK_READY_FOR_USE to
 					BUF_BLOCK_MEMORY need not be
 					protected by buf_page_get_mutex().
-					@see enum buf_page_state */
+					@see enum buf_page_state.
+					State changes that are relevant
+					to page_hash are additionally
+					protected by the appropriate
+					page_hash mutex i.e.: if a page
+					is in page_hash or is being
+					added to/removed from page_hash
+					then the corresponding changes
+					must also be protected by
+					page_hash mutex. */
 #ifndef UNIV_HOTBACKUP
 	unsigned	flush_type:2;	/*!< if this block is currently being
 					flushed to disk, this tells the
@@ -1432,13 +1504,13 @@ struct buf_page_struct{
 					should hold: in_free_list
 					== (state == BUF_BLOCK_NOT_USED) */
 #endif /* UNIV_DEBUG */
-	ib_uint64_t	newest_modification;
+	lsn_t		newest_modification;
 					/*!< log sequence number of
 					the youngest modification to
 					this block, zero if not
 					modified. Protected by block
 					mutex */
-	ib_uint64_t	oldest_modification;
+	lsn_t		oldest_modification;
 					/*!< log sequence number of
 					the START of the log entry
 					written of the oldest
@@ -1480,8 +1552,10 @@ struct buf_page_struct{
 	/* @} */
 # if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
 	ibool		file_page_was_freed;
-					/*!< this is set to TRUE when fsp
-					frees a page in buffer pool */
+					/*!< this is set to TRUE when
+					fsp frees a page in buffer pool;
+					protected by buf_pool->zip_mutex
+					or buf_block_struct::mutex. */
 # endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
 #endif /* !UNIV_HOTBACKUP */
 };
@@ -1575,7 +1649,7 @@ struct buf_block_struct{
 	- we know that buf_block_struct::buf_fix_count == 0.
 
 	An exception to this is when we init or create a page
-	in the buffer pool in buf0buf.c.
+	in the buffer pool in buf0buf.cc.
 
 	Another exception is that assigning block->index = NULL
 	is allowed whenever holding an x-latch on btr_search_latch. */
@@ -1701,7 +1775,14 @@ struct buf_pool_struct{
 	hash_table_t*	page_hash;	/*!< hash table of buf_page_t or
 					buf_block_t file pages,
 					buf_page_in_file() == TRUE,
-					indexed by (space_id, offset) */
+					indexed by (space_id, offset).
+					page_hash is protected by an
+					array of mutexes.
+					Changes in page_hash are protected
+					by buf_pool->mutex and the relevant
+					page_hash mutex. Lookups can happen
+					while holding the buf_pool->mutex or
+					the relevant page_hash mutex. */
 	hash_table_t*	zip_hash;	/*!< hash table of buf_block_t blocks
 					whose frames are allocated to the
 					zip buddy system,
@@ -1713,7 +1794,7 @@ struct buf_pool_struct{
 	time_t		last_printout_time;
 					/*!< when buf_print_io was last time
 					called */
-	buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES + 1];
+	buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
 					/*!< Statistics of buddy system,
 					indexed by block size */
 	buf_pool_stat_t	stat;		/*!< current statistics */
@@ -1768,10 +1849,16 @@ struct buf_pool_struct{
 					to read this for heuristic
 					purposes without holding any
 					mutex or latch */
-	ulint		LRU_flush_ended;/*!< when an LRU flush ends for a page,
-					this is incremented by one; this is
-					set to zero when a buffer block is
-					allocated */
+	ibool		try_LRU_scan;	/*!< Set to FALSE when an LRU
+					scan for free block fails. This
+					flag is used to avoid repeated
+					scans of LRU list when we know
+					that there is no free block
+					available in the scan depth for
+					eviction. Set to TRUE whenever
+					we flush a batch from the
+					buffer pool. Protected by the
+					buf_pool->mutex */
 	/* @} */
 
 	/** @name LRU replacement algorithm fields */
@@ -1792,7 +1879,7 @@ struct buf_pool_struct{
 	ulint		LRU_old_len;	/*!< length of the LRU list from
 					the block to which LRU_old points
 					onward, including that block;
-					see buf0lru.c for the restrictions
+					see buf0lru.cc for the restrictions
 					on this value; 0 if LRU_old == NULL;
 					NOTE: LRU_old_len must be adjusted
 					whenever LRU_old shrinks or grows! */
@@ -1811,19 +1898,16 @@ struct buf_pool_struct{
 	UT_LIST_BASE_NODE_T(buf_page_t)	zip_clean;
 					/*!< unmodified compressed pages */
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-	UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES];
+	UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES_MAX];
 					/*!< buddy free lists */
 
-	buf_page_t			watch[BUF_POOL_WATCH_SIZE];
+	buf_page_t*			watch;
 					/*!< Sentinel records for buffer
 					pool watches. Protected by
-				       	buf_pool->mutex. */
+					buf_pool->mutex. */
 
-#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE
-# error "BUF_BUDDY_HIGH != UNIV_PAGE_SIZE"
-#endif
-#if BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE
-# error "BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE"
+#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
+# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
 #endif
 	/* @} */
 };
@@ -1854,6 +1938,47 @@ Use these instead of accessing buf_pool->mutex directly. */
 
 
 
+/** Get appropriate page_hash_lock. */
+# define buf_page_hash_lock_get(b, f)		\
+	hash_get_lock(b->page_hash, f)
+
+#ifdef UNIV_SYNC_DEBUG
+/** Test if page_hash lock is held in s-mode. */
+# define buf_page_hash_lock_held_s(b, p)		\
+	rw_lock_own(buf_page_hash_lock_get(b,		\
+		  buf_page_address_fold(p->space,	\
+					p->offset)),	\
+					RW_LOCK_SHARED)
+
+/** Test if page_hash lock is held in x-mode. */
+# define buf_page_hash_lock_held_x(b, p)		\
+	rw_lock_own(buf_page_hash_lock_get(b,		\
+		  buf_page_address_fold(p->space,	\
+					p->offset)),	\
+					RW_LOCK_EX)
+
+/** Test if page_hash lock is held in x or s-mode. */
+# define buf_page_hash_lock_held_s_or_x(b, p)		\
+	(buf_page_hash_lock_held_s(b, p)		\
+	 || buf_page_hash_lock_held_x(b, p))
+
+# define buf_block_hash_lock_held_s(b, p)		\
+	buf_page_hash_lock_held_s(b, &(p->page))
+
+# define buf_block_hash_lock_held_x(b, p)		\
+	buf_page_hash_lock_held_x(b, &(p->page))
+
+# define buf_block_hash_lock_held_s_or_x(b, p)		\
+	buf_page_hash_lock_held_s_or_x(b, &(p->page))
+#else /* UNIV_SYNC_DEBUG */
+# define buf_page_hash_lock_held_s(b, p)	(TRUE)
+# define buf_page_hash_lock_held_x(b, p)	(TRUE)
+# define buf_page_hash_lock_held_s_or_x(b, p)	(TRUE)
+# define buf_block_hash_lock_held_s(b, p)	(TRUE)
+# define buf_block_hash_lock_held_x(b, p)	(TRUE)
+# define buf_block_hash_lock_held_s_or_x(b, p)	(TRUE)
+#endif /* UNIV_SYNC_DEBUG */
+
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /** Forbid the release of the buffer pool mutex. */
 # define buf_pool_mutex_exit_forbid(b) do {	\
@@ -1926,6 +2051,32 @@ FILE_PAGE => NOT_USED	NOTE: This transition is allowed if and only if
 				(3) io_fix == 0.
 */
 
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/** Functor to validate the LRU list. */
+struct	CheckInLRUList {
+	void	operator()(const buf_page_t* elem) const
+	{
+		ut_a(elem->in_LRU_list);
+	}
+};
+
+/** Functor to validate the LRU list. */
+struct	CheckInFreeList {
+	void	operator()(const buf_page_t* elem) const
+	{
+		ut_a(elem->in_free_list);
+	}
+};
+
+struct	CheckUnzipLRUAndLRUList {
+	void	operator()(const buf_block_t* elem) const
+	{
+                ut_a(elem->page.in_LRU_list);
+                ut_a(elem->in_unzip_LRU_list);
+	}
+};
+#endif /* UNIV_DEBUG || defined UNIV_BUF_DEBUG */
+
 #ifndef UNIV_NONINL
 #include "buf0buf.ic"
 #endif
diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic
index 917ee5dda84..88c29ab5603 100644
--- a/storage/innobase/include/buf0buf.ic
+++ b/storage/innobase/include/buf0buf.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -35,6 +35,16 @@ Created 11/5/1995 Heikki Tuuri
 #include "buf0lru.h"
 #include "buf0rea.h"
 
+#ifndef UNIV_HOTBACKUP
+/** A chunk of buffers. The buffer pool is allocated in chunks. */
+struct buf_chunk_struct{
+	ulint		mem_size;	/*!< allocated size of the chunk */
+	ulint		size;		/*!< size of frames[] and blocks[] */
+	void*		mem;		/*!< pointer to the memory area which
+					was allocated for the frames */
+	buf_block_t*	blocks;		/*!< array of buffer control blocks */
+};
+
 /*********************************************************************//**
 Gets the current size of buffer buf_pool in bytes.
 @return size in bytes */
@@ -160,7 +170,7 @@ buf_page_peek_if_too_old(
 {
 	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
 
-	if (UNIV_UNLIKELY(buf_pool->freed_page_clock == 0)) {
+	if (buf_pool->freed_page_clock == 0) {
 		/* If eviction has not started yet, do not update the
 		statistics or move blocks in the LRU list.  This is
 		either the warm-up phase or an in-memory workload. */
@@ -180,6 +190,7 @@ buf_page_peek_if_too_old(
 		return(!buf_page_peek_if_young(bpage));
 	}
 }
+#endif /* !UNIV_HOTBACKUP */
 
 /*********************************************************************//**
 Gets the state of a block.
@@ -361,11 +372,11 @@ buf_page_get_flush_type(
 #ifdef UNIV_DEBUG
 	switch (flush_type) {
 	case BUF_FLUSH_LRU:
-	case BUF_FLUSH_SINGLE_PAGE:
 	case BUF_FLUSH_LIST:
+	case BUF_FLUSH_SINGLE_PAGE:
 		return(flush_type);
 	case BUF_FLUSH_N_TYPES:
-		break;
+		ut_error;
 	}
 	ut_error;
 #endif /* UNIV_DEBUG */
@@ -628,7 +639,7 @@ buf_page_get_block(
 /*===============*/
 	buf_page_t*	bpage)	/*!< in: control block, or NULL */
 {
-	if (UNIV_LIKELY(bpage != NULL)) {
+	if (bpage != NULL) {
 		ut_ad(buf_page_in_file(bpage));
 
 		if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
@@ -719,6 +730,23 @@ buf_page_get_page_no(
 
 	return(bpage->offset);
 }
+/***********************************************************************
+FIXME_FTS Gets the frame the pointer is pointing to. */
+UNIV_INLINE
+buf_frame_t*
+buf_frame_align(
+/*============*/
+                        /* out: pointer to frame */
+        byte*   ptr)    /* in: pointer to a frame */
+{
+        buf_frame_t*    frame;
+
+        ut_ad(ptr);
+
+        frame = (buf_frame_t*) ut_align_down(ptr, UNIV_PAGE_SIZE);
+
+        return(frame);
+}
 
 /*********************************************************************//**
 Gets the page number of a block.
@@ -744,7 +772,8 @@ buf_page_get_zip_size(
 /*==================*/
 	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
 {
-	return(bpage->zip.ssize ? 512 << bpage->zip.ssize : 0);
+	return(bpage->zip.ssize
+	       ? (UNIV_ZIP_SIZE_MIN >> 1) << bpage->zip.ssize : 0);
 }
 
 /*********************************************************************//**
@@ -756,7 +785,8 @@ buf_block_get_zip_size(
 /*===================*/
 	const buf_block_t*	block)	/*!< in: pointer to the control block */
 {
-	return(block->page.zip.ssize ? 512 << block->page.zip.ssize : 0);
+	return(block->page.zip.ssize
+	       ? (UNIV_ZIP_SIZE_MIN >> 1) << block->page.zip.ssize : 0);
 }
 
 #ifndef UNIV_HOTBACKUP
@@ -852,7 +882,7 @@ buf_block_free(
 /*===========*/
 	buf_block_t*	block)	/*!< in, own: block to be freed */
 {
-	buf_pool_t*	buf_pool = buf_pool_from_bpage((buf_page_t*)block);
+	buf_pool_t*	buf_pool = buf_pool_from_bpage((buf_page_t*) block);
 
 	buf_pool_mutex_enter(buf_pool);
 
@@ -905,13 +935,13 @@ Gets the youngest modification log sequence number for a frame.
 Returns zero if not file page or no modification occurred yet.
 @return	newest modification to page */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 buf_page_get_newest_modification(
 /*=============================*/
 	const buf_page_t*	bpage)	/*!< in: block containing the
 					page frame */
 {
-	ib_uint64_t	lsn;
+	lsn_t		lsn;
 	mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 
 	mutex_enter(block_mutex);
@@ -938,7 +968,7 @@ buf_block_modify_clock_inc(
 	buf_block_t*	block)	/*!< in: block */
 {
 #ifdef UNIV_SYNC_DEBUG
-	buf_pool_t*	buf_pool = buf_pool_from_bpage((buf_page_t*)block);
+	buf_pool_t*	buf_pool = buf_pool_from_bpage((buf_page_t*) block);
 
 	ut_ad((buf_pool_mutex_own(buf_pool)
 	       && (block->page.buf_fix_count == 0))
@@ -1047,18 +1077,24 @@ UNIV_INLINE
 buf_page_t*
 buf_page_hash_get_low(
 /*==================*/
-	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
-	ulint		space,		/*!< in: space id */
-	ulint		offset,		/*!< in: offset of the page
-					within space */
-	ulint		fold)		/*!< in: buf_page_address_fold(
-					space, offset) */
+	buf_pool_t*	buf_pool,/*!< buffer pool instance */
+	ulint		space,	/*!< in: space id */
+	ulint		offset,	/*!< in: offset of the page within space */
+	ulint		fold)	/*!< in: buf_page_address_fold(space, offset) */
 {
 	buf_page_t*	bpage;
 
-	ut_ad(buf_pool);
-	ut_ad(buf_pool_mutex_own(buf_pool));
-	ut_ad(fold == buf_page_address_fold(space, offset));
+#ifdef UNIV_SYNC_DEBUG
+	ulint		hash_fold;
+	rw_lock_t*	hash_lock;
+
+	hash_fold = buf_page_address_fold(space, offset);
+	ut_ad(hash_fold == fold);
+
+	hash_lock = hash_get_lock(buf_pool->page_hash, fold);
+	ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX)
+	      || rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
 
 	/* Look for the page in the hash table */
 
@@ -1083,46 +1119,145 @@ buf_page_hash_get_low(
 
 /******************************************************************//**
 Returns the control block of a file page, NULL if not found.
-@return	block, NULL if not found or not a real control block */
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode. Otherwise,
+mode value is ignored. It is up to the caller to release the
+lock. If the block is found and the lock is NULL then the page_hash
+lock is released by this function.
+@return	block, NULL if not found */
 UNIV_INLINE
 buf_page_t*
-buf_page_hash_get(
-/*==============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+buf_page_hash_get_locked(
+/*=====================*/
+					/*!< out: pointer to the bpage,
+					or NULL; if NULL, hash_lock
+					is also NULL. */
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	ulint		space,		/*!< in: space id */
-	ulint		offset)		/*!< in: offset of the page
-					within space */
+	ulint		offset,		/*!< in: page number */
+	rw_lock_t**	lock,		/*!< in/out: lock of the page
+					hash acquired if bpage is
+					found. NULL otherwise. If NULL
+					is passed then the hash_lock
+					is released by this function */
+	ulint		lock_mode)	/*!< in: RW_LOCK_EX or
+					RW_LOCK_SHARED. Ignored if
+					lock == NULL */
 {
-	buf_page_t*	bpage;
-	ulint		fold	= buf_page_address_fold(space, offset);
+	buf_page_t*	bpage = NULL;
+	ulint		fold;
+	rw_lock_t*	hash_lock;
+	ulint		mode = RW_LOCK_SHARED;
+
+	if (lock != NULL) {
+		*lock = NULL;
+		ut_ad(lock_mode == RW_LOCK_EX
+		      || lock_mode == RW_LOCK_SHARED);
+		mode = lock_mode;
+	}
+
+	fold = buf_page_address_fold(space, offset);
+	hash_lock = hash_get_lock(buf_pool->page_hash, fold);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
+	      && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (mode == RW_LOCK_SHARED) {
+		rw_lock_s_lock(hash_lock);
+	} else {
+		rw_lock_x_lock(hash_lock);
+	}
 
-	bpage	= buf_page_hash_get_low(buf_pool, space, offset, fold);
+	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
 
-	if (bpage && buf_pool_watch_is_sentinel(buf_pool, bpage)) {
+	if (!bpage || buf_pool_watch_is_sentinel(buf_pool, bpage)) {
 		bpage = NULL;
+		goto unlock_and_exit;
 	}
 
+	ut_ad(buf_page_in_file(bpage));
+	ut_ad(offset == bpage->offset);
+	ut_ad(space == bpage->space);
+
+	if (lock == NULL) {
+		/* The caller wants us to release the page_hash lock */
+		goto unlock_and_exit;
+	} else {
+		/* To be released by the caller */
+		*lock = hash_lock;
+		goto exit;
+	}
+
+unlock_and_exit:
+	if (mode == RW_LOCK_SHARED) {
+		rw_lock_s_unlock(hash_lock);
+	} else {
+		rw_lock_x_unlock(hash_lock);
+	}
+exit:
 	return(bpage);
 }
 
 /******************************************************************//**
-Returns the control block of a file page, NULL if not found
-or an uncompressed page frame does not exist.
+Returns the control block of a file page, NULL if not found.
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode. Otherwise,
+mode value is ignored. It is up to the caller to release the
+lock. If the block is found and the lock is NULL then the page_hash
+lock is released by this function.
 @return	block, NULL if not found */
 UNIV_INLINE
 buf_block_t*
-buf_block_hash_get(
-/*===============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
+buf_block_hash_get_locked(
+/*=====================*/
+					/*!< out: pointer to the bpage,
+					or NULL; if NULL, hash_lock
+					is also NULL. */
+	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	ulint		space,		/*!< in: space id */
-	ulint		offset)		/*!< in: offset of the page
-					within space */
+	ulint		offset,		/*!< in: page number */
+	rw_lock_t**	lock,		/*!< in/out: lock of the page
+					hash acquired if bpage is
+					found. NULL otherwise. If NULL
+					is passed then the hash_lock
+					is released by this function */
+	ulint		lock_mode)	/*!< in: RW_LOCK_EX or
+					RW_LOCK_SHARED. Ignored if
+					lock == NULL */
 {
-	buf_block_t*	block;
+	buf_page_t*	bpage = buf_page_hash_get_locked(buf_pool,
+							 space,
+							 offset,
+							 lock,
+							 lock_mode);
+	buf_block_t*	block = buf_page_get_block(bpage);
+
+	if (block) {
+		ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(!lock || rw_lock_own(*lock, lock_mode));
+#endif /* UNIV_SYNC_DEBUG */
+		return(block);
+	} else if (bpage) {
+		/* It is not a block. Just a bpage */
+		ut_ad(buf_page_in_file(bpage));
 
-	block = buf_page_get_block(buf_page_hash_get(buf_pool, space, offset));
+		if (lock) {
+			if (lock_mode == RW_LOCK_SHARED) {
+				rw_lock_s_unlock(*lock);
+			} else {
+				rw_lock_x_unlock(*lock);
+			}
+		}
+		*lock = NULL;
+		return(NULL);
+	}
 
-	return(block);
+	ut_ad(!bpage);
+	ut_ad(lock == NULL ||*lock == NULL);
+	return(NULL);
 }
 
 /********************************************************************//**
@@ -1139,16 +1274,9 @@ buf_page_peek(
 	ulint	space,	/*!< in: space id */
 	ulint	offset)	/*!< in: page number */
 {
-	const buf_page_t*	bpage;
 	buf_pool_t*		buf_pool = buf_pool_get(space, offset);
 
-	buf_pool_mutex_enter(buf_pool);
-
-	bpage = buf_page_hash_get(buf_pool, space, offset);
-
-	buf_pool_mutex_exit(buf_pool);
-
-	return(bpage != NULL);
+	return(buf_page_hash_get(buf_pool, space, offset) != NULL);
 }
 
 /********************************************************************//**
@@ -1189,7 +1317,6 @@ buf_page_release_zip(
 		break;
 	}
 
-	
 	ut_error;
 }
 
@@ -1274,4 +1401,21 @@ buf_pool_mutex_exit_all(void)
 		buf_pool_mutex_exit(buf_pool);
 	}
 }
+/*********************************************************************//**
+Get the nth chunk's buffer block in the specified buffer pool.
+@return the nth chunk's buffer block. */
+UNIV_INLINE
+buf_block_t*
+buf_get_nth_chunk_block(
+/*====================*/
+	const buf_pool_t* buf_pool,	/*!< in: buffer pool instance */
+	ulint		n,		/*!< in: nth chunk in the buffer pool */
+	ulint*		chunk_size)	/*!< in: chunk size */
+{
+	const buf_chunk_t*	chunk;
+
+	chunk = buf_pool->chunks + n;
+	*chunk_size = chunk->size;
+	return(chunk->blocks);
+}
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h
new file mode 100644
index 00000000000..cd21781dc6e
--- /dev/null
+++ b/storage/innobase/include/buf0checksum.h
@@ -0,0 +1,88 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0checksum.h
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0checksum_h
+#define buf0checksum_h
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "buf0types.h"
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Calculates a page CRC32 which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return	checksum */
+UNIV_INTERN
+ib_uint32_t
+buf_calc_page_crc32(
+/*================*/
+	const byte*	page);	/*!< in: buffer page */
+
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+	const byte*	page);	/*!< in: buffer page */
+
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
+looked at the first few bytes of the page. This calculates that old
+checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+	const byte*	page);	/*!< in: buffer page */
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************************//**
+Return a printable string describing the checksum algorithm.
+@return	algorithm name */
+UNIV_INTERN
+const char*
+buf_checksum_algorithm_name(
+/*========================*/
+	srv_checksum_algorithm_t	algo);	/*!< in: algorithm */
+
+extern ulong	srv_checksum_algorithm;
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif /* buf0checksum_h */
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
new file mode 100644
index 00000000000..fcc56d91405
--- /dev/null
+++ b/storage/innobase/include/buf0dblwr.h
@@ -0,0 +1,148 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dblwr.h
+Doublewrite buffer module
+
+Created 2011/12/19 Inaam Rana
+*******************************************************/
+
+#ifndef buf0dblwr_h
+#define buf0dblwr_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "log0log.h"
+#include "buf0types.h"
+
+#ifndef UNIV_HOTBACKUP
+
+/** Doublewrite system */
+extern buf_dblwr_t*	buf_dblwr;
+/** Set to TRUE when the doublewrite buffer is being created */
+extern ibool		buf_dblwr_being_created;
+
+/****************************************************************//**
+Creates the doublewrite buffer to a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+buf_dblwr_create(void);
+/*==================*/
+/****************************************************************//**
+At a database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+buf_dblwr_init_or_restore_pages(
+/*============================*/
+	ibool	restore_corrupt_pages);	/*!< in: TRUE=restore pages */
+/****************************************************************//**
+frees doublewrite buffer. */
+UNIV_INTERN
+void
+buf_dblwr_free(void);
+/*================*/
+/********************************************************************//**
+Updates the doublewrite buffer when an IO request that is part of an
+LRU or flush batch is completed. */
+UNIV_INTERN
+void
+buf_dblwr_update(void);
+/*==================*/
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+buf_dblwr_page_inside(
+/*==================*/
+	ulint	page_no);	/*!< in: page number */
+/********************************************************************//**
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_dblwr_flush_buffered_writes and waits for for free
+space to appear. */
+UNIV_INTERN
+void
+buf_dblwr_add_to_batch(
+/*====================*/
+	buf_page_t*	bpage);	/*!< in: buffer block to write */
+/********************************************************************//**
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+UNIV_INTERN
+void
+buf_dblwr_flush_buffered_writes(void);
+/*=================================*/
+/********************************************************************//**
+Writes a page to the doublewrite buffer on disk, sync it, then write
+the page to the datafile and sync the datafile. This function is used
+for single page flushes. If all the buffers allocated for single page
+flushes in the doublewrite buffer are in use we wait here for one to
+become free. We are guaranteed that a slot will become free because any
+thread that is using a slot must also release the slot before leaving
+this function. */
+UNIV_INTERN
+void
+buf_dblwr_write_single_page(
+/*========================*/
+	buf_page_t*	bpage);	/*!< in: buffer block to write */
+
+/** Doublewrite control struct */
+struct buf_dblwr_struct{
+	mutex_t	mutex;		/*!< mutex protecting the first_free field and
+				write_buf */
+	ulint	block1;		/*!< the page number of the first
+				doublewrite block (64 pages) */
+	ulint	block2;		/*!< page number of the second block */
+	ulint	first_free;	/*!< first free position in write_buf measured
+				in units of UNIV_PAGE_SIZE */
+	ulint	s_reserved;	/*!< number of slots currently reserved
+				for single page flushes. */
+	ulint	b_reserved;	/*!< number of slots currently reserved
+				for batch flush. */
+	ibool*	in_use;		/*!< flag used to indicate if a slot is
+				in use. Only used for single page
+				flushes. */
+	ibool	batch_running;	/*!< set to TRUE if currently a batch
+				is being written from the doublewrite
+				buffer. */
+	byte*	write_buf;	/*!< write buffer used in writing to the
+				doublewrite buffer, aligned to an
+				address divisible by UNIV_PAGE_SIZE
+				(which is required by Windows aio) */
+	byte*	write_buf_unaligned;
+				/*!< pointer to write_buf, but unaligned */
+	buf_page_t**
+		buf_block_arr;	/*!< array to store pointers to the buffer
+				blocks which have been cached to write_buf */
+};
+
+
+#endif /* UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/buf0dump.h b/storage/innobase/include/buf0dump.h
new file mode 100644
index 00000000000..c704a8e97e0
--- /dev/null
+++ b/storage/innobase/include/buf0dump.h
@@ -0,0 +1,72 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.h
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0dump_h
+#define buf0dump_h
+
+#include "univ.i"
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a dump. This function is called by MySQL code via buffer_pool_dump_now()
+and it should return immediately because the whole MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_dump_start();
+/*============*/
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a load. This function is called by MySQL code via buffer_pool_load_now()
+and it should return immediately because the whole MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_load_start();
+/*============*/
+
+/*****************************************************************//**
+Aborts a currently running buffer pool load. This function is called by
+MySQL code via buffer_pool_load_abort() and it should return immediately
+because the whole MySQL is frozen during its execution. */
+UNIV_INTERN
+void
+buf_load_abort();
+/*============*/
+
+/*****************************************************************//**
+This is the main thread for buffer pool dump/load. It waits for an
+event and when waked up either performs a dump or load and sleeps
+again.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_dump_thread)(
+/*============================*/
+	void*	arg);				/*!< in: a dummy parameter
+						required by os_thread_create */
+
+#endif /* buf0dump_h */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
index ae27f5dab0e..faf577f718b 100644
--- a/storage/innobase/include/buf0flu.h
+++ b/storage/innobase/include/buf0flu.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -28,10 +28,13 @@ Created 11/5/1995 Heikki Tuuri
 
 #include "univ.i"
 #include "ut0byte.h"
+#include "log0log.h"
 #ifndef UNIV_HOTBACKUP
 #include "mtr0types.h"
 #include "buf0types.h"
-#include "log0log.h"
+
+/** Flag indicating if the page_cleaner is in active state. */
+extern ibool buf_page_cleaner_is_active;
 
 /********************************************************************//**
 Remove a block from the flush list of modified blocks. */
@@ -57,21 +60,6 @@ void
 buf_flush_write_complete(
 /*=====================*/
 	buf_page_t*	bpage);	/*!< in: pointer to the block in question */
-/*********************************************************************//**
-Flushes pages from the end of the LRU list if there is too small
-a margin of replaceable pages there. If buffer pool is NULL it
-means flush free margin on all buffer pool instances. */
-UNIV_INTERN
-void
-buf_flush_free_margin(
-/*==================*/
-	 buf_pool_t*	buf_pool);
-/*********************************************************************//**
-Flushes pages from the end of all the LRU lists. */
-UNIV_INTERN
-void
-buf_flush_free_margins(void);
-/*=========================*/
 #endif /* !UNIV_HOTBACKUP */
 /********************************************************************//**
 Initializes a page for writing to the tablespace. */
@@ -79,10 +67,10 @@ UNIV_INTERN
 void
 buf_flush_init_for_writing(
 /*=======================*/
-	byte*		page,		/*!< in/out: page */
-	void*		page_zip_,	/*!< in/out: compressed page, or NULL */
-	ib_uint64_t	newest_lsn);	/*!< in: newest modification lsn
-					to the page */
+	byte*	page,		/*!< in/out: page */
+	void*	page_zip_,	/*!< in/out: compressed page, or NULL */
+	lsn_t	newest_lsn);	/*!< in: newest modification lsn
+				to the page */
 #ifndef UNIV_HOTBACKUP
 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
 /********************************************************************//**
@@ -99,21 +87,13 @@ buf_flush_page_try(
 	buf_block_t*	block)		/*!< in/out: buffer control block */
 	__attribute__((nonnull, warn_unused_result));
 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
-/*******************************************************************//**
-This utility flushes dirty blocks from the end of the LRU list.
-NOTE: The calling thread may own latches to pages: to avoid deadlocks,
-this function must be written so that it cannot end up waiting for these
-latches!
-@return number of blocks for which the write request was queued;
-ULINT_UNDEFINED if there was a flush of the same type already running */
+/********************************************************************//**
+Flush a batch of writes to the datafiles that have already been
+written by the OS. */
 UNIV_INTERN
-ulint
-buf_flush_LRU(
-/*==========*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		min_n);		/*!< in: wished minimum mumber of blocks
-					flushed (it is not guaranteed that the
-					actual number is that big, though) */
+void
+buf_flush_sync_datafiles(void);
+/*==========================*/
 /*******************************************************************//**
 This utility flushes dirty blocks from the end of the flush_list of
 all buffer pool instances.
@@ -127,12 +107,25 @@ buf_flush_list(
 	ulint		min_n,		/*!< in: wished minimum mumber of blocks
 					flushed (it is not guaranteed that the
 					actual number is that big, though) */
-	ib_uint64_t	lsn_limit);	/*!< in the case BUF_FLUSH_LIST all
+	lsn_t		lsn_limit);	/*!< in the case BUF_FLUSH_LIST all
 					blocks whose oldest_modification is
 					smaller than this should be flushed
 					(if their number does not exceed
 					min_n), otherwise ignored */
 /******************************************************************//**
+This function picks up a single dirty page from the tail of the LRU
+list, flushes it, removes it from page_hash and LRU list and puts
+it on the free list. It is called from user threads when they are
+unable to find a replacable page at the tail of the LRU list i.e.:
+when the background LRU flushing in the page_cleaner thread is not
+fast enough to keep pace with the workload.
+@return TRUE if success. */
+UNIV_INTERN
+ibool
+buf_flush_single_page_from_LRU(
+/*===========================*/
+	buf_pool_t*	buf_pool);	/*!< in/out: buffer pool instance */
+/******************************************************************//**
 Waits until a flush batch of the given type ends */
 UNIV_INTERN
 void
@@ -169,9 +162,9 @@ void
 buf_flush_recv_note_modification(
 /*=============================*/
 	buf_block_t*	block,		/*!< in: block which is modified */
-	ib_uint64_t	start_lsn,	/*!< in: start lsn of the first mtr in a
+	lsn_t		start_lsn,	/*!< in: start lsn of the first mtr in a
 					set of mtr's */
-	ib_uint64_t	end_lsn);	/*!< in: end lsn of the last mtr in the
+	lsn_t		end_lsn);	/*!< in: end lsn of the last mtr in the
 					set of mtr's */
 /********************************************************************//**
 Returns TRUE if the file page block is immediately suitable for replacement,
@@ -195,8 +188,8 @@ how much redo the workload is generating and at what rate. */
 
 struct buf_flush_stat_struct
 {
-	ib_uint64_t	redo;		/**< amount of redo generated. */
-	ulint		n_flushed;	/**< number of pages flushed. */
+	lsn_t	redo;		/**< amount of redo generated. */
+	ulint	n_flushed;	/**< number of pages flushed. */
 };
 
 /** Statistics for selecting flush rate of dirty pages. */
@@ -208,18 +201,16 @@ UNIV_INTERN
 void
 buf_flush_stat_update(void);
 /*=======================*/
-/*********************************************************************
-Determines the fraction of dirty pages that need to be flushed based
-on the speed at which we generate redo log. Note that if redo log
-is generated at significant rate without a corresponding increase
-in the number of dirty pages (for example, an in-memory workload)
-it can cause IO bursts of flushing. This function implements heuristics
-to avoid this burstiness.
-@return	number of dirty pages to be flushed / second */
-UNIV_INTERN
-ulint
-buf_flush_get_desired_flush_rate(void);
-/*==================================*/
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one instance of this thread.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_flush_page_cleaner_thread)(
+/*==========================================*/
+	void*	arg);		/*!< in: a dummy parameter required by
+				os_thread_create */
 
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /******************************************************************//**
@@ -247,15 +238,6 @@ UNIV_INTERN
 void
 buf_flush_free_flush_rbt(void);
 /*==========================*/
-
-/** When buf_flush_free_margin is called, it tries to make this many blocks
-available to replacement in the free list and at the end of the LRU list (to
-make sure that a read-ahead batch can be read efficiently in a single
-sweep). */
-#define BUF_FLUSH_FREE_BLOCK_MARGIN(b)	(5 + BUF_READ_AHEAD_AREA(b))
-/** Extra margin to apply above BUF_FLUSH_FREE_BLOCK_MARGIN */
-#define BUF_FLUSH_EXTRA_MARGIN(b)	((BUF_FLUSH_FREE_BLOCK_MARGIN(b) / 4 \
-					+ 100) / srv_buf_pool_instances)
 #endif /* !UNIV_HOTBACKUP */
 
 #ifndef UNIV_NONINL
diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic
index 30e2cc8efe8..68a76c0b637 100644
--- a/storage/innobase/include/buf0flu.ic
+++ b/storage/innobase/include/buf0flu.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -35,7 +35,7 @@ buf_flush_insert_into_flush_list(
 /*=============================*/
 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	buf_block_t*	block,		/*!< in/out: block which is modified */
-	ib_uint64_t	lsn);		/*!< in: oldest modification */
+	lsn_t		lsn);		/*!< in: oldest modification */
 /********************************************************************//**
 Inserts a modified block into the flush list in the right sorted position.
 This function is used by recovery, because there the modifications do not
@@ -46,7 +46,7 @@ buf_flush_insert_sorted_into_flush_list(
 /*====================================*/
 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	buf_block_t*	block,		/*!< in/out: block which is modified */
-	ib_uint64_t	lsn);		/*!< in: oldest modification */
+	lsn_t		lsn);		/*!< in: oldest modification */
 
 /********************************************************************//**
 This function should be called at a mini-transaction commit, if a page was
@@ -70,7 +70,7 @@ buf_flush_note_modification(
 
 	ut_ad(!buf_pool_mutex_own(buf_pool));
 	ut_ad(!buf_flush_list_mutex_own(buf_pool));
-	ut_ad(log_flush_order_mutex_own());
+	ut_ad(!mtr->made_dirty || log_flush_order_mutex_own());
 
 	ut_ad(mtr->start_lsn != 0);
 	ut_ad(mtr->modifications);
@@ -81,6 +81,8 @@ buf_flush_note_modification(
 	block->page.newest_modification = mtr->end_lsn;
 
 	if (!block->page.oldest_modification) {
+		ut_a(mtr->made_dirty);
+		ut_ad(log_flush_order_mutex_own());
 		buf_flush_insert_into_flush_list(
 			buf_pool, block, mtr->start_lsn);
 	} else {
@@ -99,9 +101,9 @@ void
 buf_flush_recv_note_modification(
 /*=============================*/
 	buf_block_t*	block,		/*!< in: block which is modified */
-	ib_uint64_t	start_lsn,	/*!< in: start lsn of the first mtr in a
+	lsn_t		start_lsn,	/*!< in: start lsn of the first mtr in a
 					set of mtr's */
-	ib_uint64_t	end_lsn)	/*!< in: end lsn of the last mtr in the
+	lsn_t		end_lsn)	/*!< in: end lsn of the last mtr in the
 					set of mtr's */
 {
 	buf_pool_t*	buf_pool = buf_pool_from_block(block);
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
index 9ecb9de2afe..527852da758 100644
--- a/storage/innobase/include/buf0lru.h
+++ b/storage/innobase/include/buf0lru.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,23 +27,11 @@ Created 11/5/1995 Heikki Tuuri
 #define buf0lru_h
 
 #include "univ.i"
+#ifndef UNIV_HOTBACKUP
 #include "ut0byte.h"
 #include "buf0types.h"
 
 /******************************************************************//**
-Tries to remove LRU flushed blocks from the end of the LRU list and put them
-to the free list. This is beneficial for the efficiency of the insert buffer
-operation, as flushed pages from non-unique non-clustered indexes are here
-taken out of the buffer pool, and their inserts redirected to the insert
-buffer. Otherwise, the flushed blocks could get modified again before read
-operations need new buffer blocks, and the i/o work done in flushing would be
-wasted. */
-UNIV_INTERN
-void
-buf_LRU_try_free_flushed_blocks(
-/*============================*/
-	buf_pool_t*	buf_pool);	/*!< in: buffer pool instance */
-/******************************************************************//**
 Returns TRUE if less than 25 % of the buffer pool is available. This can be
 used in heuristics to prevent huge transactions eating up the whole buffer
 pool for their locks.
@@ -60,18 +48,16 @@ These are low-level functions
 /** Minimum LRU list length for which the LRU_old pointer is defined */
 #define BUF_LRU_OLD_MIN_LEN	512	/* 8 megabytes of 16k pages */
 
-/** Maximum LRU list search length in buf_flush_LRU_recommendation() */
-#define BUF_LRU_FREE_SEARCH_LEN(b)	(5 + 2 * BUF_READ_AHEAD_AREA(b))
-
 /******************************************************************//**
-Removes all pages belonging to a given tablespace. */
+Invalidates all pages belonging to a given tablespace when we are deleting
+the data file(s) of that tablespace. A PROBLEM: if readahead is being started,
+what guarantees that it will not try to read in pages after this operation has
+completed? */
 UNIV_INTERN
 void
-buf_LRU_flush_or_remove_pages(
+buf_LRU_invalidate_tablespace(
 /*==========================*/
-	ulint			id,	/*!< in: space id */
-	enum buf_remove_t	buf_remove);/*!< in: remove or flush
-					strategy */
+	ulint	id);	/*!< in: space id */
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /********************************************************************//**
 Insert a compressed block into buf_pool->zip_clean in the LRU order. */
@@ -90,9 +76,8 @@ NOTE: If this function returns TRUE, it will temporarily
 release buf_pool->mutex.  Furthermore, the page frame will no longer be
 accessible via bpage.
 
-The caller must hold buf_pool->mutex and buf_page_get_mutex(bpage) and
-release these two mutexes after the call.  No other
-buf_page_get_mutex() may be held when calling this function.
+The caller must hold buf_pool->mutex and must not hold any
+buf_page_get_mutex() when calling this function.
 @return TRUE if freed, FALSE otherwise. */
 UNIV_INTERN
 ibool
@@ -107,19 +92,13 @@ Try to free a replaceable block.
 @return	TRUE if found and freed */
 UNIV_INTERN
 ibool
-buf_LRU_search_and_free_block(
-/*==========================*/
+buf_LRU_scan_and_free_block(
+/*========================*/
 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	ulint		n_iterations);	/*!< in: how many times this has
-					been called repeatedly without
-					result: a high value means that
-					we should search farther; if
-					n_iterations < 10, then we search
-					n_iterations / 10 * buf_pool->curr_size
-					pages from the end of the LRU list; if
-					n_iterations < 5, then we will
-					also search n_iterations / 5
-					of the unzip_LRU list. */
+	ibool		scan_all)	/*!< in: scan whole LRU list
+					if TRUE, otherwise scan only
+					'old' blocks. */
+	__attribute__((nonnull,warn_unused_result));
 /******************************************************************//**
 Returns a free block from the buf_pool.  The block is taken off the
 free list.  If it is empty, returns NULL.
@@ -133,6 +112,27 @@ buf_LRU_get_free_only(
 Returns a free block from the buf_pool. The block is taken off the
 free list. If it is empty, blocks are moved from the end of the
 LRU list to the free list.
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in LRU scan
+we put it to free list to be used.
+* iteration 0:
+  * get a block from free list, success:done
+  * if there is an LRU flush batch in progress:
+    * wait for batch to end: retry free list
+  * if buf_pool->try_LRU_scan is set
+    * scan LRU up to srv_LRU_scan_depth to find a clean block
+    * the above will put the block on free list
+    * success:retry the free list
+  * flush one dirty page from tail of LRU to disk
+    * the above will put the block on free list
+    * success: retry the free list
+* iteration 1:
+  * same as iteration 0 except:
+    * scan whole LRU list
+    * scan LRU list even if buf_pool->try_LRU_scan is not set
+* iteration > 1:
+  * same as iteration 1 but sleep 100ms
 @return	the free control block, in state BUF_BLOCK_READY_FOR_USE */
 UNIV_INTERN
 buf_block_t*
@@ -140,7 +140,15 @@ buf_LRU_get_free_block(
 /*===================*/
 	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
 	__attribute__((nonnull,warn_unused_result));
-
+/******************************************************************//**
+Determines if the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list.
+@return	TRUE if should use unzip_LRU */
+UNIV_INTERN
+ibool
+buf_LRU_evict_from_unzip_LRU(
+/*=========================*/
+	buf_pool_t*	buf_pool);
 /******************************************************************//**
 Puts a block back to the free list. */
 UNIV_INTERN
@@ -290,4 +298,6 @@ Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
 #include "buf0lru.ic"
 #endif
 
+#endif /* !UNIV_HOTBACKUP */
+
 #endif
diff --git a/storage/innobase/include/buf0lru.ic b/storage/innobase/include/buf0lru.ic
index 556f45d987f..6e0da7a2588 100644
--- a/storage/innobase/include/buf0lru.ic
+++ b/storage/innobase/include/buf0lru.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
index cd5eff66ee8..b98ff121209 100644
--- a/storage/innobase/include/buf0rea.h
+++ b/storage/innobase/include/buf0rea.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -43,6 +43,18 @@ buf_read_page(
 	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
 	ulint	offset);/*!< in: page number */
 /********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page_async(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset);/*!< in: page number */
+/********************************************************************//**
 Applies a random read-ahead in buf_pool if there are at least a threshold
 value of accessed pages from the random read-ahead area. Does not read any
 page, not even the one at the position (space, offset), if the read-ahead
@@ -157,6 +169,9 @@ invoked */
 #define BUF_READ_IBUF_PAGES_ONLY	131
 /** read any page */
 #define BUF_READ_ANY_PAGE		132
+/** read any page, but ignore (return an error) if a page does not exist
+instead of crashing like BUF_READ_ANY_PAGE does */
+#define BUF_READ_IGNORE_NONEXISTENT_PAGES 1024
 /* @} */
 
 #endif
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
index 2916f39f3fe..ba54a8aeeea 100644
--- a/storage/innobase/include/buf0types.h
+++ b/storage/innobase/include/buf0types.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,8 +26,6 @@ Created 11/17/1995 Heikki Tuuri
 #ifndef buf0types_h
 #define buf0types_h
 
-#include "page0types.h"
-
 /** Buffer page (uncompressed or compressed) */
 typedef	struct buf_page_struct		buf_page_t;
 /** Buffer block for which an uncompressed page exists */
@@ -40,6 +38,8 @@ typedef	struct buf_pool_struct		buf_pool_t;
 typedef	struct buf_pool_stat_struct	buf_pool_stat_t;
 /** Buffer pool buddy statistics struct */
 typedef	struct buf_buddy_stat_struct	buf_buddy_stat_t;
+/** Doublewrite memory struct */
+typedef struct buf_dblwr_struct		buf_dblwr_t;
 
 /** A buffer frame. @see page_t */
 typedef	byte	buf_frame_t;
@@ -47,9 +47,10 @@ typedef	byte	buf_frame_t;
 /** Flags for flush types */
 enum buf_flush {
 	BUF_FLUSH_LRU = 0,		/*!< flush via the LRU list */
-	BUF_FLUSH_SINGLE_PAGE,		/*!< flush a single page */
 	BUF_FLUSH_LIST,			/*!< flush via the flush list
 					of dirty blocks */
+	BUF_FLUSH_SINGLE_PAGE,		/*!< flush via the LRU list
+					but only a single page */
 	BUF_FLUSH_N_TYPES		/*!< index of last element + 1  */
 };
 
@@ -63,23 +64,39 @@ enum buf_io_fix {
 					the flush_list */
 };
 
-/** Algorithm to remove the pages for a tablespace from the buffer pool.
-@See buf_LRU_flush_or_remove_pages(). */
-enum buf_remove_t {
-	BUF_REMOVE_ALL_NO_WRITE,	/*!< Remove all pages from the buffer
-					pool, don't write or sync to disk */
-	BUF_REMOVE_FLUSH_NO_WRITE	/*!< Remove only, from the flush list,
-					don't write or sync to disk */
+/** Alternatives for srv_checksum_algorithm, which can be changed by
+setting innodb_checksum_algorithm */
+enum srv_checksum_algorithm_enum {
+	SRV_CHECKSUM_ALGORITHM_CRC32,		/*!< Write crc32, allow crc32,
+						innodb or none when reading */
+	SRV_CHECKSUM_ALGORITHM_STRICT_CRC32,	/*!< Write crc32, allow crc32
+						when reading */
+	SRV_CHECKSUM_ALGORITHM_INNODB,		/*!< Write innodb, allow crc32,
+						innodb or none when reading */
+	SRV_CHECKSUM_ALGORITHM_STRICT_INNODB,	/*!< Write innodb, allow
+						innodb when reading */
+	SRV_CHECKSUM_ALGORITHM_NONE,		/*!< Write none, allow crc32,
+						innodb or none when reading */
+	SRV_CHECKSUM_ALGORITHM_STRICT_NONE	/*!< Write none, allow none
+						when reading */
 };
 
+typedef enum srv_checksum_algorithm_enum	srv_checksum_algorithm_t;
+
 /** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
 /* @{ */
-#define BUF_BUDDY_LOW_SHIFT	PAGE_ZIP_MIN_SIZE_SHIFT
+/** Zip shift value for the smallest page size */
+#define BUF_BUDDY_LOW_SHIFT	UNIV_ZIP_SIZE_SHIFT_MIN
 
-#define BUF_BUDDY_LOW		(1 << BUF_BUDDY_LOW_SHIFT)
+/** Smallest buddy page size */
+#define BUF_BUDDY_LOW		(1U << BUF_BUDDY_LOW_SHIFT)
 
+/** Actual number of buddy sizes based on current page size */
 #define BUF_BUDDY_SIZES		(UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT)
-					/*!< number of buddy sizes */
+
+/** Maximum number of buddy sizes based on the max page size */
+#define BUF_BUDDY_SIZES_MAX	(UNIV_PAGE_SIZE_SHIFT_MAX	\
+				- BUF_BUDDY_LOW_SHIFT)
 
 /** twice the maximum block size of the buddy system;
 the underlying memory is aligned by this amount:
@@ -87,5 +104,4 @@ this must be equal to UNIV_PAGE_SIZE */
 #define BUF_BUDDY_HIGH	(BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
 /* @} */
 
-#endif
-
+#endif /* buf0types.h */
diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h
index 6d3c2988fdc..37364e891f5 100644
--- a/storage/innobase/include/data0data.h
+++ b/storage/innobase/include/data0data.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -153,6 +153,7 @@ dfield_dup(
 /*=======*/
 	dfield_t*	field,	/*!< in/out: data field */
 	mem_heap_t*	heap);	/*!< in: memory heap where allocated */
+#ifndef UNIV_HOTBACKUP
 /*********************************************************************//**
 Tests if two data fields are equal.
 If len==0, tests the data length and content for equality.
@@ -170,13 +171,15 @@ dfield_datas_are_binary_equal(
 /*********************************************************************//**
 Tests if dfield data length and content is equal to the given.
 @return	TRUE if equal */
-UNIV_INTERN
+UNIV_INLINE
 ibool
 dfield_data_is_binary_equal(
 /*========================*/
 	const dfield_t*	field,	/*!< in: field */
 	ulint		len,	/*!< in: data length or UNIV_SQL_NULL */
-	const byte*	data);	/*!< in: data */
+	const byte*	data)	/*!< in: data */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
 /*********************************************************************//**
 Gets number of fields in a data tuple.
 @return	number of fields */
diff --git a/storage/innobase/include/data0data.ic b/storage/innobase/include/data0data.ic
index 205fa397987..da50e91e98d 100644
--- a/storage/innobase/include/data0data.ic
+++ b/storage/innobase/include/data0data.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -138,7 +138,7 @@ dfield_is_ext(
 {
 	ut_ad(field);
 
-	return(UNIV_UNLIKELY(field->ext));
+	return(field->ext);
 }
 
 /*********************************************************************//**
@@ -228,6 +228,7 @@ dfield_dup(
 	}
 }
 
+#ifndef UNIV_HOTBACKUP
 /*********************************************************************//**
 Tests if two data fields are equal.
 If len==0, tests the data length and content for equality.
@@ -258,6 +259,23 @@ dfield_datas_are_binary_equal(
 }
 
 /*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return	TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+	const dfield_t*	field,	/*!< in: field */
+	ulint		len,	/*!< in: data length or UNIV_SQL_NULL */
+	const byte*	data)	/*!< in: data */
+{
+	return(len == dfield_get_len(field)
+	       && (len == UNIV_SQL_NULL
+		   || !memcmp(dfield_get_data(field), data, len)));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
 Gets info bits in a data tuple.
 @return	info bits */
 UNIV_INLINE
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
index d7fa0b9cd44..c7fcf316f24 100644
--- a/storage/innobase/include/data0type.h
+++ b/storage/innobase/include/data0type.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -35,6 +35,16 @@ extern ulint	data_mysql_default_charset_coll;
 /* SQL data type struct */
 typedef struct dtype_struct		dtype_t;
 
+/* SQL Like operator comparison types */
+enum ib_like_enum {
+	IB_LIKE_EXACT,                  /* e.g.  STRING */
+	IB_LIKE_PREFIX,                 /* e.g., STRING% */
+	IB_LIKE_SUFFIX,                 /* e.g., %STRING */
+	IB_LIKE_SUBSTR,                 /* e.g., %STRING% */
+	IB_LIKE_REGEXP                  /* Future */
+};
+typedef enum ib_like_enum               ib_like_t;
+
 /*-------------------------------------------*/
 /* The 'MAIN TYPE' of a column */
 #define	DATA_VARCHAR	1	/* character varying of the
@@ -139,6 +149,8 @@ be less than 256 */
 
 #define	DATA_N_SYS_COLS 3	/* number of system columns defined above */
 
+#define DATA_FTS_DOC_ID	3	/* Used as FTS DOC ID column */
+
 #define DATA_SYS_PRTYPE_MASK 0xF /* mask to extract the above from prtype */
 
 /* Flags ORed to the precise data type */
@@ -182,6 +194,12 @@ because in GCC it returns a long. */
 /* Get mbmaxlen from mbminmaxlen. */
 #define DATA_MBMAXLEN(mbminmaxlen) ((ulint) ((mbminmaxlen) / DATA_MBMAX))
 
+/* We now support 15 bits (up to 32767) collation number */
+#define MAX_CHAR_COLL_NUM	32767
+
+/* Mask to get the Charset Collation number (0x7fff) */
+#define CHAR_COLL_MASK		MAX_CHAR_COLL_NUM
+
 #ifndef UNIV_HOTBACKUP
 /*********************************************************************//**
 Gets the MySQL type code from a dtype.
@@ -450,6 +468,20 @@ dtype_new_read_for_order_and_null_size(
 /*===================================*/
 	dtype_t*	type,	/*!< in: type struct */
 	const byte*	buf);	/*!< in: buffer for stored type order info */
+
+/*********************************************************************//**
+Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len
+@return the SQL type name */
+UNIV_INLINE
+char*
+dtype_sql_name(
+/*===========*/
+	unsigned	mtype,	/*!< in: mtype */
+	unsigned	prtype,	/*!< in: prtype */
+	unsigned	len,	/*!< in: len */
+	char*		name,	/*!< out: SQL name */
+	unsigned	name_sz);/*!< in: size of the name buffer */
+
 #endif /* !UNIV_HOTBACKUP */
 
 /*********************************************************************//**
@@ -477,14 +509,14 @@ dtype_new_read_for_order_and_null_size()
 sym_tab_add_null_lit() */
 
 struct dtype_struct{
-	unsigned	mtype:8;	/*!< main data type */
-	unsigned	prtype:24;	/*!< precise type; MySQL data
+	unsigned	prtype:32;	/*!< precise type; MySQL data
 					type, charset code, flags to
 					indicate nullability,
 					signedness, whether this is a
 					binary string, whether this is
 					a true VARCHAR where MySQL
 					uses 2 bytes to store the length */
+	unsigned	mtype:8;	/*!< main data type */
 
 	/* the remaining fields do not affect alphabetical ordering: */
 
diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic
index 757dd815c5e..a5e94a8edff 100644
--- a/storage/innobase/include/data0type.ic
+++ b/storage/innobase/include/data0type.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -23,6 +23,8 @@ Data types
 Created 1/16/1996 Heikki Tuuri
 *******************************************************/
 
+#include <string.h> /* strlen() */
+
 #include "mach0data.h"
 #ifndef UNIV_HOTBACKUP
 # include "ha_prototypes.h"
@@ -36,7 +38,7 @@ dtype_get_charset_coll(
 /*===================*/
 	ulint	prtype)	/*!< in: precise data type */
 {
-	return((prtype >> 16) & 0xFFUL);
+	return((prtype >> 16) & CHAR_COLL_MASK);
 }
 
 /*********************************************************************//**
@@ -259,8 +261,8 @@ dtype_get_pad_char(
 	switch (mtype) {
 	case DATA_FIXBINARY:
 	case DATA_BINARY:
-		if (UNIV_UNLIKELY(dtype_get_charset_coll(prtype)
-				  == DATA_MYSQL_BINARY_CHARSET_COLL)) {
+		if (dtype_get_charset_coll(prtype)
+		    == DATA_MYSQL_BINARY_CHARSET_COLL) {
 			/* Starting from 5.0.18, do not pad
 			VARBINARY or BINARY columns. */
 			return(ULINT_UNDEFINED);
@@ -312,11 +314,11 @@ dtype_new_store_for_order_and_null_size(
 	buf[0] = (byte)(type->mtype & 0xFFUL);
 
 	if (type->prtype & DATA_BINARY_TYPE) {
-		buf[0] = buf[0] | 128;
+		buf[0] |= 128;
 	}
 
 	/* In versions < 4.1.2 we had:	if (type->prtype & DATA_NONLATIN1) {
-	buf[0] = buf[0] | 64;
+	buf[0] |= 64;
 	}
 	*/
 
@@ -326,7 +328,7 @@ dtype_new_store_for_order_and_null_size(
 
 	mach_write_to_2(buf + 2, len & 0xFFFFUL);
 
-	ut_ad(dtype_get_charset_coll(type->prtype) < 256);
+	ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM);
 	mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
 
 	if (type->prtype & DATA_NOT_NULL) {
@@ -353,7 +355,7 @@ dtype_read_for_order_and_null_size(
 	type->prtype = buf[1];
 
 	if (buf[0] & 128) {
-		type->prtype = type->prtype | DATA_BINARY_TYPE;
+		type->prtype |= DATA_BINARY_TYPE;
 	}
 
 	type->len = mach_read_from_2(buf + 2);
@@ -393,10 +395,10 @@ dtype_new_read_for_order_and_null_size(
 
 	type->len = mach_read_from_2(buf + 2);
 
-	charset_coll = mach_read_from_2(buf + 4) & 0x7fff;
+	charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK;
 
 	if (dtype_is_string_type(type->mtype)) {
-		ut_a(charset_coll < 256);
+		ut_a(charset_coll <= MAX_CHAR_COLL_NUM);
 
 		if (charset_coll == 0) {
 			/* This insert buffer record was inserted with MySQL
@@ -412,6 +414,101 @@ dtype_new_read_for_order_and_null_size(
 	}
 	dtype_set_mblen(type);
 }
+
+/*********************************************************************//**
+Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len
+@return the SQL type name */
+UNIV_INLINE
+char*
+dtype_sql_name(
+/*===========*/
+	unsigned	mtype,	/*!< in: mtype */
+	unsigned	prtype,	/*!< in: prtype */
+	unsigned	len,	/*!< in: len */
+	char*		name,	/*!< out: SQL name */
+	unsigned	name_sz)/*!< in: size of the name buffer */
+{
+
+#define APPEND_UNSIGNED()					\
+	do {							\
+		if (prtype & DATA_UNSIGNED) {			\
+			ut_snprintf(name + strlen(name),	\
+				    name_sz - strlen(name),	\
+				    " UNSIGNED");		\
+		}						\
+	} while (0)
+
+	ut_snprintf(name, name_sz, "UNKNOWN");
+
+	switch (mtype) {
+	case DATA_INT:
+		switch (len) {
+		case 1:
+			ut_snprintf(name, name_sz, "TINYINT");
+			break;
+		case 2:
+			ut_snprintf(name, name_sz, "SMALLINT");
+			break;
+		case 3:
+			ut_snprintf(name, name_sz, "MEDIUMINT");
+			break;
+		case 4:
+			ut_snprintf(name, name_sz, "INT");
+			break;
+		case 8:
+			ut_snprintf(name, name_sz, "BIGINT");
+			break;
+		}
+		APPEND_UNSIGNED();
+		break;
+	case DATA_FLOAT:
+		ut_snprintf(name, name_sz, "FLOAT");
+		APPEND_UNSIGNED();
+		break;
+	case DATA_DOUBLE:
+		ut_snprintf(name, name_sz, "DOUBLE");
+		APPEND_UNSIGNED();
+		break;
+	case DATA_FIXBINARY:
+		ut_snprintf(name, name_sz, "BINARY(%u)", len);
+		break;
+	case DATA_CHAR:
+	case DATA_MYSQL:
+		ut_snprintf(name, name_sz, "CHAR(%u)", len);
+		break;
+	case DATA_VARCHAR:
+	case DATA_VARMYSQL:
+		ut_snprintf(name, name_sz, "VARCHAR(%u)", len);
+		break;
+	case DATA_BINARY:
+		ut_snprintf(name, name_sz, "VARBINARY(%u)", len);
+		break;
+	case DATA_BLOB:
+		switch (len) {
+		case 9:
+			ut_snprintf(name, name_sz, "TINYBLOB");
+			break;
+		case 10:
+			ut_snprintf(name, name_sz, "BLOB");
+			break;
+		case 11:
+			ut_snprintf(name, name_sz, "MEDIUMBLOB");
+			break;
+		case 12:
+			ut_snprintf(name, name_sz, "LONGBLOB");
+			break;
+		}
+	}
+
+	if (prtype & DATA_NOT_NULL) {
+		ut_snprintf(name + strlen(name),
+			    name_sz - strlen(name),
+			    " NOT NULL");
+	}
+
+	return(name);
+}
+
 #endif /* !UNIV_HOTBACKUP */
 
 /***********************************************************************//**
@@ -473,9 +570,8 @@ dtype_get_fixed_size_low(
 				dtype_get_charset_coll(prtype),
 				&i_mbminlen, &i_mbmaxlen);
 
-			if (UNIV_UNLIKELY
-			    (DATA_MBMINMAXLEN(i_mbminlen, i_mbmaxlen)
-			     != mbminmaxlen)) {
+			if (DATA_MBMINMAXLEN(i_mbminlen, i_mbmaxlen)
+			     != mbminmaxlen) {
 
 				ut_print_timestamp(stderr);
 				fprintf(stderr, "  InnoDB: "
diff --git a/storage/innobase/include/data0types.h b/storage/innobase/include/data0types.h
index 04e835bc401..7d599ef2c8d 100644
--- a/storage/innobase/include/data0types.h
+++ b/storage/innobase/include/data0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h
index 95ccef16be0..1a3499b09e0 100644
--- a/storage/innobase/include/db0err.h
+++ b/storage/innobase/include/db0err.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -42,78 +42,79 @@ enum db_err {
 	DB_ROLLBACK,
 	DB_DUPLICATE_KEY,
 	DB_QUE_THR_SUSPENDED,
-	DB_MISSING_HISTORY,		/* required history data has been
+	DB_MISSING_HISTORY,		/*!< required history data has been
 					deleted due to lack of space in
 					rollback segment */
 	DB_CLUSTER_NOT_FOUND = 30,
 	DB_TABLE_NOT_FOUND,
-	DB_MUST_GET_MORE_FILE_SPACE,	/* the database has to be stopped
+	DB_MUST_GET_MORE_FILE_SPACE,	/*!< the database has to be stopped
 					and restarted with more file space */
 	DB_TABLE_IS_BEING_USED,
-	DB_TOO_BIG_RECORD,		/* a record in an index would not fit
+	DB_TOO_BIG_RECORD,		/*!< a record in an index would not fit
 					on a compressed page, or it would
 					become bigger than 1/2 free space in
 					an uncompressed page frame */
-	DB_LOCK_WAIT_TIMEOUT,		/* lock wait lasted too long */
-	DB_NO_REFERENCED_ROW,		/* referenced key value not found
+	DB_LOCK_WAIT_TIMEOUT,		/*!< lock wait lasted too long */
+	DB_NO_REFERENCED_ROW,		/*!< referenced key value not found
 					for a foreign key in an insert or
 					update of a row */
-	DB_ROW_IS_REFERENCED,		/* cannot delete or update a row
+	DB_ROW_IS_REFERENCED,		/*!< cannot delete or update a row
 					because it contains a key value
 					which is referenced */
-	DB_CANNOT_ADD_CONSTRAINT,	/* adding a foreign key constraint
+	DB_CANNOT_ADD_CONSTRAINT,	/*!< adding a foreign key constraint
 					to a table failed */
-	DB_CORRUPTION,			/* data structure corruption noticed */
-	DB_CANNOT_DROP_CONSTRAINT,	/* dropping a foreign key constraint
+	DB_CORRUPTION,			/*!< data structure corruption noticed */
+	DB_CANNOT_DROP_CONSTRAINT,	/*!< dropping a foreign key constraint
 					from a table failed */
-	DB_NO_SAVEPOINT,		/* no savepoint exists with the given
+	DB_NO_SAVEPOINT,		/*!< no savepoint exists with the given
 					name */
-	DB_TABLESPACE_ALREADY_EXISTS,	/* we cannot create a new single-table
+	DB_TABLESPACE_ALREADY_EXISTS,	/*!< we cannot create a new single-table
 					tablespace because a file of the same
 					name already exists */
-	DB_TABLESPACE_DELETED,		/* tablespace does not exist or is
+	DB_TABLESPACE_DELETED,		/*!< tablespace does not exist or is
 					being dropped right now */
-	DB_LOCK_TABLE_FULL,		/* lock structs have exhausted the
+	DB_LOCK_TABLE_FULL,		/*!< lock structs have exhausted the
 					buffer pool (for big transactions,
 					InnoDB stores the lock structs in the
 					buffer pool) */
-	DB_FOREIGN_DUPLICATE_KEY,	/* foreign key constraints
+	DB_FOREIGN_DUPLICATE_KEY,	/*!< foreign key constraints
 					activated by the operation would
 					lead to a duplicate key in some
 					table */
-	DB_TOO_MANY_CONCURRENT_TRXS,	/* when InnoDB runs out of the
+	DB_TOO_MANY_CONCURRENT_TRXS,	/*!< when InnoDB runs out of the
 					preconfigured undo slots, this can
 					only happen when there are too many
 					concurrent transactions */
-	DB_UNSUPPORTED,			/* when InnoDB sees any artefact or
+	DB_UNSUPPORTED,			/*!< when InnoDB sees any artefact or
 					a feature that it can't recoginize or
 					work with e.g., FT indexes created by
 					a later version of the engine. */
 
-	DB_PRIMARY_KEY_IS_NULL,		/* a column in the PRIMARY KEY
+	DB_PRIMARY_KEY_IS_NULL,		/*!< a column in the PRIMARY KEY
 					was found to be NULL */
 
-	DB_STATS_DO_NOT_EXIST,		/* an operation that requires the
+	DB_STATS_DO_NOT_EXIST,		/*!< an operation that requires the
 					persistent storage, used for recording
 					table and index statistics, was
 					requested but this storage does not
 					exist itself or the stats for a given
 					table do not exist */
-	DB_FOREIGN_EXCEED_MAX_CASCADE,	/* Foreign key constraint related
+	DB_FOREIGN_EXCEED_MAX_CASCADE,	/*!< Foreign key constraint related
 					cascading delete/update exceeds
 					maximum allowed depth */
-	DB_CHILD_NO_INDEX,		/* the child (foreign) table does not
-					have an index that contains the
+	DB_CHILD_NO_INDEX,		/*!< the child (foreign) table does
+					not have an index that contains the
 					foreign keys as its prefix columns */
-	DB_PARENT_NO_INDEX,		/* the parent table does not
+	DB_PARENT_NO_INDEX,		/*!< the parent table does not
 					have an index that contains the
 					foreign keys as its prefix columns */
-	DB_TOO_BIG_INDEX_COL,		/* index column size exceeds maximum
-					limit */
-	DB_INDEX_CORRUPT,		/* we have corrupted index */
-	DB_UNDO_RECORD_TOO_BIG,		/* the undo log record is too big */
-	DB_TABLE_IN_FK_CHECK,		/* table is being used in foreign
-					key check */
+	DB_TOO_BIG_INDEX_COL,		/*!< index column size exceeds
+					maximum limit */
+	DB_INDEX_CORRUPT,		/*!< we have corrupted index */
+	DB_UNDO_RECORD_TOO_BIG,		/*!< the undo log record is too big */
+	DB_READ_ONLY,			/*!< Update operation attempted in
+					a read-only transaction */
+	DB_FTS_INVALID_DOCID,		/* FTS Doc ID cannot be zero */
 
 	/* The following are partial failure codes */
 	DB_FAIL = 1000,
diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h
index 5d136862bc6..364aa746638 100644
--- a/storage/innobase/include/dict0boot.h
+++ b/storage/innobase/include/dict0boot.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -113,7 +113,6 @@ dict_create(void);
 					indexes; ibuf tables and indexes are
 					assigned as the id the number
 					DICT_IBUF_ID_MIN plus the space id */
-#define DICT_IBUF_ID_MIN	0xFFFFFFFF00000000ULL
 
 /* The offset of the dictionary header on the page */
 #define	DICT_HDR		FSEG_PAGE_DATA
@@ -121,30 +120,159 @@ dict_create(void);
 /*-------------------------------------------------------------*/
 /* Dictionary header offsets */
 #define DICT_HDR_ROW_ID		0	/* The latest assigned row id */
-#define	DICT_HDR_TABLE_ID	8	/* The latest assigned table id */
-#define	DICT_HDR_INDEX_ID	16	/* The latest assigned index id */
-#define DICT_HDR_MAX_SPACE_ID	24	/* The latest assigned space id, or 0*/
-#define	DICT_HDR_MIX_ID_LOW	28	/* Obsolete,always DICT_HDR_FIRST_ID */
-#define	DICT_HDR_TABLES		32	/* Root of the table index tree */
-#define	DICT_HDR_TABLE_IDS	36	/* Root of the table index tree */
-#define	DICT_HDR_COLUMNS	40	/* Root of the column index tree */
-#define	DICT_HDR_INDEXES	44	/* Root of the index index tree */
-#define	DICT_HDR_FIELDS		48	/* Root of the index field
-					index tree */
+#define DICT_HDR_TABLE_ID	8	/* The latest assigned table id */
+#define DICT_HDR_INDEX_ID	16	/* The latest assigned index id */
+#define DICT_HDR_MAX_SPACE_ID	24	/* The latest assigned space id,or 0*/
+#define DICT_HDR_MIX_ID_LOW	28	/* Obsolete,always DICT_HDR_FIRST_ID*/
+#define DICT_HDR_TABLES		32	/* Root of SYS_TABLES clust index */
+#define DICT_HDR_TABLE_IDS	36	/* Root of SYS_TABLE_IDS sec index */
+#define DICT_HDR_COLUMNS	40	/* Root of SYS_COLUMNS clust index */
+#define DICT_HDR_INDEXES	44	/* Root of SYS_INDEXES clust index */
+#define DICT_HDR_FIELDS		48	/* Root of SYS_FIELDS clust index */
 
 #define DICT_HDR_FSEG_HEADER	56	/* Segment header for the tablespace
 					segment into which the dictionary
 					header is created */
 /*-------------------------------------------------------------*/
 
+/* The columns in SYS_TABLES */
+enum dict_col_sys_tables_enum {
+	DICT_COL__SYS_TABLES__NAME		= 0,
+	DICT_COL__SYS_TABLES__ID		= 1,
+	DICT_COL__SYS_TABLES__N_COLS		= 2,
+	DICT_COL__SYS_TABLES__TYPE		= 3,
+	DICT_COL__SYS_TABLES__MIX_ID		= 4,
+	DICT_COL__SYS_TABLES__MIX_LEN		= 5,
+	DICT_COL__SYS_TABLES__CLUSTER_ID	= 6,
+	DICT_COL__SYS_TABLES__SPACE		= 7,
+	DICT_NUM_COLS__SYS_TABLES		= 8
+};
 /* The field numbers in the SYS_TABLES clustered index */
-#define DICT_SYS_TABLES_TYPE_FIELD		5
-
+enum dict_fld_sys_tables_enum {
+	DICT_FLD__SYS_TABLES__NAME		= 0,
+	DICT_FLD__SYS_TABLES__DB_TRX_ID		= 1,
+	DICT_FLD__SYS_TABLES__DB_ROLL_PTR	= 2,
+	DICT_FLD__SYS_TABLES__ID		= 3,
+	DICT_FLD__SYS_TABLES__N_COLS		= 4,
+	DICT_FLD__SYS_TABLES__TYPE		= 5,
+	DICT_FLD__SYS_TABLES__MIX_ID		= 6,
+	DICT_FLD__SYS_TABLES__MIX_LEN		= 7,
+	DICT_FLD__SYS_TABLES__CLUSTER_ID	= 8,
+	DICT_FLD__SYS_TABLES__SPACE		= 9,
+	DICT_NUM_FIELDS__SYS_TABLES		= 10
+};
+/* The field numbers in the SYS_TABLE_IDS index */
+enum dict_fld_sys_table_ids_enum {
+	DICT_FLD__SYS_TABLE_IDS__ID		= 0,
+	DICT_FLD__SYS_TABLE_IDS__NAME		= 1,
+	DICT_NUM_FIELDS__SYS_TABLE_IDS		= 2
+};
+/* The columns in SYS_COLUMNS */
+enum dict_col_sys_columns_enum {
+	DICT_COL__SYS_COLUMNS__TABLE_ID		= 0,
+	DICT_COL__SYS_COLUMNS__POS		= 1,
+	DICT_COL__SYS_COLUMNS__NAME		= 2,
+	DICT_COL__SYS_COLUMNS__MTYPE		= 3,
+	DICT_COL__SYS_COLUMNS__PRTYPE		= 4,
+	DICT_COL__SYS_COLUMNS__LEN		= 5,
+	DICT_COL__SYS_COLUMNS__PREC		= 6,
+	DICT_NUM_COLS__SYS_COLUMNS		= 7
+};
+/* The field numbers in the SYS_COLUMNS clustered index */
+enum dict_fld_sys_columns_enum {
+	DICT_FLD__SYS_COLUMNS__TABLE_ID		= 0,
+	DICT_FLD__SYS_COLUMNS__POS		= 1,
+	DICT_FLD__SYS_COLUMNS__DB_TRX_ID	= 2,
+	DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR	= 3,
+	DICT_FLD__SYS_COLUMNS__NAME		= 4,
+	DICT_FLD__SYS_COLUMNS__MTYPE		= 5,
+	DICT_FLD__SYS_COLUMNS__PRTYPE		= 6,
+	DICT_FLD__SYS_COLUMNS__LEN		= 7,
+	DICT_FLD__SYS_COLUMNS__PREC		= 8,
+	DICT_NUM_FIELDS__SYS_COLUMNS		= 9
+};
+/* The columns in SYS_INDEXES */
+enum dict_col_sys_indexes_enum {
+	DICT_COL__SYS_INDEXES__TABLE_ID		= 0,
+	DICT_COL__SYS_INDEXES__ID		= 1,
+	DICT_COL__SYS_INDEXES__NAME		= 2,
+	DICT_COL__SYS_INDEXES__N_FIELDS		= 3,
+	DICT_COL__SYS_INDEXES__TYPE		= 4,
+	DICT_COL__SYS_INDEXES__SPACE		= 5,
+	DICT_COL__SYS_INDEXES__PAGE_NO		= 6,
+	DICT_NUM_COLS__SYS_INDEXES		= 7
+};
 /* The field numbers in the SYS_INDEXES clustered index */
-#define DICT_SYS_INDEXES_PAGE_NO_FIELD	 8
-#define DICT_SYS_INDEXES_SPACE_NO_FIELD	 7
-#define DICT_SYS_INDEXES_TYPE_FIELD	 6
-#define DICT_SYS_INDEXES_NAME_FIELD	 4
+enum dict_fld_sys_indexes_enum {
+	DICT_FLD__SYS_INDEXES__TABLE_ID		= 0,
+	DICT_FLD__SYS_INDEXES__ID		= 1,
+	DICT_FLD__SYS_INDEXES__DB_TRX_ID	= 2,
+	DICT_FLD__SYS_INDEXES__DB_ROLL_PTR	= 3,
+	DICT_FLD__SYS_INDEXES__NAME		= 4,
+	DICT_FLD__SYS_INDEXES__N_FIELDS		= 5,
+	DICT_FLD__SYS_INDEXES__TYPE		= 6,
+	DICT_FLD__SYS_INDEXES__SPACE		= 7,
+	DICT_FLD__SYS_INDEXES__PAGE_NO		= 8,
+	DICT_NUM_FIELDS__SYS_INDEXES		= 9
+};
+/* The columns in SYS_FIELDS */
+enum dict_col_sys_fields_enum {
+	DICT_COL__SYS_FIELDS__INDEX_ID		= 0,
+	DICT_COL__SYS_FIELDS__POS		= 1,
+	DICT_COL__SYS_FIELDS__COL_NAME		= 2,
+	DICT_NUM_COLS__SYS_FIELDS		= 3
+};
+/* The field numbers in the SYS_FIELDS clustered index */
+enum dict_fld_sys_fields_enum {
+	DICT_FLD__SYS_FIELDS__INDEX_ID		= 0,
+	DICT_FLD__SYS_FIELDS__POS		= 1,
+	DICT_FLD__SYS_FIELDS__DB_TRX_ID		= 2,
+	DICT_FLD__SYS_FIELDS__DB_ROLL_PTR	= 3,
+	DICT_FLD__SYS_FIELDS__COL_NAME		= 4,
+	DICT_NUM_FIELDS__SYS_FIELDS		= 5
+};
+/* The columns in SYS_FOREIGN */
+enum dict_col_sys_foreign_enum {
+	DICT_COL__SYS_FOREIGN__ID		= 0,
+	DICT_COL__SYS_FOREIGN__FOR_NAME		= 1,
+	DICT_COL__SYS_FOREIGN__REF_NAME		= 2,
+	DICT_COL__SYS_FOREIGN__N_COLS		= 3,
+	DICT_NUM_COLS__SYS_FOREIGN		= 4
+};
+/* The field numbers in the SYS_FOREIGN clustered index */
+enum dict_fld_sys_foreign_enum {
+	DICT_FLD__SYS_FOREIGN__ID		= 0,
+	DICT_FLD__SYS_FOREIGN__DB_TRX_ID	= 1,
+	DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR	= 2,
+	DICT_FLD__SYS_FOREIGN__FOR_NAME		= 3,
+	DICT_FLD__SYS_FOREIGN__REF_NAME		= 4,
+	DICT_FLD__SYS_FOREIGN__N_COLS		= 5,
+	DICT_NUM_FIELDS__SYS_FOREIGN		= 6
+};
+/* The field numbers in the SYS_FOREIGN_FOR_NAME secondary index */
+enum dict_fld_sys_foreign_for_name_enum {
+	DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME	= 0,
+	DICT_FLD__SYS_FOREIGN_FOR_NAME__ID	= 1,
+	DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME	= 2
+};
+/* The columns in SYS_FOREIGN_COLS */
+enum dict_col_sys_foreign_cols_enum {
+	DICT_COL__SYS_FOREIGN_COLS__ID			= 0,
+	DICT_COL__SYS_FOREIGN_COLS__POS			= 1,
+	DICT_COL__SYS_FOREIGN_COLS__FOR_COL_NAME	= 2,
+	DICT_COL__SYS_FOREIGN_COLS__REF_COL_NAME	= 3,
+	DICT_NUM_COLS__SYS_FOREIGN_COLS			= 4
+};
+/* The field numbers in the SYS_FOREIGN_COLS clustered index */
+enum dict_fld_sys_foreign_cols_enum {
+	DICT_FLD__SYS_FOREIGN_COLS__ID			= 0,
+	DICT_FLD__SYS_FOREIGN_COLS__POS			= 1,
+	DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID		= 2,
+	DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR		= 3,
+	DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME	= 4,
+	DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME	= 5,
+	DICT_NUM_FIELDS__SYS_FOREIGN_COLS		= 6
+};
 
 /* When a row id which is zero modulo this number (which must be a power of
 two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
diff --git a/storage/innobase/include/dict0boot.ic b/storage/innobase/include/dict0boot.ic
index d3ba9eee78f..0f660ab7555 100644
--- a/storage/innobase/include/dict0boot.ic
+++ b/storage/innobase/include/dict0boot.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h
index cce1246b789..68fc9ba195a 100644
--- a/storage/innobase/include/dict0crea.h
+++ b/storage/innobase/include/dict0crea.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/dict0crea.ic b/storage/innobase/include/dict0crea.ic
index c5365ce7489..98cbbf28208 100644
--- a/storage/innobase/include/dict0crea.ic
+++ b/storage/innobase/include/dict0crea.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
index 89d6fc66635..073b68c26ad 100644
--- a/storage/innobase/include/dict0dict.h
+++ b/storage/innobase/include/dict0dict.h
@@ -27,6 +27,7 @@ Created 1/8/1996 Heikki Tuuri
 #define dict0dict_h
 
 #include "univ.i"
+#include "db0err.h"
 #include "dict0types.h"
 #include "dict0mem.h"
 #include "data0type.h"
@@ -73,16 +74,16 @@ Returns a table object based on table id.
 @return	table, NULL if does not exist */
 UNIV_INTERN
 dict_table_t*
-dict_table_get_on_id(
-/*=================*/
-        table_id_t	table_id,	/*!< in: table id */
-        trx_t*		trx);		/*!< in: transaction handle */
+dict_table_open_on_id(
+/*==================*/
+	table_id_t	table_id,	/*!< in: table id */
+	ibool		dict_locked);	/*!< in: TRUE=data dictionary locked */
 /********************************************************************//**
-Decrements the count of open MySQL handles to a table. */
+Decrements the count of open handles to a table. */
 UNIV_INTERN
 void
-dict_table_decrement_handle_count(
-/*==============================*/
+dict_table_close(
+/*=============*/
 	dict_table_t*	table,		/*!< in/out: table */
 	ibool		dict_locked);	/*!< in: TRUE=data dictionary locked */
 /**********************************************************************//**
@@ -138,7 +139,7 @@ dict_col_copy_type(
 	dtype_t*		type);	/*!< out: data type */
 /**********************************************************************//**
 Determine bytes of column prefix to be stored in the undo log. Please
-note if the table format is UNIV_FORMAT_A (< DICT_TF_FORMAT_ZIP), no prefix
+note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix
 needs to be stored in the undo log.
 @return bytes of column prefix to be stored in the undo log */
 UNIV_INLINE
@@ -280,8 +281,9 @@ UNIV_INTERN
 void
 dict_table_add_to_cache(
 /*====================*/
-	dict_table_t*	table,	/*!< in: table */
-	mem_heap_t*	heap);	/*!< in: temporary heap */
+	dict_table_t*	table,		/*!< in: table */
+	ibool		can_be_evicted,	/*!< in: TRUE if can be evicted*/
+	mem_heap_t*	heap);		/*!< in: temporary heap */
 /**********************************************************************//**
 Removes a table object from the dictionary cache. */
 UNIV_INTERN
@@ -360,6 +362,16 @@ dict_table_replace_index_in_foreign_list(
 	dict_table_t*	table,  /*!< in/out: table */
 	dict_index_t*	index,	/*!< in: index to be replaced */
 	const trx_t*	trx);	/*!< in: transaction handle */
+/**********************************************************************//**
+Determines whether a string starts with the specified keyword.
+@return TRUE if str starts with keyword */
+UNIV_INTERN
+ibool
+dict_str_starts_with_keyword(
+/*=========================*/
+	void*		mysql_thd,	/*!< in: MySQL thread handle */
+	const char*	str,		/*!< in: string to scan for keyword */
+	const char*	keyword);	/*!< in: keyword to look for */
 /*********************************************************************//**
 Checks if a index is defined for a foreign key constraint. Index is a part
 of a foreign key constraint if the index is referenced by foreign key
@@ -417,66 +429,32 @@ dict_foreign_parse_drop_constraints(
 	const char***	constraints_to_drop);	/*!< out: id's of the
 						constraints to drop */
 /**********************************************************************//**
-Returns a table object and optionally increment its MySQL open handle count.
+Returns a table object and increments its open handle count.
 NOTE! This is a high-level function to be used mainly from outside the
-'dict' directory. Inside this directory dict_table_get_low is usually the
-appropriate function.
+'dict' directory. Inside this directory dict_table_get_low
+is usually the appropriate function.
 @return	table, NULL if does not exist */
 UNIV_INTERN
 dict_table_t*
-dict_table_get(
-/*===========*/
+dict_table_open_on_name(
+/*====================*/
 	const char*	table_name,	/*!< in: table name */
-	ibool		inc_mysql_count);
-					/*!< in: whether to increment the open
-					handle count on the table */
+	ibool		dict_locked);	/*!< in: TRUE=data dictionary locked */
+
 /**********************************************************************//**
-Returns a index object, based on table and index id, and memoryfixes it.
-@return	index, NULL if does not exist */
+Returns a table object and increment its open handle count. Table
+statistics will not be updated if they are not initialized.
+Call this function when dropping a table.
+@return	table, NULL if does not exist */
 UNIV_INTERN
-dict_index_t*
-dict_index_get_on_id_low(
-/*=====================*/
-	dict_table_t*	table,		/*!< in: table */
-	index_id_t	index_id);	/*!< in: index id */
-/**********************************************************************//**
-Checks if a table is in the dictionary cache.
-@return	table, NULL if not found */
-
-UNIV_INLINE
 dict_table_t*
-dict_table_check_if_in_cache_low(
+dict_table_open_on_name_no_stats(
 /*=============================*/
-	const char*	table_name);	/*!< in: table name */
-/**********************************************************************//**
-Gets a table; loads it to the dictionary cache if necessary. A low-level
-function.
-@return	table, NULL if not found */
-UNIV_INLINE
-dict_table_t*
-dict_table_get_low_ignore_err(
-/*===========================*/
 	const char*	table_name,	/*!< in: table name */
+	ibool		dict_locked,	/*!< in: TRUE=data dictionary locked */
 	dict_err_ignore_t
 			ignore_err);	/*!< in: error to be ignored when
-					loading a table definition */
-/**********************************************************************//**
-Gets a table; loads it to the dictionary cache if necessary. A low-level
-function.
-@return	table, NULL if not found */
-UNIV_INLINE
-dict_table_t*
-dict_table_get_low(
-/*===============*/
-	const char*	table_name);	/*!< in: table name */
-/**********************************************************************//**
-Returns a table object based on table id.
-@return	table, NULL if does not exist */
-UNIV_INLINE
-dict_table_t*
-dict_table_get_on_id_low(
-/*=====================*/
-	table_id_t	table_id);	/*!< in: table id */
+					loading the table */
 /**********************************************************************//**
 Find an index that is equivalent to the one passed in and is not marked
 for deletion.
@@ -627,6 +605,15 @@ dict_index_is_ibuf(
 	const dict_index_t*	index)	/*!< in: index */
 	__attribute__((nonnull, pure, warn_unused_result));
 /********************************************************************//**
+Check whether the index is an universal index tree.
+@return	nonzero for universal tree, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_univ(
+/*===============*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
 Check whether the index is a secondary index or the insert buffer tree.
 @return	nonzero for insert buffer, zero for other indexes */
 UNIV_INLINE
@@ -636,6 +623,16 @@ dict_index_is_sec_or_ibuf(
 	const dict_index_t*	index)	/*!< in: index */
 	__attribute__((nonnull, pure, warn_unused_result));
 
+/************************************************************************
+Gets the all the FTS indexes for the table. NOTE: must not be called for
+tables which do not have an FTS-index. */
+
+ulint
+dict_table_get_all_fts_indexes(
+/*===========================*/
+				/* out: number of indexes collected */
+	dict_table_t*	table,	/* in: table */
+	ib_vector_t*	indexes);/* out: vector for collecting FTS indexes */
 /********************************************************************//**
 Gets the number of user-defined columns in a table in the dictionary
 cache.
@@ -726,21 +723,54 @@ dict_table_get_format(
 /*==================*/
 	const dict_table_t*	table);	/*!< in: table */
 /********************************************************************//**
-Set the file format of a table. */
+Determine the file format from a dict_table_t::flags.
+@return	file format version */
+UNIV_INLINE
+ulint
+dict_tf_get_format(
+/*===============*/
+	ulint		flags);		/*!< in: dict_table_t::flags */
+/********************************************************************//**
+Set the various values in a dict_table_t::flags pointer. */
 UNIV_INLINE
 void
-dict_table_set_format(
-/*==================*/
-	dict_table_t*	table,	/*!< in/out: table */
-	ulint		format);/*!< in: file format version */
+dict_tf_set(
+/*========*/
+	ulint*		flags,		/*!< in/out: table */
+	rec_format_t	format,		/*!< in: file format */
+	ulint		zip_ssize);	/*!< in: zip shift size */
+/********************************************************************//**
+Convert a 32 bit integer table flags to the 32 bit integer that is
+written into the tablespace header at the offset FSP_SPACE_FLAGS and is
+also stored in the fil_space_t::flags field.  The following chart shows
+the translation of the low order bit.  Other bits are the same.
+========================= Low order bit ==========================
+                    | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags |     0     |    1    |     1      |    1
+fil_space_t::flags  |     0     |    0    |     1      |    1
+==================================================================
+@return	tablespace flags (fil_space_t::flags) */
+UNIV_INLINE
+ulint
+dict_tf_to_fsp_flags(
+/*=================*/
+	ulint	flags)	/*!< in: dict_table_t::flags */
+	__attribute__((const));
+/********************************************************************/
+UNIV_INLINE
+ulint
+dict_tf_to_sys_tables_type(
+/*=======================*/
+	ulint	flags)	/*!< in: dict_table_t::flags */
+	__attribute__((const));
 /********************************************************************//**
 Extract the compressed page size from table flags.
 @return	compressed page size, or 0 if not compressed */
 UNIV_INLINE
 ulint
-dict_table_flags_to_zip_size(
-/*=========================*/
-	ulint	flags)	/*!< in: flags */
+dict_tf_get_zip_size(
+/*=================*/
+	ulint	flags)			/*!< in: flags */
 	__attribute__((const));
 /********************************************************************//**
 Check whether the table uses the compressed compact page format.
@@ -750,6 +780,7 @@ ulint
 dict_table_zip_size(
 /*================*/
 	const dict_table_t*	table);	/*!< in: table */
+#ifndef UNIV_HOTBACKUP
 /*********************************************************************//**
 Obtain exclusive locks on all index trees of the table. This is to prevent
 accessing index trees while InnoDB is updating internal metadata for
@@ -776,7 +807,43 @@ dict_table_col_in_clustered_key(
 /*============================*/
 	const dict_table_t*	table,	/*!< in: table */
 	ulint			n);	/*!< in: column number */
-#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Check if the table has an FTS index.
+@return TRUE if table has an FTS index */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+	dict_table_t*   table);		/*!< in: table */
+/*******************************************************************//**
+Validate and return the table flags.
+@return	Same as input after validating it as dict_table_t::flags.
+If there is an error, trigger assertion failure. */
+UNIV_INLINE
+ulint
+dict_tf_validate(
+/*=============*/
+         ulint	flags);		/*!< in: table flags */
+/********************************************************************//**
+Validate a SYS_TABLES TYPE field and return it.
+@return	Same as input after validating it as a SYS_TABLES TYPE field.
+If there is an error, return ULINT_UNDEFINED. */
+UNIV_INLINE
+ulint
+dict_sys_tables_type_validate(
+/*==========================*/
+	ulint	type,		/*!< in: SYS_TABLES.TYPE */
+	ulint	n_cols);	/*!< in: SYS_TABLES.N_COLS */
+/********************************************************************//**
+Determine the file format from dict_table_t::flags
+The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any
+other row_format, file_format is > 0 and DICT_TF_COMPACT will also be set.
+@return	file format version */
+UNIV_INLINE
+rec_format_t
+dict_tf_get_rec_format(
+/*===================*/
+        ulint		flags);	/*!< in: dict_table_t::flags */
 /*******************************************************************//**
 Copies types of columns contained in table to tuple and sets all
 fields of the tuple to the SQL NULL value.  This function should
@@ -787,6 +854,17 @@ dict_table_copy_types(
 /*==================*/
 	dtuple_t*		tuple,	/*!< in/out: data tuple */
 	const dict_table_t*	table);	/*!< in: table */
+/********************************************************************
+Wait until all the background threads of the given table have exited, i.e.,
+bg_threads == 0. Note: bg_threads_mutex must be reserved when
+calling this. */
+
+void
+dict_table_wait_for_bg_threads_to_exit(
+/*===================================*/
+	dict_table_t*	table,	/* in: table */
+	ulint		delay);	/* in: time in microseconds to wait between
+				checks of bg_threads. */
 /**********************************************************************//**
 Looks for an index with the given id. NOTE that we do not reserve
 the dictionary mutex: this function is for emergency purposes like
@@ -798,6 +876,17 @@ dict_index_find_on_id_low(
 /*======================*/
 	index_id_t	id);	/*!< in: index id */
 /**********************************************************************//**
+Make room in the table cache by evicting an unused table. The unused table
+should not be part of FK relationship and currently not used in any user
+transaction. There is no guarantee that it will remove a table.
+@return number of tables evicted. */
+UNIV_INTERN
+ulint
+dict_make_room_in_cache(
+/*====================*/
+	ulint		max_tables,	/*!< in: max tables allowed in cache */
+	ulint		pct_check);	/*!< in: max percent to check */
+/**********************************************************************//**
 Adds an index to the dictionary cache.
 @return	DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */
 UNIV_INTERN
@@ -901,13 +990,25 @@ dict_index_get_nth_col_no(
 Looks for column n in an index.
 @return position in internal representation of the index;
 ULINT_UNDEFINED if not contained */
-UNIV_INTERN
+UNIV_INLINE
 ulint
 dict_index_get_nth_col_pos(
 /*=======================*/
 	const dict_index_t*	index,	/*!< in: index */
 	ulint			n);	/*!< in: column number */
 /********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_col_or_prefix_pos(
+/*=================================*/
+	const dict_index_t*	index,		/*!< in: index */
+	ulint			n,		/*!< in: column number */
+	ibool			inc_prefix);	/*!< in: TRUE=consider
+						column prefixes too */
+/********************************************************************//**
 Returns TRUE if the index contains a column or a prefix of that column.
 @return	TRUE if contains the column or its prefix */
 UNIV_INTERN
@@ -1110,18 +1211,6 @@ ulint
 dict_index_calc_min_rec_len(
 /*========================*/
 	const dict_index_t*	index);	/*!< in: index */
-/*********************************************************************//**
-Calculates new estimates for table and index statistics. The statistics
-are used in query optimization. */
-UNIV_INTERN
-void
-dict_update_statistics(
-/*===================*/
-	dict_table_t*	table,		/*!< in/out: table */
-	ibool		only_calc_if_missing_stats);/*!< in: only
-					update/recalc the stats if they have
-					not been initialized yet, otherwise
-					do nothing */
 /********************************************************************//**
 Reserves the dictionary system mutex for MySQL. */
 UNIV_INTERN
@@ -1192,6 +1281,37 @@ dict_table_get_index_on_name_and_min_id(
 /*====================================*/
 	dict_table_t*	table,	/*!< in: table */
 	const char*	name);	/*!< in: name of the index to find */
+/***************************************************************
+Check whether a column exists in an FTS index. */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+				/* out: ULINT_UNDEFINED if no match else
+				the offset within the vector */
+	ib_vector_t*	indexes,/* in: vector containing only FTS indexes */
+	ulint		col_no);/* in: col number to search for */
+/**********************************************************************//**
+Move a table to the non LRU end of the LRU list. */
+UNIV_INTERN
+void
+dict_table_move_from_lru_to_non_lru(
+/*================================*/
+	dict_table_t*	table);	/*!< in: table to move from LRU to non-LRU */
+/**********************************************************************//**
+Move a table to the LRU list from the non-LRU list. */
+UNIV_INTERN
+void
+dict_table_move_from_non_lru_to_lru(
+/*================================*/
+	dict_table_t*	table);	/*!< in: table to move from non-LRU to LRU */
+/**********************************************************************//**
+Move to the most recently used segment of the LRU list. */
+UNIV_INTERN
+void
+dict_move_to_mru(
+/*=============*/
+	dict_table_t*	table);	/*!< in: table to move to MRU */
 /* Buffers for storing detailed information about the latest foreign key
 and unique key errors */
 extern FILE*	dict_foreign_err_file;
@@ -1221,8 +1341,6 @@ struct dict_sys_struct{
 					on name */
 	hash_table_t*	table_id_hash;	/*!< hash table of the tables, based
 					on id */
-	UT_LIST_BASE_NODE_T(dict_table_t)
-			table_LRU;	/*!< LRU list of tables */
 	ulint		size;		/*!< varying space in bytes occupied
 					by the data dictionary table and
 					index objects */
@@ -1230,6 +1348,14 @@ struct dict_sys_struct{
 	dict_table_t*	sys_columns;	/*!< SYS_COLUMNS table */
 	dict_table_t*	sys_indexes;	/*!< SYS_INDEXES table */
 	dict_table_t*	sys_fields;	/*!< SYS_FIELDS table */
+
+	/*=============================*/
+	UT_LIST_BASE_NODE_T(dict_table_t)
+			table_LRU;	/*!< List of tables that can be evicted
+					from the cache */
+	UT_LIST_BASE_NODE_T(dict_table_t)
+			table_non_LRU;	/*!< List of tables that can't be
+					evicted from the cache */
 };
 #endif /* !UNIV_HOTBACKUP */
 
@@ -1245,6 +1371,56 @@ void
 dict_ind_init(void);
 /*===============*/
 
+/* Auxiliary structs for checking a table definition @{ */
+
+/* This struct is used to specify the name and type that a column must
+have when checking a table's schema. */
+struct dict_col_meta_struct {
+	const char*	name;		/* column name */
+	ulint		mtype;		/* required column main type */
+	ulint		prtype_mask;	/* required column precise type mask;
+					if this is non-zero then all the
+					bits it has set must also be set
+					in the column's prtype */
+	ulint		len;		/* required column length */
+};
+typedef struct dict_col_meta_struct dict_col_meta_t;
+
+/* This struct is used for checking whether a given table exists and
+whether it has a predefined schema (number of columns and columns names
+and types) */
+struct dict_table_schema_struct {
+	const char*		table_name;	/* the name of the table whose
+						structure we are checking */
+	ulint			n_cols;		/* the number of columns the
+						table must have */
+	dict_col_meta_t*	columns;	/* metadata for the columns;
+						this array has n_cols
+						elements */
+};
+typedef struct dict_table_schema_struct dict_table_schema_t;
+/* @} */
+
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The table must have the same number of columns with the same names and
+types. The order of the columns does not matter.
+The caller must own the dictionary mutex.
+dict_table_schema_check() @{
+@return DB_SUCCESS if the table exists and contains the necessary columns */
+UNIV_INTERN
+enum db_err
+dict_table_schema_check(
+/*====================*/
+	dict_table_schema_t*	req_schema,	/*!< in/out: required table
+						schema */
+	char*			errstr,		/*!< out: human readable error
+						message if != DB_SUCCESS and
+						!= DB_TABLE_NOT_FOUND is
+						returned */
+	size_t			errstr_sz);	/*!< in: errstr size */
+/* @} */
+
 /**********************************************************************//**
 Closes the data dictionary module. */
 UNIV_INTERN
diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic
index 7533ce01401..f6585ea8205 100644
--- a/storage/innobase/include/dict0dict.ic
+++ b/storage/innobase/include/dict0dict.ic
@@ -27,6 +27,7 @@ Created 1/8/1996 Heikki Tuuri
 #ifndef UNIV_HOTBACKUP
 #include "dict0load.h"
 #include "rem0types.h"
+#include "fsp0fsp.h"
 #include "srv0srv.h"
 
 /*********************************************************************//**
@@ -103,7 +104,7 @@ dict_col_type_assert_equal(
 
 	ut_ad(col->mtype == type->mtype);
 	ut_ad(col->prtype == type->prtype);
-	ut_ad(col->len == type->len);
+	//ut_ad(col->len == type->len);
 # ifndef UNIV_HOTBACKUP
 	ut_ad(col->mbminmaxlen == type->mbminmaxlen);
 # endif /* !UNIV_HOTBACKUP */
@@ -145,7 +146,7 @@ ulint
 dict_col_get_fixed_size(
 /*====================*/
 	const dict_col_t*	col,	/*!< in: column */
-	ulint			comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+	ulint			comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT */
 {
 	return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len,
 					col->mbminmaxlen, comp));
@@ -250,7 +251,7 @@ dict_index_is_clust(
 	ut_ad(index);
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
 
-	return(UNIV_UNLIKELY(index->type & DICT_CLUSTERED));
+	return(index->type & DICT_CLUSTERED);
 }
 /********************************************************************//**
 Check whether the index is unique.
@@ -264,7 +265,7 @@ dict_index_is_unique(
 	ut_ad(index);
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
 
-	return(UNIV_UNLIKELY(index->type & DICT_UNIQUE));
+	return(index->type & DICT_UNIQUE);
 }
 
 /********************************************************************//**
@@ -279,7 +280,22 @@ dict_index_is_ibuf(
 	ut_ad(index);
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
 
-	return(UNIV_UNLIKELY(index->type & DICT_IBUF));
+	return(index->type & DICT_IBUF);
+}
+
+/********************************************************************//**
+Check whether the index is an universal index tree.
+@return	nonzero for universal tree, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_univ(
+/*===============*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	return(index->type & DICT_UNIVERSAL);
 }
 
 /********************************************************************//**
@@ -298,7 +314,7 @@ dict_index_is_sec_or_ibuf(
 
 	type = index->type;
 
-	return(UNIV_LIKELY(!(type & DICT_CLUSTERED) || (type & DICT_IBUF)));
+	return(!(type & DICT_CLUSTERED) || (type & DICT_IBUF));
 }
 
 /********************************************************************//**
@@ -420,11 +436,185 @@ dict_table_is_comp(
 {
 	ut_ad(table);
 
-#if DICT_TF_COMPACT != TRUE
-#error
+#if DICT_TF_COMPACT != 1
+#error "DICT_TF_COMPACT must be 1"
 #endif
 
-	return(UNIV_LIKELY(table->flags & DICT_TF_COMPACT));
+	return(table->flags & DICT_TF_COMPACT);
+}
+
+/************************************************************************
+Check if the table has an FTS index. */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+				/* out: TRUE if table has an FTS index */
+	dict_table_t*   table)  /* in: table */
+{
+	ut_ad(table);
+
+	return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS));
+}
+
+/********************************************************************//**
+Validate and return the table flags.
+@return	Same as input after validating it as dict_table_t::flags.
+If there is an error, trigger assertion failure. */
+UNIV_INLINE
+ulint
+dict_tf_validate(
+/*=============*/
+	ulint	flags)		/*!< in: table flags */
+{
+	ulint	compact = DICT_TF_GET_COMPACT(flags);
+	ulint	zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
+	ulint	atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags);
+	ulint	unused = DICT_TF_GET_UNUSED(flags);
+
+	/* Make sure there are no bits that we do not know about. */
+	ut_a(unused == 0);
+
+	if (atomic_blobs) {
+		/* Barracuda row formats COMPRESSED and DYNAMIC build on
+		the page structure introduced for the COMPACT row format
+		by allowing keys in secondary indexes to be made from
+		data stored off-page in the clustered index. */
+		ut_a(compact);
+	} else {
+		/* Antelope does not support COMPRESSED row format. */
+		ut_a(!zip_ssize);
+	}
+
+	if (zip_ssize) {
+		/* COMPRESSED row format must have compact and atomic_blobs
+		bits set. */
+		ut_a(compact);
+		ut_a(atomic_blobs);
+
+		/* Validate the number is within allowed range. */
+		ut_a(zip_ssize <= PAGE_ZIP_SSIZE_MAX);
+	}
+
+	/* Return the flags sent if we did not crash. */
+	return(flags);
+}
+
+/********************************************************************//**
+Validate a SYS_TABLES TYPE field and return it.
+@return	Same as input after validating it as a SYS_TABLES TYPE field.
+If there is an error, return ULINT_UNDEFINED. */
+UNIV_INLINE
+ulint
+dict_sys_tables_type_validate(
+/*==========================*/
+	ulint	type,		/*!< in: SYS_TABLES.TYPE */
+	ulint	n_cols)		/*!< in: SYS_TABLES.N_COLS */
+{
+	ulint	low_order_bit = DICT_TF_GET_COMPACT(type);
+	ulint	redundant = !(n_cols & DICT_N_COLS_COMPACT);
+	ulint	zip_ssize = DICT_TF_GET_ZIP_SSIZE(type);
+	ulint	atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type);
+	ulint	unused = DICT_TF_GET_UNUSED(type);
+
+	/* If the format is UNIV_FORMAT_A, table->flags == 0, but
+	SYS_TABLES.TYPE == 1, which is defined as SYS_TABLE_TYPE_ANTELOPE.
+	The low order bit of SYS_TABLES.TYPE is always set to 1.
+	If the format is UNIV_FORMAT_B or higher, this field is the same
+	as dict_table_t::flags. Zero is not allowed here. */
+	if (!low_order_bit) {
+		return(ULINT_UNDEFINED);
+	}
+
+	if (redundant) {
+		/* This is Redundant row format, only the first bit
+		should be set in SYS_TABLES.TYPE */
+		if (type != SYS_TABLE_TYPE_ANTELOPE) {
+			return(ULINT_UNDEFINED);
+		}
+		return(DICT_TF_REDUNDANT);
+	}
+
+	/* Make sure there are no bits that we do not know about. */
+	if (unused) {
+		return(ULINT_UNDEFINED);
+	}
+
+	if (atomic_blobs) {
+		/* Barracuda row formats COMPRESSED and DYNAMIC build on
+		the page structure introduced for the COMPACT row format
+		by allowing keys in secondary indexes to be made from
+		data stored off-page in the clustered index.
+
+		The DICT_N_COLS_COMPACT flag should be in N_COLS,
+		but we already know that. */
+
+	} else if (zip_ssize) {
+		/* Antelope does not support COMPRESSED format. */
+		return(ULINT_UNDEFINED);
+	}
+
+	if (zip_ssize) {
+		/* COMPRESSED row format must have low_order_bit and
+		atomic_blobs bits set and the DICT_N_COLS_COMPACT flag
+		should be in N_COLS, but we already know about the
+		low_order_bit and DICT_N_COLS_COMPACT flags. */
+		if (!atomic_blobs) {
+			return(ULINT_UNDEFINED);
+		}
+
+		/* Validate that the number is within allowed range. */
+		if (zip_ssize > PAGE_ZIP_SSIZE_MAX) {
+			return(ULINT_UNDEFINED);
+		}
+	}
+
+	/* Return the validated SYS_TABLES.TYPE. */
+	return(type);
+}
+
+/********************************************************************//**
+Determine the file format from dict_table_t::flags
+The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any
+other row_format, file_format is > 0 and DICT_TF_COMPACT will also be set.
+@return	file format version */
+UNIV_INLINE
+rec_format_t
+dict_tf_get_rec_format(
+/*===================*/
+	ulint		flags)	/*!< in: dict_table_t::flags */
+{
+	dict_tf_validate(flags);
+
+	if (!DICT_TF_GET_COMPACT(flags)) {
+		return(REC_FORMAT_REDUNDANT);
+	}
+
+	if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+		return(REC_FORMAT_COMPACT);
+	}
+
+	if (DICT_TF_GET_ZIP_SSIZE(flags)) {
+		return(REC_FORMAT_COMPRESSED);
+	}
+
+	return(REC_FORMAT_DYNAMIC);
+}
+
+/********************************************************************//**
+Determine the file format from a dict_table_t::flags.
+@return	file format version */
+UNIV_INLINE
+ulint
+dict_tf_get_format(
+/*===============*/
+	ulint		flags)	/*!< in: dict_table_t::flags */
+{
+	if (DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+		return(UNIV_FORMAT_B);
+	}
+
+	return(UNIV_FORMAT_A);
 }
 
 /********************************************************************//**
@@ -438,41 +628,109 @@ dict_table_get_format(
 {
 	ut_ad(table);
 
-	return((table->flags & DICT_TF_FORMAT_MASK) >> DICT_TF_FORMAT_SHIFT);
+	return(dict_tf_get_format(table->flags));
 }
 
 /********************************************************************//**
-Determine the file format of a table. */
+Set the file format and zip size in a dict_table_t::flags.  If zip size
+is not needed, it should be 0. */
 UNIV_INLINE
 void
-dict_table_set_format(
-/*==================*/
-	dict_table_t*	table,	/*!< in/out: table */
-	ulint		format)	/*!< in: file format version */
+dict_tf_set(
+/*========*/
+	ulint*		flags,		/*!< in/out: table flags */
+	rec_format_t	format,		/*!< in: file format */
+	ulint		zip_ssize)	/*!< in: zip shift size */
 {
-	ut_ad(table);
+	switch (format) {
+	case REC_FORMAT_REDUNDANT:
+		*flags = 0;
+		ut_ad(zip_ssize == 0);
+		break;
+	case REC_FORMAT_COMPACT:
+		*flags = DICT_TF_COMPACT;
+		ut_ad(zip_ssize == 0);
+		break;
+	case REC_FORMAT_COMPRESSED:
+		*flags = DICT_TF_COMPACT
+			| (1 << DICT_TF_POS_ATOMIC_BLOBS)
+			| (zip_ssize << DICT_TF_POS_ZIP_SSIZE);
+		break;
+	case REC_FORMAT_DYNAMIC:
+		*flags = DICT_TF_COMPACT
+			| (1 << DICT_TF_POS_ATOMIC_BLOBS);
+		ut_ad(zip_ssize == 0);
+		break;
+	}
+}
+
+/********************************************************************//**
+Convert a 32 bit integer table flags to the 32 bit integer that is
+written into the tablespace header at the offset FSP_SPACE_FLAGS and is
+also stored in the fil_space_t::flags field.  The following chart shows
+the translation of the low order bit.  Other bits are the same.
+========================= Low order bit ==========================
+                    | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags |     0     |    1    |     1      |    1
+fil_space_t::flags  |     0     |    0    |     1      |    1
+==================================================================
+@return	tablespace flags (fil_space_t::flags) */
+UNIV_INLINE
+ulint
+dict_tf_to_fsp_flags(
+/*=================*/
+	ulint	flags)	/*!< in: dict_table_t::flags */
+{
+	/* Adjust bit zero. */
+	flags = (flags == DICT_TF_COMPACT) ? 0 : flags;
+
+	/* In addition, tablespace flags also contain the page size. */
+	flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE);
 
-	table->flags = (table->flags & ~DICT_TF_FORMAT_MASK)
-		| (format << DICT_TF_FORMAT_SHIFT);
+	return(fsp_flags_validate(flags));
 }
 
 /********************************************************************//**
-Extract the compressed page size from table flags.
+Convert a 32 bit integer table flags to the 32bit integer that is written
+to a SYS_TABLES.TYPE field. The following chart shows the translation of
+the low order bit.  Other bits are the same.
+========================= Low order bit ==========================
+                    | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+dict_table_t::flags |     0     |    1    |     1
+SYS_TABLES.TYPE     |     1     |    1    |     1
+==================================================================
+@return	ulint containing SYS_TABLES.TYPE */
+UNIV_INLINE
+ulint
+dict_tf_to_sys_tables_type(
+/*=======================*/
+	ulint	flags)	/*!< in: dict_table_t::flags */
+{
+	if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+		ut_a(flags == DICT_TF_REDUNDANT
+		     || flags == DICT_TF_COMPACT);
+		return(SYS_TABLE_TYPE_ANTELOPE);
+	}
+
+	return(dict_tf_validate(flags));
+}
+
+/********************************************************************//**
+Extract the compressed page size from dict_table_t::flags.
+These flags are in memory, so assert that they are valid.
 @return	compressed page size, or 0 if not compressed */
 UNIV_INLINE
 ulint
-dict_table_flags_to_zip_size(
-/*=========================*/
+dict_tf_get_zip_size(
+/*=================*/
 	ulint	flags)	/*!< in: flags */
 {
-	ulint	zip_size = flags & DICT_TF_ZSSIZE_MASK;
+	ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
+	ulint zip_size = (zip_ssize
+			  ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize
+			  : 0);
 
-	if (UNIV_UNLIKELY(zip_size)) {
-		zip_size = ((PAGE_ZIP_MIN_SIZE >> 1)
-			 << (zip_size >> DICT_TF_ZSSIZE_SHIFT));
-
-		ut_ad(zip_size <= UNIV_PAGE_SIZE);
-	}
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 
 	return(zip_size);
 }
@@ -488,9 +746,10 @@ dict_table_zip_size(
 {
 	ut_ad(table);
 
-	return(dict_table_flags_to_zip_size(table->flags));
+	return(dict_tf_get_zip_size(table->flags));
 }
 
+#ifndef UNIV_HOTBACKUP
 /*********************************************************************//**
 Obtain exclusive locks on all index trees of the table. This is to prevent
 accessing index trees while InnoDB is updating internal metadata for
@@ -533,6 +792,8 @@ dict_table_x_unlock_indexes(
 		rw_lock_x_unlock(dict_index_get_lock(index));
 	}
 }
+#endif /* !UNIV_HOTBACKUP */
+
 /********************************************************************//**
 Gets the number of fields in the internal representation of an index,
 including fields added by the dictionary system.
@@ -642,7 +903,7 @@ dict_index_get_sys_col_pos(
 {
 	ut_ad(index);
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
-	ut_ad(!(index->type & DICT_UNIVERSAL));
+	ut_ad(!dict_index_is_univ(index));
 
 	if (dict_index_is_clust(index)) {
 
@@ -695,6 +956,20 @@ dict_index_get_nth_col_no(
 	return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
 }
 
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			n)	/*!< in: column number */
+{
+	return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE));
+}
+
 #ifndef UNIV_HOTBACKUP
 /********************************************************************//**
 Returns the minimum data size of an index record.
@@ -790,129 +1065,35 @@ dict_index_get_space_reserve(void)
 }
 
 /**********************************************************************//**
-Checks if a table is in the dictionary cache.
-@return	table, NULL if not found */
+Check whether a column exists in an FTS index.
+@return ULINT_UNDEFINED if no match else the offset within the vector */
 UNIV_INLINE
-dict_table_t*
-dict_table_check_if_in_cache_low(
-/*=============================*/
-	const char*	table_name)	/*!< in: table name */
-{
-	dict_table_t*	table;
-	ulint		table_fold;
-
-	ut_ad(table_name);
-	ut_ad(mutex_own(&(dict_sys->mutex)));
-
-	/* Look for the table name in the hash table */
-	table_fold = ut_fold_string(table_name);
-
-	HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold,
-		    dict_table_t*, table, ut_ad(table->cached),
-		    !strcmp(table->name, table_name));
-	return(table);
-}
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+	ib_vector_t*	indexes,/*!< in: vector containing only FTS indexes */
+	ulint		col_no)	/*!< in: col number to search for */
 
-/**********************************************************************//**
-load a table into dictionary cache, ignore any error specified during load;
-@return	table, NULL if not found */
-UNIV_INLINE
-dict_table_t*
-dict_table_get_low_ignore_err(
-/*==========================*/
-	const char*	table_name,	/*!< in: table name */
-	dict_err_ignore_t
-			ignore_err)	/*!< in: error to be ignored when
-					loading a table definition */
 {
-	dict_table_t*	table;
+	ulint		i;
 
-	ut_ad(table_name);
-	ut_ad(mutex_own(&(dict_sys->mutex)));
+	for (i = 0; i < ib_vector_size(indexes); ++i) {
+		dict_index_t*	index;
 
-	table = dict_table_check_if_in_cache_low(table_name);
-
-	if (table == NULL) {
-		table = dict_load_table(table_name, TRUE, ignore_err);
-	}
+		index = (dict_index_t*) ib_vector_getp(indexes, i);
 
-	ut_ad(!table || table->cached);
-
-	return(table);
-}
-
-/**********************************************************************//**
-Gets a table; loads it to the dictionary cache if necessary. A low-level
-function.
-@return	table, NULL if not found */
-UNIV_INLINE
-dict_table_t*
-dict_table_get_low(
-/*===============*/
-	const char*	table_name)	/*!< in: table name */
-{
-	dict_table_t*	table;
-
-	ut_ad(table_name);
-	ut_ad(mutex_own(&(dict_sys->mutex)));
+		if (dict_index_contains_col_or_prefix(index, col_no)) {
 
-	table = dict_table_check_if_in_cache_low(table_name);
-
-	if (table && table->corrupted) {
-		fprintf(stderr, "InnoDB: table");
-		ut_print_name(stderr, NULL, TRUE, table->name);
-		if (srv_load_corrupted) {
-			fputs(" is corrupted, but"
-			      " innodb_force_load_corrupted is set\n", stderr);
-		} else {
-			fputs(" is corrupted\n", stderr);
-			return(NULL);
+			return(i);
 		}
 	}
 
-	if (table == NULL) {
-		table = dict_load_table(table_name, TRUE, DICT_ERR_IGNORE_NONE);
-	}
-
-	ut_ad(!table || table->cached);
-
-	return(table);
-}
-
-/**********************************************************************//**
-Returns a table object based on table id.
-@return	table, NULL if does not exist */
-UNIV_INLINE
-dict_table_t*
-dict_table_get_on_id_low(
-/*=====================*/
-	table_id_t	table_id)	/*!< in: table id */
-{
-	dict_table_t*	table;
-	ulint		fold;
-
-	ut_ad(mutex_own(&(dict_sys->mutex)));
-
-	/* Look for the table name in the hash table */
-	fold = ut_fold_ull(table_id);
-
-	HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
-		    dict_table_t*, table, ut_ad(table->cached),
-		    table->id == table_id);
-	if (table == NULL) {
-		table = dict_load_table_on_id(table_id);
-	}
-
-	ut_ad(!table || table->cached);
-
-	/* TODO: should get the type information from MySQL */
-
-	return(table);
+	return(ULINT_UNDEFINED);
 }
 
 /**********************************************************************//**
 Determine bytes of column prefix to be stored in the undo log. Please
-note if the table format is UNIV_FORMAT_A (< DICT_TF_FORMAT_ZIP), no prefix
+note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix
 needs to be stored in the undo log.
 @return bytes of column prefix to be stored in the undo log */
 UNIV_INLINE
@@ -923,9 +1104,9 @@ dict_max_field_len_store_undo(
 	const dict_col_t*	col)	/*!< in: column which index prefix
 					is based on */
 {
-	ulint   prefix_len = 0;
+	ulint	prefix_len = 0;
 
-	if (dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP)
+	if (dict_table_get_format(table) >= UNIV_FORMAT_B)
 	{
 		prefix_len = col->max_prefix
 			? col->max_prefix
@@ -947,7 +1128,7 @@ dict_table_is_corrupted(
 	ut_ad(table);
 	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
 
-	return(UNIV_UNLIKELY(table->corrupted));
+	return(table->corrupted);
 }
 
 /********************************************************************//**
@@ -962,8 +1143,8 @@ dict_index_is_corrupted(
 	ut_ad(index);
 	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
 
-	return(UNIV_UNLIKELY((index->type & DICT_CORRUPT)
-	       || (index->table && index->table->corrupted)));
+	return((index->type & DICT_CORRUPT)
+	       || (index->table && index->table->corrupted));
 }
 
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
index 16177ade713..13b9a121c1c 100644
--- a/storage/innobase/include/dict0load.h
+++ b/storage/innobase/include/dict0load.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -48,7 +48,7 @@ enum dict_system_table_id {
 
 typedef enum dict_system_table_id	dict_system_id_t;
 
-/** Status bit for dict_process_sys_tables_rec() */
+/** Status bit for dict_process_sys_tables_rec_and_mtr_commit() */
 enum dict_table_info {
 	DICT_TABLE_LOAD_FROM_RECORD = 0,/*!< Directly populate a dict_table_t
 					structure with information from
@@ -155,12 +155,7 @@ dict_load_field_low(
 	byte*		last_index_id,	/*!< in: last index id */
 	mem_heap_t*	heap,		/*!< in/out: memory heap
 					for temporary storage */
-	const rec_t*	rec,		/*!< in: SYS_FIELDS record */
-	char*		addition_err_str,/*!< out: additional error message
-					that requires information to be
-					filled, or NULL */
-	ulint		err_str_len);	/*!< in: length of addition_err_str
-					in bytes */
+	const rec_t*	rec);		/*!< in: SYS_FIELDS record */
 /********************************************************************//**
 Loads a table definition and also all its index definitions, and also
 the cluster definition if the table is a member in a cluster. Also loads
@@ -248,15 +243,17 @@ both monitor table output and information schema innodb_sys_tables output.
 @return error message, or NULL on success */
 UNIV_INTERN
 const char*
-dict_process_sys_tables_rec(
-/*========================*/
+dict_process_sys_tables_rec_and_mtr_commit(
+/*=======================================*/
 	mem_heap_t*	heap,		/*!< in: temporary memory heap */
 	const rec_t*	rec,		/*!< in: SYS_TABLES record */
 	dict_table_t**	table,		/*!< out: dict_table_t to fill */
-	dict_table_info_t status);	/*!< in: status bit controls
+	dict_table_info_t status,	/*!< in: status bit controls
 					options such as whether we shall
 					look for dict_table_t from cache
 					first */
+	mtr_t*		mtr);		/*!< in/out: mini-transaction,
+					will be committed */
 /********************************************************************//**
 This function parses a SYS_INDEXES record and populate a dict_index_t
 structure with the information from the record. For detail information
diff --git a/storage/innobase/include/dict0load.ic b/storage/innobase/include/dict0load.ic
index ccc16db165b..2c0f1ff38a5 100644
--- a/storage/innobase/include/dict0load.ic
+++ b/storage/innobase/include/dict0load.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 4c371c8d5cf..b770f7e3ca7 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -43,6 +43,10 @@ Created 1/8/1996 Heikki Tuuri
 #include "ut0byte.h"
 #include "hash0hash.h"
 #include "trx0types.h"
+#include "fts0fts.h"
+
+/* Forward declaration. */
+typedef struct ib_rbt_struct ib_rbt_t;
 
 /** Type flags of an index: OR'ing of the flags is allowed to define a
 combination of types */
@@ -54,73 +58,148 @@ combination of types */
 #define	DICT_IBUF	8	/*!< insert buffer tree */
 #define	DICT_CORRUPT	16	/*!< bit to store the corrupted flag
 				in SYS_INDEXES.TYPE */
+#define	DICT_FTS	32	/* FTS index; can't be combined with the
+				other flags */
 
-#define	DICT_IT_BITS	5	/*!< number of bits used for
+#define	DICT_IT_BITS	6	/*!< number of bits used for
 				SYS_INDEXES.TYPE */
 /* @} */
 
+#if 0 /* not implemented, retained for history */
 /** Types for a table object */
 #define DICT_TABLE_ORDINARY		1 /*!< ordinary table */
-#if 0 /* not implemented */
 #define	DICT_TABLE_CLUSTER_MEMBER	2
 #define	DICT_TABLE_CLUSTER		3 /* this means that the table is
 					  really a cluster definition */
 #endif
 
-/** Table flags.  All unused bits must be 0. */
-/* @{ */
-#define DICT_TF_COMPACT			1	/* Compact page format.
-						This must be set for
-						new file formats
-						(later than
-						DICT_TF_FORMAT_51). */
+/* Table and tablespace flags are generally not used for the Antelope file
+format except for the low order bit, which is used differently depending on
+where the flags are stored.
 
-/** Compressed page size (0=uncompressed, up to 15 compressed sizes) */
-/* @{ */
-#define DICT_TF_ZSSIZE_SHIFT		1
-#define DICT_TF_ZSSIZE_MASK		(15 << DICT_TF_ZSSIZE_SHIFT)
-#define DICT_TF_ZSSIZE_MAX (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 1)
-/* @} */
+==================== Low order flags bit =========================
+                    | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+SYS_TABLES.TYPE     |     1     |    1    |     1
+dict_table_t::flags |     0     |    1    |     1
+FSP_SPACE_FLAGS     |     0     |    0    |     1
+fil_space_t::flags  |     0     |    0    |     1
 
-/** File format */
-/* @{ */
-#define DICT_TF_FORMAT_SHIFT		5	/* file format */
-#define DICT_TF_FORMAT_MASK		\
-((~(~0 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT))) << DICT_TF_FORMAT_SHIFT)
-#define DICT_TF_FORMAT_51		0	/*!< InnoDB/MySQL up to 5.1 */
-#define DICT_TF_FORMAT_ZIP		1	/*!< InnoDB plugin for 5.1:
-						compressed tables,
-						new BLOB treatment */
-/** Maximum supported file format */
-#define DICT_TF_FORMAT_MAX		DICT_TF_FORMAT_ZIP
-
-/** Minimum supported file format */
-#define DICT_TF_FORMAT_MIN		DICT_TF_FORMAT_51
+Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1)
+and the tablespace flags field was always 0. In the 5.1 plugin, these fields
+were repurposed to identify compressed and dynamic row formats.
 
-/* @} */
-#define DICT_TF_BITS			6	/*!< number of flag bits */
-#if (1 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT)) <= DICT_TF_FORMAT_MAX
-# error "DICT_TF_BITS is insufficient for DICT_TF_FORMAT_MAX"
-#endif
+The following types and constants describe the flags found in dict_table_t
+and SYS_TABLES.TYPE.  Similar flags found in fil_space_t and FSP_SPACE_FLAGS
+are described in fsp0fsp.h. */
+
+/* @{ */
+/** SYS_TABLES.TYPE can be equal to 1 which means that the Row format
+is one of two Antelope row formats, Redundant or Compact. */
+#define SYS_TABLE_TYPE_ANTELOPE		1
+/** dict_table_t::flags can be equal to 0 if the row format = Redundant */
+#define DICT_TF_REDUNDANT		0	/*!< Redundant row format. */
+/** dict_table_t::flags can be equal to 1 if the row format = Compact */
+#define DICT_TF_COMPACT			1	/*!< Compact row format. */
+
+/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether
+the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */
+#define DICT_N_COLS_COMPACT	0x80000000UL
+
+/** Width of the COMPACT flag */
+#define DICT_TF_WIDTH_COMPACT		1
+/** Width of the ZIP_SSIZE flag */
+#define DICT_TF_WIDTH_ZIP_SSIZE		4
+/** Width of the ATOMIC_BLOBS flag.  The Antelope file formats broke up
+BLOB and TEXT fields, storing the first 768 bytes in the clustered index.
+Brracuda row formats store the whole blob or text field off-page atomically.
+Secondary indexes are created from this external data using row_ext_t
+to cache the BLOB prefixes. */
+#define DICT_TF_WIDTH_ATOMIC_BLOBS	1
+/** Width of all the currently known table flags */
+#define DICT_TF_BITS	(DICT_TF_WIDTH_COMPACT		\
+			+ DICT_TF_WIDTH_ZIP_SSIZE	\
+			+ DICT_TF_WIDTH_ATOMIC_BLOBS)
+
+/** A mask of all the known/used bits in table flags */
+#define DICT_TF_BIT_MASK	(~(~0 << DICT_TF_BITS))
+
+/** Zero relative shift position of the COMPACT field */
+#define DICT_TF_POS_COMPACT		0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define DICT_TF_POS_ZIP_SSIZE		(DICT_TF_POS_COMPACT		\
+					+ DICT_TF_WIDTH_COMPACT)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define DICT_TF_POS_ATOMIC_BLOBS	(DICT_TF_POS_ZIP_SSIZE		\
+					+ DICT_TF_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the start of the UNUSED bits */
+#define DICT_TF_POS_UNUSED		(DICT_TF_POS_ATOMIC_BLOBS	\
+					+ DICT_TF_WIDTH_ATOMIC_BLOBS)
+
+/** Bit mask of the COMPACT field */
+#define DICT_TF_MASK_COMPACT				\
+		((~(~0 << DICT_TF_WIDTH_COMPACT))	\
+		<< DICT_TF_POS_COMPACT)
+/** Bit mask of the ZIP_SSIZE field */
+#define DICT_TF_MASK_ZIP_SSIZE				\
+		((~(~0 << DICT_TF_WIDTH_ZIP_SSIZE))	\
+		<< DICT_TF_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define DICT_TF_MASK_ATOMIC_BLOBS			\
+		((~(~0 << DICT_TF_WIDTH_ATOMIC_BLOBS))	\
+		<< DICT_TF_POS_ATOMIC_BLOBS)
+
+/** Return the value of the COMPACT field */
+#define DICT_TF_GET_COMPACT(flags)			\
+		((flags & DICT_TF_MASK_COMPACT)		\
+		>> DICT_TF_POS_COMPACT)
+/** Return the value of the ZIP_SSIZE field */
+#define DICT_TF_GET_ZIP_SSIZE(flags)			\
+		((flags & DICT_TF_MASK_ZIP_SSIZE)	\
+		>> DICT_TF_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define DICT_TF_HAS_ATOMIC_BLOBS(flags)			\
+		((flags & DICT_TF_MASK_ATOMIC_BLOBS)	\
+		>> DICT_TF_POS_ATOMIC_BLOBS)
+/** Return the contents of the UNUSED bits */
+#define DICT_TF_GET_UNUSED(flags)			\
+		(flags >> DICT_TF_POS_UNUSED)
 /* @} */
 
-/** @brief Additional table flags.
+/** @brief Table Flags set number 2.
 
 These flags will be stored in SYS_TABLES.MIX_LEN.  All unused flags
 will be written as 0.  The column may contain garbage for tables
 created with old versions of InnoDB that only implemented
-ROW_FORMAT=REDUNDANT. */
+ROW_FORMAT=REDUNDANT.  InnoDB engines do not check these flags
+for unknown bits in order to protect backward incompatibility. */
 /* @{ */
-#define DICT_TF2_SHIFT			DICT_TF_BITS
-						/*!< Shift value for
-						table->flags. */
-#define DICT_TF2_TEMPORARY		1	/*!< TRUE for tables from
-						CREATE TEMPORARY TABLE. */
-#define DICT_TF2_BITS			(DICT_TF2_SHIFT + 1)
-						/*!< Total number of bits
-						in table->flags. */
+/** Total number of bits in table->flags2. */
+#define DICT_TF2_BITS			5
+#define DICT_TF2_BIT_MASK		~(~0 << DICT_TF2_BITS)
+
+/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */
+#define DICT_TF2_TEMPORARY		1
+/** The table has an internal defined DOC ID column */
+#define DICT_TF2_FTS_HAS_DOC_ID		2
+/** The table has an FTS index */
+#define DICT_TF2_FTS			4
+/** Need to add Doc ID column for FTS index build.
+This is a transient bit for index build */
+#define DICT_TF2_FTS_ADD_DOC_ID		8
+/** This bit is used during table creation to indicate that it will
+use its own tablespace instead of the system tablespace. */
+#define DICT_TF2_USE_TABLESPACE		16
 /* @} */
 
+#define DICT_TF2_FLAG_SET(table, flag)				\
+	(table->flags2 |= (flag))
+
+#define DICT_TF2_FLAG_IS_SET(table, flag)			\
+	(table->flags2 & (flag))
+
+#define DICT_TF2_FLAG_UNSET(table, flag)			\
+	(table->flags2 &= ~(flag))
+
 /** Tables could be chained together with Foreign key constraint. When
 first load the parent table, we would load all of its descedents.
 This could result in rescursive calls and out of stack error eventually.
@@ -150,7 +229,8 @@ dict_mem_table_create(
 					is ignored if the table is made
 					a member of a cluster */
 	ulint		n_cols,		/*!< in: number of columns */
-	ulint		flags);		/*!< in: table flags */
+	ulint		flags,		/*!< in: table flags */
+	ulint		flags2);	/*!< in: table flags2 */
 /****************************************************************//**
 Free a table memory object. */
 UNIV_INTERN
@@ -273,14 +353,14 @@ struct dict_col_struct{
 	/** The following are copied from dtype_t,
 	so that all bit-fields can be packed tightly. */
 	/* @{ */
-	unsigned	mtype:8;	/*!< main data type */
-	unsigned	prtype:24;	/*!< precise type; MySQL data
+	unsigned	prtype:32;	/*!< precise type; MySQL data
 					type, charset code, flags to
 					indicate nullability,
 					signedness, whether this is a
 					binary string, whether this is
 					a true VARCHAR where MySQL
 					uses 2 bytes to store the length */
+	unsigned	mtype:8;	/*!< main data type */
 
 	/* the remaining fields do not affect alphabetical ordering: */
 
@@ -327,17 +407,16 @@ files would be at risk! */
 
 /** Find out maximum indexed column length by its table format.
 For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum
-field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For new
-barracuda format, the length could be REC_VERSION_56_MAX_INDEX_COL_LEN
-(3072) bytes */
+field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For
+Barracuda row formats COMPRESSED and DYNAMIC, the length could
+be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */
 #define DICT_MAX_FIELD_LEN_BY_FORMAT(table)				\
-		((dict_table_get_format(table) < DICT_TF_FORMAT_ZIP)	\
+		((dict_table_get_format(table) < UNIV_FORMAT_B)		\
 			? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)		\
 			: REC_VERSION_56_MAX_INDEX_COL_LEN)
 
 #define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags)			\
-		((((flags & DICT_TF_FORMAT_MASK) >> DICT_TF_FORMAT_SHIFT)\
-		    < DICT_TF_FORMAT_ZIP)				\
+		((DICT_TF_HAS_ATOMIC_BLOBS(flags) < UNIV_FORMAT_B)	\
 			? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)		\
 			: REC_VERSION_56_MAX_INDEX_COL_LEN)
 
@@ -407,14 +486,19 @@ struct dict_index_struct{
 	/*----------------------*/
 	/** Statistics for query optimization */
 	/* @{ */
-	ib_int64_t*	stat_n_diff_key_vals;
+	ib_uint64_t*	stat_n_diff_key_vals;
 				/*!< approximate number of different
 				key values for this index, for each
 				n-column prefix where n <=
 				dict_get_n_unique(index); we
 				periodically calculate new
 				estimates */
-	ib_int64_t*	stat_n_non_null_key_vals;
+	ib_uint64_t*	stat_n_sample_sizes;
+				/*!< number of pages that were sampled
+				to calculate each of stat_n_diff_key_vals[],
+				e.g. stat_n_sample_sizes[3] pages were sampled
+				to get the number stat_n_diff_key_vals[3]. */
+	ib_uint64_t*	stat_n_non_null_key_vals;
 				/* approximate number of non-null key values
 				for this index, for each column where
 				n < dict_get_n_unique(index); This
@@ -436,7 +520,7 @@ struct dict_index_struct{
 #ifdef UNIV_BLOB_DEBUG
 	mutex_t		blobs_mutex;
 				/*!< mutex protecting blobs */
-	void*		blobs;	/*!< map of (page_no,heap_no,field_no)
+	ib_rbt_t*	blobs;	/*!< map of (page_no,heap_no,field_no)
 				to first_blob_page_no; protected by
 				blobs_mutex; @see btr_blob_dbg_t */
 #endif /* UNIV_BLOB_DEBUG */
@@ -501,7 +585,6 @@ a foreign key constraint is enforced, therefore RESTRICT just means no flag */
 #define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32	/*!< ON UPDATE NO ACTION */
 /* @} */
 
-
 /** Data structure for a database table.  Most fields will be
 initialized to 0, NULL or FALSE in dict_mem_table_create(). */
 struct dict_table_struct{
@@ -517,7 +600,8 @@ struct dict_table_struct{
 	unsigned	space:32;
 				/*!< space where the clustered index of the
 				table is placed */
-	unsigned	flags:DICT_TF2_BITS;/*!< DICT_TF_COMPACT, ... */
+	unsigned	flags:DICT_TF_BITS;	/*!< DICT_TF_... */
+	unsigned	flags2:DICT_TF2_BITS;	/*!< DICT_TF2_... */
 	unsigned	ibd_file_missing:1;
 				/*!< TRUE if this is in a single-table
 				tablespace and the .ibd file is missing; then
@@ -532,6 +616,9 @@ struct dict_table_struct{
 				to the dictionary cache */
 	unsigned	n_def:10;/*!< number of columns defined so far */
 	unsigned	n_cols:10;/*!< number of columns */
+	unsigned	can_be_evicted:1;
+				/*!< TRUE if it's not an InnoDB system table
+				or a table that has no FK relationships */
 	unsigned	corrupted:1;
 				/*!< TRUE if table is corrupted */
 	dict_col_t*	cols;	/*!< array of column descriptions */
@@ -555,12 +642,6 @@ struct dict_table_struct{
 				which refer to this table */
 	UT_LIST_NODE_T(dict_table_t)
 			table_LRU; /*!< node of the LRU list of tables */
-	ulint		n_mysql_handles_opened;
-				/*!< count of how many handles MySQL has opened
-				to this table; dropping of the table is
-				NOT allowed until this count gets to zero;
-				MySQL does NOT itself check the number of
-				open handles at drop */
 	unsigned	fk_max_recusive_level:8;
 				/*!< maximum recursive level we support when
 				loading tables chained together with FK
@@ -581,8 +662,6 @@ struct dict_table_struct{
 				with undo logs commits, it sets this
 				to the value of the trx id counter for
 				the tables it had an IX lock on */
-	UT_LIST_BASE_NODE_T(lock_t)
-			locks; /*!< list of locks on the table */
 #ifdef UNIV_DEBUG
 	/*----------------------*/
 	ibool		does_not_fit_in_memory;
@@ -637,8 +716,8 @@ struct dict_table_struct{
 				whether a transaction has locked the AUTOINC
 				lock we keep a pointer to the transaction
 				here in the autoinc_trx variable. This is to
-				avoid acquiring the kernel mutex and scanning
-				the vector in trx_t.
+				avoid acquiring the lock_sys_t::mutex and
+				scanning the vector in trx_t.
 
 				When an AUTOINC lock has to wait, the
 				corresponding lock instance is created on
@@ -662,16 +741,32 @@ struct dict_table_struct{
 				/*!< This counter is used to track the number
 				of granted and pending autoinc locks on this
 				table. This value is set after acquiring the
-				kernel mutex but we peek the contents to
+				lock_sys_t::mutex but we peek the contents to
 				determine whether other transactions have
 				acquired the AUTOINC lock or not. Of course
 				only one transaction can be granted the
 				lock but there can be multiple waiters. */
-	const trx_t*		autoinc_trx;
+	const trx_t*	autoinc_trx;
 				/*!< The transaction that currently holds the
-				the AUTOINC lock on this table. */
+				the AUTOINC lock on this table.
+				Protected by lock_sys->mutex. */
+	fts_t*		fts;	/* FTS specific state variables */
 				/* @} */
 	/*----------------------*/
+	ulint		n_rec_locks;
+				/*!< Count of the number of record locks on
+				this table. We use this to determine whether
+				we can evict the table from the dictionary
+				cache. It is protected by lock_sys->mutex. */
+	ulint		n_ref_count;
+				/*!< count of how many handles are opened
+				to this table; dropping of the table is
+				NOT allowed until this count gets to zero;
+				MySQL does NOT itself check the number of
+				open handles at drop */
+	UT_LIST_BASE_NODE_T(lock_t)
+			locks;	/*!< list of locks on the table; protected
+				by lock_sys->mutex */
 #endif /* !UNIV_HOTBACKUP */
 
 #ifdef UNIV_DEBUG
diff --git a/storage/innobase/include/dict0mem.ic b/storage/innobase/include/dict0mem.ic
index 1d80ffc9b94..38d51f61789 100644
--- a/storage/innobase/include/dict0mem.ic
+++ b/storage/innobase/include/dict0mem.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -57,16 +57,18 @@ dict_mem_fill_index_struct(
 		index->fields = NULL;
 	}
 
-        index->type = type;
+	/* Assign a ulint to a 4-bit-mapped field.
+	Only the low-order 4 bits are assigned. */
+	index->type = type;
 #ifndef UNIV_HOTBACKUP
-        index->space = (unsigned int) space;
-        index->page = FIL_NULL;
+	index->space = (unsigned int) space;
+	index->page = FIL_NULL;
 #endif /* !UNIV_HOTBACKUP */
-        index->table_name = table_name;
-        index->n_fields = (unsigned int) n_fields;
-        /* The '1 +' above prevents allocation
-        of an empty mem block */
+	index->table_name = table_name;
+	index->n_fields = (unsigned int) n_fields;
+	/* The '1 +' above prevents allocation
+	of an empty mem block */
 #ifdef UNIV_DEBUG
-        index->magic_n = DICT_INDEX_MAGIC_N;
+	index->magic_n = DICT_INDEX_MAGIC_N;
 #endif /* UNIV_DEBUG */
 }
diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h
new file mode 100644
index 00000000000..69eeb835885
--- /dev/null
+++ b/storage/innobase/include/dict0priv.h
@@ -0,0 +1,61 @@
+/*****************************************************************************
+
+Copyright (c) 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0priv.h
+Data dictionary private functions
+
+Created  Fri 2 Jul 2010 13:30:38 EST - Sunny Bains
+*******************************************************/
+
+#ifndef dict0priv_h
+#define dict0priv_h
+
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function. Note: Not to be called from outside dict0*c functions.
+@return	table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+	const char*	table_name);		/*!< in: table name */
+
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return	table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+	const char*	table_name);		/*!< in: table name */
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return	table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_open_on_id_low(
+/*=====================*/
+	table_id_t	table_id);		/*!< in: table id */
+
+#ifndef UNIV_NONINL
+#include "dict0priv.ic"
+#endif
+
+#endif /* dict0priv.h */
diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic
new file mode 100644
index 00000000000..e15fbc65a63
--- /dev/null
+++ b/storage/innobase/include/dict0priv.ic
@@ -0,0 +1,123 @@
+/*****************************************************************************
+
+Copyright (c) 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0priv.ic
+Data dictionary system private include file
+
+Created  Wed 13 Oct 2010 16:10:14 EST Sunny Bains
+***********************************************************************/
+
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "dict0priv.h"
+#ifndef UNIV_HOTBACKUP
+
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function.
+@return	table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+	const char*	table_name)	/*!< in: table name */
+{
+	dict_table_t*	table;
+
+	ut_ad(table_name);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	table = dict_table_check_if_in_cache_low(table_name);
+
+	if (table && table->corrupted) {
+		fprintf(stderr, "InnoDB: table");
+		ut_print_name(stderr, NULL, TRUE, table->name);
+		if (srv_load_corrupted) {
+			fputs(" is corrupted, but"
+			      " innodb_force_load_corrupted is set\n", stderr);
+		} else {
+			fputs(" is corrupted\n", stderr);
+			return(NULL);
+		}
+	}
+
+	if (table == NULL) {
+		table = dict_load_table(table_name, TRUE, DICT_ERR_IGNORE_NONE);
+	}
+
+	ut_ad(!table || table->cached);
+
+	return(table);
+}
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return	table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_open_on_id_low(
+/*======================*/
+	table_id_t	table_id)	/*!< in: table id */
+{
+	dict_table_t*	table;
+	ulint		fold;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* Look for the table name in the hash table */
+	fold = ut_fold_ull(table_id);
+
+	HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
+		    dict_table_t*, table, ut_ad(table->cached),
+		    table->id == table_id);
+	if (table == NULL) {
+		table = dict_load_table_on_id(table_id);
+	}
+
+	ut_ad(!table || table->cached);
+
+	/* TODO: should get the type information from MySQL */
+
+	return(table);
+}
+
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return	table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+	const char*	table_name)	/*!< in: table name */
+{
+	dict_table_t*	table;
+	ulint		table_fold;
+
+	ut_ad(table_name);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* Look for the table name in the hash table */
+	table_fold = ut_fold_string(table_name);
+
+	HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold,
+		    dict_table_t*, table, ut_ad(table->cached),
+		    !strcmp(table->name, table_name));
+	return(table);
+}
+#endif /*! UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
new file mode 100644
index 00000000000..879e67a0918
--- /dev/null
+++ b/storage/innobase/include/dict0stats.h
@@ -0,0 +1,108 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.h
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_h
+#define dict0stats_h
+
+#include "univ.i"
+
+#include "db0err.h"
+#include "dict0types.h"
+#include "trx0types.h"
+
+enum dict_stats_upd_option {
+	DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the
+				statistics using a precise and slow
+				algo and save them to the persistent
+				storage, if the persistent storage is
+				not present then emit a warning and
+				fall back to transient stats */
+	DICT_STATS_RECALC_PERSISTENT_SILENT,/* same as
+				DICT_STATS_RECALC_PERSISTENT
+				but do not emit a warning */
+	DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics
+				using an imprecise quick algo
+				without saving the results
+				persistently */
+	DICT_STATS_FETCH,	/* fetch the statistics from the
+				persistent storage */
+	DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* only fetch the stats
+				from the persistent storage if the in-memory
+				structures have not been initialized yet,
+				otherwise do nothing */
+};
+
+typedef enum dict_stats_upd_option	dict_stats_upd_option_t;
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+@return DB_* error code or DB_SUCCESS */
+UNIV_INTERN
+enum db_err
+dict_stats_update(
+/*==============*/
+	dict_table_t*		table,	/*!< in/out: table */
+	dict_stats_upd_option_t	stats_upd_option,
+					/*!< in: whether to (re) calc
+					the stats or to fetch them from
+					the persistent storage */
+	ibool			caller_has_dict_sys_mutex);
+					/*!< in: TRUE if the caller
+					owns dict_sys->mutex */
+
+/*********************************************************************//**
+Removes the information for a particular index's stats from the persistent
+storage if it exists and if there is data stored for this index.
+The transaction is not committed, it must not be committed in this
+function because this is the user trx that is running DROP INDEX.
+The transaction will be committed at the very end when dropping an
+index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+enum db_err
+dict_stats_delete_index_stats(
+/*==========================*/
+	dict_index_t*	index,	/*!< in: index */
+	trx_t*		trx,	/*!< in: transaction to use */
+	char*		errstr, /*!< out: error message if != DB_SUCCESS
+				is returned */
+	ulint		errstr_sz);/*!< in: size of the errstr buffer */
+
+/*********************************************************************//**
+Removes the statistics for a table and all of its indexes from the
+persistent storage if it exists and if there is data stored for the table.
+This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+enum db_err
+dict_stats_delete_table_stats(
+/*==========================*/
+	const char*	table_name,	/*!< in: table name */
+	char*		errstr,		/*!< out: error message
+					if != DB_SUCCESS is returned */
+	ulint		errstr_sz);	/*!< in: size of errstr buffer */
+
+#endif /* dict0stats_h */
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
index f0a05a38070..cd2863582c1 100644
--- a/storage/innobase/include/dict0types.h
+++ b/storage/innobase/include/dict0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -40,6 +40,10 @@ typedef struct tab_node_struct		tab_node_t;
 #define	DICT_HDR_SPACE		0	/* the SYSTEM tablespace */
 #define	DICT_HDR_PAGE_NO	FSP_DICT_HDR_PAGE_NO
 
+/* The ibuf table and indexes's ID are assigned as the number
+DICT_IBUF_ID_MIN plus the space id */
+#define DICT_IBUF_ID_MIN	0xFFFFFFFF00000000ULL
+
 typedef ib_id_t		table_id_t;
 typedef ib_id_t		index_id_t;
 
@@ -49,11 +53,11 @@ be responsible to deal with corrupted table or index.
 Note: please define the IGNORE_ERR_* as bits, so their value can
 be or-ed together */
 enum dict_err_ignore {
-        DICT_ERR_IGNORE_NONE = 0,        /*!< no error to ignore */
-        DICT_ERR_IGNORE_INDEX_ROOT = 1, /*!< ignore error if index root
+	DICT_ERR_IGNORE_NONE = 0,	/*!< no error to ignore */
+	DICT_ERR_IGNORE_INDEX_ROOT = 1,	/*!< ignore error if index root
 					page is FIL_NULL or incorrect value */
 	DICT_ERR_IGNORE_CORRUPT = 2,	/*!< skip corrupted indexes */
-        DICT_ERR_IGNORE_ALL = 0xFFFF	/*!< ignore all errors */
+	DICT_ERR_IGNORE_ALL = 0xFFFF	/*!< ignore all errors */
 };
 
 typedef enum dict_err_ignore		dict_err_ignore_t;
diff --git a/storage/innobase/include/dyn0dyn.h b/storage/innobase/include/dyn0dyn.h
index 121a5946ac7..5e69cb13122 100644
--- a/storage/innobase/include/dyn0dyn.h
+++ b/storage/innobase/include/dyn0dyn.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/dyn0dyn.ic b/storage/innobase/include/dyn0dyn.ic
index 110e674abff..b86697d6865 100644
--- a/storage/innobase/include/dyn0dyn.ic
+++ b/storage/innobase/include/dyn0dyn.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/eval0eval.h b/storage/innobase/include/eval0eval.h
index 60aefd8d453..e3b1e6c16b6 100644
--- a/storage/innobase/include/eval0eval.h
+++ b/storage/innobase/include/eval0eval.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/eval0eval.ic b/storage/innobase/include/eval0eval.ic
index fe767f39b00..e4b1dd08017 100644
--- a/storage/innobase/include/eval0eval.ic
+++ b/storage/innobase/include/eval0eval.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -70,7 +70,7 @@ eval_node_ensure_val_buf(
 	dfield = que_node_get_val(node);
 	dfield_set_len(dfield, size);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (!data || que_node_get_val_buf_size(node) < size) {
 
@@ -110,12 +110,12 @@ eval_exp(
 {
 	if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) {
 
-		eval_sym((sym_node_t*)exp_node);
+		eval_sym((sym_node_t*) exp_node);
 
 		return;
 	}
 
-	eval_func(exp_node);
+	eval_func(static_cast<func_node_t*>(exp_node));
 }
 
 /*****************************************************************//**
@@ -132,7 +132,7 @@ eval_node_set_int_val(
 
 	dfield = que_node_get_val(node);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (data == NULL) {
 		data = eval_node_alloc_val_buf(node, 4);
@@ -140,7 +140,7 @@ eval_node_set_int_val(
 
 	ut_ad(dfield_get_len(dfield) == 4);
 
-	mach_write_to_4(data, (ulint)val);
+	mach_write_to_4(data, (ulint) val);
 }
 
 /*****************************************************************//**
@@ -152,13 +152,15 @@ eval_node_get_int_val(
 /*==================*/
 	que_node_t*	node)	/*!< in: expression node */
 {
+	const byte*	ptr;
 	dfield_t*	dfield;
 
 	dfield = que_node_get_val(node);
+	ptr = static_cast<byte*>(dfield_get_data(dfield));
 
 	ut_ad(dfield_get_len(dfield) == 4);
 
-	return((int)mach_read_from_4(dfield_get_data(dfield)));
+	return((int) mach_read_from_4(ptr));
 }
 
 /*****************************************************************//**
@@ -175,7 +177,7 @@ eval_node_get_ibool_val(
 
 	dfield = que_node_get_val(node);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	ut_ad(data != NULL);
 
@@ -196,7 +198,7 @@ eval_node_set_ibool_val(
 
 	dfield = que_node_get_val(func_node);
 
-	data = dfield_get_data(dfield);
+	data = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (data == NULL) {
 		/* Allocate 1 byte to hold the value */
@@ -246,6 +248,8 @@ eval_node_copy_val(
 
 	dfield2 = que_node_get_val(node2);
 
-	eval_node_copy_and_alloc_val(node1, dfield_get_data(dfield2),
-				     dfield_get_len(dfield2));
+	eval_node_copy_and_alloc_val(
+		node1,
+		static_cast<byte*>(dfield_get_data(dfield2)),
+		dfield_get_len(dfield2));
 }
diff --git a/storage/innobase/include/eval0proc.h b/storage/innobase/include/eval0proc.h
index 13e2e365320..7755fb10343 100644
--- a/storage/innobase/include/eval0proc.h
+++ b/storage/innobase/include/eval0proc.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/eval0proc.ic b/storage/innobase/include/eval0proc.ic
index c602af0a694..81418bae2c9 100644
--- a/storage/innobase/include/eval0proc.ic
+++ b/storage/innobase/include/eval0proc.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1998, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -40,7 +40,7 @@ proc_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<proc_node_t*>(thr->run_node);
 	ut_ad(que_node_get_type(node) == QUE_NODE_PROC);
 
 	if (thr->prev_node == que_node_get_parent(node)) {
@@ -75,7 +75,7 @@ proc_eval_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<func_node_t*>(thr->run_node);
 	ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
 
 	/* Evaluate the procedure */
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 610bd4b0e5c..fa632ea3c6b 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,12 +27,16 @@ Created 10/25/1995 Heikki Tuuri
 #define fil0fil_h
 
 #include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
 #include "dict0types.h"
 #include "ut0byte.h"
 #include "os0file.h"
 #ifndef UNIV_HOTBACKUP
 #include "sync0rw.h"
 #include "ibuf0types.h"
+#include "log0log.h"
 #endif /* !UNIV_HOTBACKUP */
 
 /** When mysqld is run, the default directory "." is the mysqld datadir,
@@ -70,6 +74,8 @@ struct fil_addr_struct{
 /** The null file address */
 extern fil_addr_t	fil_addr_null;
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 /** The byte offsets on a file page for various variables @{ */
 #define FIL_PAGE_SPACE_OR_CHKSUM 0	/*!< in < MySQL-4.0.14 space id the
 					page belongs to (== 0) but in later
@@ -127,6 +133,8 @@ extern fil_addr_t	fil_addr_null;
 #define FIL_PAGE_DATA_END	8	/*!< size of the page trailer */
 /* @} */
 
+#ifndef UNIV_INNOCHECKSUM
+
 /** File page types (values of FIL_PAGE_TYPE) @{ */
 #define FIL_PAGE_INDEX		17855	/*!< B-tree node */
 #define FIL_PAGE_UNDO_LOG	2	/*!< Undo log page */
@@ -142,6 +150,8 @@ extern fil_addr_t	fil_addr_null;
 #define FIL_PAGE_TYPE_BLOB	10	/*!< Uncompressed BLOB page */
 #define FIL_PAGE_TYPE_ZBLOB	11	/*!< First compressed BLOB page */
 #define FIL_PAGE_TYPE_ZBLOB2	12	/*!< Subsequent compressed BLOB page */
+#define FIL_PAGE_TYPE_LAST	FIL_PAGE_TYPE_ZBLOB2
+					/*!< Last page type */
 /* @} */
 
 /** Space types @{ */
@@ -157,6 +167,8 @@ extern ulint	fil_n_pending_log_flushes;
 /** Number of pending tablespace flushes */
 extern ulint	fil_n_pending_tablespace_flushes;
 
+/** Number of files currently open */
+extern ulint	fil_n_file_opened;
 
 #ifndef UNIV_HOTBACKUP
 /*******************************************************************//**
@@ -213,8 +225,8 @@ fil_space_truncate_start(
 				some initial files in the space */
 #endif /* UNIV_LOG_ARCHIVE */
 /*******************************************************************//**
-Creates a space memory object and puts it to the 'fil system' hash table. If
-there is an error, prints an error message to the .err log.
+Creates a space memory object and puts it to the 'fil system' hash table.
+If there is an error, prints an error message to the .err log.
 @return	TRUE if success */
 UNIV_INTERN
 ibool
@@ -320,12 +332,11 @@ UNIV_INTERN
 ulint
 fil_write_flushed_lsn_to_data_files(
 /*================================*/
-	ib_uint64_t	lsn,		/*!< in: lsn to write */
-	ulint		arch_log_no);	/*!< in: latest archived log
-					file number */
+	lsn_t	lsn,		/*!< in: lsn to write */
+	ulint	arch_log_no);	/*!< in: latest archived log file number */
 /*******************************************************************//**
-Reads the flushed lsn and arch no fields from a data file at database
-startup. */
+Reads the flushed lsn, arch no, and tablespace flag fields from a data
+file at database startup. */
 UNIV_INTERN
 void
 fil_read_first_page(
@@ -341,24 +352,25 @@ fil_read_first_page(
 	ulint*		max_arch_log_no,	/*!< out: max of archived
 						log numbers in data files */
 #endif /* UNIV_LOG_ARCHIVE */
-	ib_uint64_t*	min_flushed_lsn,	/*!< out: min of flushed
+	lsn_t*		min_flushed_lsn,	/*!< out: min of flushed
 						lsn values in data files */
-	ib_uint64_t*	max_flushed_lsn);	/*!< out: max of flushed
+	lsn_t*		max_flushed_lsn);	/*!< out: max of flushed
 						lsn values in data files */
 /*******************************************************************//**
-Increments the count of pending operation, if space is not being deleted.
-@return	TRUE if being deleted, and operation should be skipped */
+Increments the count of pending insert buffer page merges, if space is not
+being deleted.
+@return	TRUE if being deleted, and ibuf merges should be skipped */
 UNIV_INTERN
 ibool
-fil_inc_pending_ops(
-/*================*/
+fil_inc_pending_ibuf_merges(
+/*========================*/
 	ulint	id);	/*!< in: space id */
 /*******************************************************************//**
-Decrements the count of pending operations. */
+Decrements the count of pending insert buffer page merges. */
 UNIV_INTERN
 void
-fil_decr_pending_ops(
-/*=================*/
+fil_decr_pending_ibuf_merges(
+/*=========================*/
 	ulint	id);	/*!< in: space id */
 #endif /* !UNIV_HOTBACKUP */
 /*******************************************************************//**
@@ -397,9 +409,7 @@ UNIV_INTERN
 ibool
 fil_delete_tablespace(
 /*==================*/
-	ulint	id,		/*!< in: space id */
-	ibool	evict_all);	/*!< in: TRUE if we want all pages
-				evicted from LRU. */
+	ulint	id);	/*!< in: space id */
 #ifndef UNIV_HOTBACKUP
 /*******************************************************************//**
 Discards a single-table tablespace. The tablespace must be cached in the
@@ -424,7 +434,7 @@ UNIV_INTERN
 ibool
 fil_rename_tablespace(
 /*==================*/
-	const char*	old_name,	/*!< in: old table name in the standard
+	const char*	old_name_in,	/*!< in: old table name in the standard
 					databasename/tablename format of
 					InnoDB, or NULL if we do the rename
 					based on the space id only */
@@ -452,6 +462,7 @@ fil_create_new_single_table_tablespace(
 	ibool		is_temp,	/*!< in: TRUE if a table created with
 					CREATE TEMPORARY TABLE */
 	ulint		flags,		/*!< in: tablespace flags */
+	ulint		flags2,		/*!< in: table flags2 */
 	ulint		size);		/*!< in: the initial size of the
 					tablespace file in pages,
 					must be >= FIL_IBD_FILE_INITIAL_SIZE */
@@ -497,7 +508,7 @@ fil_reset_too_high_lsns(
 /*====================*/
 	const char*	name,		/*!< in: table name in the
 					databasename/tablename format */
-	ib_uint64_t	current_lsn);	/*!< in: reset lsn's if the lsn stamped
+	lsn_t		current_lsn);	/*!< in: reset lsn's if the lsn stamped
 					to FIL_PAGE_FILE_FLUSH_LSN in the
 					first page is too high */
 #endif /* !UNIV_HOTBACKUP */
@@ -516,7 +527,7 @@ fil_load_single_table_tablespaces(void);
 /*******************************************************************//**
 Returns TRUE if a single-table tablespace does not exist in the memory cache,
 or is being deleted there.
-@return	TRUE if does not exist or is being\ deleted */
+@return	TRUE if does not exist or is being deleted */
 UNIV_INTERN
 ibool
 fil_tablespace_deleted_or_being_deleted_in_mem(
@@ -545,10 +556,7 @@ fil_space_for_table_exists_in_mem(
 /*==============================*/
 	ulint		id,		/*!< in: space id */
 	const char*	name,		/*!< in: table name in the standard
-					'databasename/tablename' format or
-					the dir path to a temp table */
-	ibool		is_temp,	/*!< in: TRUE if created with CREATE
-					TEMPORARY TABLE */
+					'databasename/tablename' format */
 	ibool		mark_space,	/*!< in: in crash recovery, at database
 					startup we mark all spaces which have
 					an associated table in the InnoDB
@@ -649,7 +657,7 @@ fil_io(
 /**********************************************************************//**
 Waits for an aio operation to complete. This function is used to write the
 handler for completed requests. The aio array of pending requests is divided
-into segments (see os0file.c for more info). The thread specifies which
+into segments (see os0file.cc for more info). The thread specifies which
 segment it wants to wait for. */
 UNIV_INTERN
 void
@@ -734,4 +742,6 @@ fil_tablespace_is_being_deleted(
 
 typedef	struct fil_space_struct	fil_space_t;
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 #endif
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
index f07e3decc66..994783c2db9 100644
--- a/storage/innobase/include/fsp0fsp.h
+++ b/storage/innobase/include/fsp0fsp.h
@@ -28,26 +28,93 @@ Created 12/18/1995 Heikki Tuuri
 
 #include "univ.i"
 
+#ifndef UNIV_INNOCHECKSUM
+
 #include "mtr0mtr.h"
 #include "fut0lst.h"
 #include "ut0byte.h"
 #include "page0types.h"
 #include "fsp0types.h"
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 /* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */
 
+/** Width of the POST_ANTELOPE flag */
+#define FSP_FLAGS_WIDTH_POST_ANTELOPE	1
+/** Number of flag bits used to indicate the tablespace zip page size */
+#define FSP_FLAGS_WIDTH_ZIP_SSIZE	4
+/** Width of the ATOMIC_BLOBS flag.  The ability to break up a long
+column into an in-record prefix and an externally stored part is available
+to the two Barracuda row formats COMPRESSED and DYNAMIC. */
+#define FSP_FLAGS_WIDTH_ATOMIC_BLOBS	1
 /** Number of flag bits used to indicate the tablespace page size */
 #define FSP_FLAGS_WIDTH_PAGE_SSIZE	4
+/** Width of all the currently known tablespace flags */
+#define FSP_FLAGS_WIDTH		(FSP_FLAGS_WIDTH_POST_ANTELOPE	\
+				+ FSP_FLAGS_WIDTH_ZIP_SSIZE	\
+				+ FSP_FLAGS_WIDTH_ATOMIC_BLOBS	\
+				+ FSP_FLAGS_WIDTH_PAGE_SSIZE)
+
+/** A mask of all the known/used bits in tablespace flags */
+#define FSP_FLAGS_MASK		(~(~0 << FSP_FLAGS_WIDTH))
+
+/** Zero relative shift position of the POST_ANTELOPE field */
+#define FSP_FLAGS_POS_POST_ANTELOPE	0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define FSP_FLAGS_POS_ZIP_SSIZE		(FSP_FLAGS_POS_POST_ANTELOPE	\
+					+ FSP_FLAGS_WIDTH_POST_ANTELOPE)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_POS_ATOMIC_BLOBS	(FSP_FLAGS_POS_ZIP_SSIZE	\
+					+ FSP_FLAGS_WIDTH_ZIP_SSIZE)
 /** Zero relative shift position of the PAGE_SSIZE field */
-#define FSP_FLAGS_POS_PAGE_SSIZE	6
+#define FSP_FLAGS_POS_PAGE_SSIZE	(FSP_FLAGS_POS_ATOMIC_BLOBS	\
+					+ FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the start of the UNUSED bits */
+#define FSP_FLAGS_POS_UNUSED		(FSP_FLAGS_POS_PAGE_SSIZE	\
+					+ FSP_FLAGS_WIDTH_PAGE_SSIZE)
+
+/** Bit mask of the POST_ANTELOPE field */
+#define FSP_FLAGS_MASK_POST_ANTELOPE				\
+		((~(~0 << FSP_FLAGS_WIDTH_POST_ANTELOPE))	\
+		<< FSP_FLAGS_POS_POST_ANTELOPE)
+/** Bit mask of the ZIP_SSIZE field */
+#define FSP_FLAGS_MASK_ZIP_SSIZE				\
+		((~(~0 << FSP_FLAGS_WIDTH_ZIP_SSIZE))		\
+		<< FSP_FLAGS_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_MASK_ATOMIC_BLOBS				\
+		((~(~0 << FSP_FLAGS_WIDTH_ATOMIC_BLOBS))	\
+		<< FSP_FLAGS_POS_ATOMIC_BLOBS)
 /** Bit mask of the PAGE_SSIZE field */
 #define FSP_FLAGS_MASK_PAGE_SSIZE				\
 		((~(~0 << FSP_FLAGS_WIDTH_PAGE_SSIZE))		\
 		<< FSP_FLAGS_POS_PAGE_SSIZE)
+
+/** Return the value of the POST_ANTELOPE field */
+#define FSP_FLAGS_GET_POST_ANTELOPE(flags)			\
+		((flags & FSP_FLAGS_MASK_POST_ANTELOPE)		\
+		>> FSP_FLAGS_POS_POST_ANTELOPE)
+/** Return the value of the ZIP_SSIZE field */
+#define FSP_FLAGS_GET_ZIP_SSIZE(flags)				\
+		((flags & FSP_FLAGS_MASK_ZIP_SSIZE)		\
+		>> FSP_FLAGS_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_HAS_ATOMIC_BLOBS(flags)			\
+		((flags & FSP_FLAGS_MASK_ATOMIC_BLOBS)		\
+		>> FSP_FLAGS_POS_ATOMIC_BLOBS)
 /** Return the value of the PAGE_SSIZE field */
 #define FSP_FLAGS_GET_PAGE_SSIZE(flags)				\
 		((flags & FSP_FLAGS_MASK_PAGE_SSIZE)		\
 		>> FSP_FLAGS_POS_PAGE_SSIZE)
+/** Return the contents of the UNUSED bits */
+#define FSP_FLAGS_GET_UNUSED(flags)				\
+		(flags >> FSP_FLAGS_POS_UNUSED)
+
+/** Set a PAGE_SSIZE into the correct bits in a given
+tablespace flags. */
+#define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize)			\
+		(flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE))
 
 /* @} */
 
@@ -116,6 +183,142 @@ descriptor page, but used only in the first. */
 					FSP_FREE_LIMIT at a time */
 /* @} */
 
+#ifndef UNIV_INNOCHECKSUM
+
+/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */
+
+/*			FILE SEGMENT INODE
+			==================
+
+Segment inode which is created for each segment in a tablespace. NOTE: in
+purge we assume that a segment having only one currently used page can be
+freed in a few steps, so that the freeing cannot fill the file buffer with
+bufferfixed file pages. */
+
+typedef	byte	fseg_inode_t;
+
+#define FSEG_INODE_PAGE_NODE	FSEG_PAGE_DATA
+					/* the list node for linking
+					segment inode pages */
+
+#define FSEG_ARR_OFFSET		(FSEG_PAGE_DATA + FLST_NODE_SIZE)
+/*-------------------------------------*/
+#define	FSEG_ID			0	/* 8 bytes of segment id: if this is 0,
+					it means that the header is unused */
+#define FSEG_NOT_FULL_N_USED	8
+					/* number of used segment pages in
+					the FSEG_NOT_FULL list */
+#define	FSEG_FREE		12
+					/* list of free extents of this
+					segment */
+#define	FSEG_NOT_FULL		(12 + FLST_BASE_NODE_SIZE)
+					/* list of partially free extents */
+#define	FSEG_FULL		(12 + 2 * FLST_BASE_NODE_SIZE)
+					/* list of full extents */
+#define	FSEG_MAGIC_N		(12 + 3 * FLST_BASE_NODE_SIZE)
+					/* magic number used in debugging */
+#define	FSEG_FRAG_ARR		(16 + 3 * FLST_BASE_NODE_SIZE)
+					/* array of individual pages
+					belonging to this segment in fsp
+					fragment extent lists */
+#define FSEG_FRAG_ARR_N_SLOTS	(FSP_EXTENT_SIZE / 2)
+					/* number of slots in the array for
+					the fragment pages */
+#define	FSEG_FRAG_SLOT_SIZE	4	/* a fragment page slot contains its
+					page number within space, FIL_NULL
+					means that the slot is not in use */
+/*-------------------------------------*/
+#define FSEG_INODE_SIZE					\
+	(16 + 3 * FLST_BASE_NODE_SIZE			\
+	 + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
+
+#define FSP_SEG_INODES_PER_PAGE(zip_size)		\
+	(((zip_size ? zip_size : UNIV_PAGE_SIZE)	\
+	  - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE)
+				/* Number of segment inodes which fit on a
+				single page */
+
+#define FSEG_MAGIC_N_VALUE	97937874
+
+#define	FSEG_FILLFACTOR		8	/* If this value is x, then if
+					the number of unused but reserved
+					pages in a segment is less than
+					reserved pages * 1/x, and there are
+					at least FSEG_FRAG_LIMIT used pages,
+					then we allow a new empty extent to
+					be added to the segment in
+					fseg_alloc_free_page. Otherwise, we
+					use unused pages of the segment. */
+
+#define FSEG_FRAG_LIMIT		FSEG_FRAG_ARR_N_SLOTS
+					/* If the segment has >= this many
+					used pages, it may be expanded by
+					allocating extents to the segment;
+					until that only individual fragment
+					pages are allocated from the space */
+
+#define	FSEG_FREE_LIST_LIMIT	40	/* If the reserved size of a segment
+					is at least this many extents, we
+					allow extents to be put to the free
+					list of the extent: at most
+					FSEG_FREE_LIST_MAX_LEN many */
+#define	FSEG_FREE_LIST_MAX_LEN	4
+/* @} */
+
+/* @defgroup Extent Descriptor Constants (moved from fsp0fsp.c) @{ */
+
+/*			EXTENT DESCRIPTOR
+			=================
+
+File extent descriptor data structure: contains bits to tell which pages in
+the extent are free and which contain old tuple version to clean. */
+
+/*-------------------------------------*/
+#define	XDES_ID			0	/* The identifier of the segment
+					to which this extent belongs */
+#define XDES_FLST_NODE		8	/* The list node data structure
+					for the descriptors */
+#define	XDES_STATE		(FLST_NODE_SIZE + 8)
+					/* contains state information
+					of the extent */
+#define	XDES_BITMAP		(FLST_NODE_SIZE + 12)
+					/* Descriptor bitmap of the pages
+					in the extent */
+/*-------------------------------------*/
+
+#define	XDES_BITS_PER_PAGE	2	/* How many bits are there per page */
+#define	XDES_FREE_BIT		0	/* Index of the bit which tells if
+					the page is free */
+#define	XDES_CLEAN_BIT		1	/* NOTE: currently not used!
+					Index of the bit which tells if
+					there are old versions of tuples
+					on the page */
+/* States of a descriptor */
+#define	XDES_FREE		1	/* extent is in free list of space */
+#define	XDES_FREE_FRAG		2	/* extent is in free fragment list of
+					space */
+#define	XDES_FULL_FRAG		3	/* extent is in full fragment list of
+					space */
+#define	XDES_FSEG		4	/* extent belongs to a segment */
+
+/** File extent data structure size in bytes. */
+#define	XDES_SIZE							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MAX page size. */
+#define	XDES_SIZE_MAX							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MAX * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MIN page size. */
+#define	XDES_SIZE_MIN							\
+	(XDES_BITMAP							\
+	+ UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MIN * XDES_BITS_PER_PAGE))
+
+/** Offset of the descriptor array on a descriptor page */
+#define	XDES_ARR_OFFSET		(FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+
 /* @} */
 
 /**********************************************************************//**
@@ -125,16 +328,6 @@ void
 fsp_init(void);
 /*==========*/
 /**********************************************************************//**
-Gets the current free limit of the system tablespace.  The free limit
-means the place of the first page which has never been put to the
-free list for allocation.  The space above that address is initialized
-to zero.  Sets also the global variable log_fsp_current_free_limit.
-@return	free limit in megabytes */
-UNIV_INTERN
-ulint
-fsp_header_get_free_limit(void);
-/*===========================*/
-/**********************************************************************//**
 Gets the size of the system tablespace from the tablespace header.  If
 we do not have an auto-extending data file, this should be equal to
 the size of the data files.  If there is an auto-extending data file,
@@ -177,9 +370,9 @@ fsp_header_get_zip_size(
 /*====================*/
 	const page_t*	page);	/*!< in: first page of a tablespace */
 /**********************************************************************//**
-Writes the space id and compressed page size to a tablespace header.
-This function is used past the buffer pool when we in fil0fil.c create
-a new single-table tablespace. */
+Writes the space id and flags to a tablespace header.  The flags contain
+row type, physical/compressed page size, and logical/uncompressed page
+size of the tablespace. */
 UNIV_INTERN
 void
 fsp_header_init_fields(
@@ -197,16 +390,16 @@ fsp_header_init(
 /*============*/
 	ulint	space,		/*!< in: space id */
 	ulint	size,		/*!< in: current size in blocks */
-	mtr_t*	mtr);		/*!< in: mini-transaction handle */
+	mtr_t*	mtr);		/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Increases the space size field of a space. */
 UNIV_INTERN
 void
 fsp_header_inc_size(
 /*================*/
-	ulint	space,	/*!< in: space id */
-	ulint	size_inc,/*!< in: size increment in pages */
-	mtr_t*	mtr);	/*!< in: mini-transaction handle */
+	ulint	space,		/*!< in: space id */
+	ulint	size_inc,	/*!< in: size increment in pages */
+	mtr_t*	mtr);		/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Creates a new segment.
 @return the block where the segment header is placed, x-latched, NULL
@@ -222,7 +415,7 @@ fseg_create(
 			will belong to the created segment */
 	ulint	byte_offset, /*!< in: byte offset of the created segment header
 			on the page */
-	mtr_t*	mtr);	/*!< in: mtr */
+	mtr_t*	mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Creates a new segment.
 @return the block where the segment header is placed, x-latched, NULL
@@ -244,7 +437,7 @@ fseg_create_general(
 			the inode and the other for the segment) then there is
 			no need to do the check for this individual
 			operation */
-	mtr_t*	mtr);	/*!< in: mtr */
+	mtr_t*	mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Calculates the number of pages reserved by a segment, and how many pages are
 currently used.
@@ -255,7 +448,7 @@ fseg_n_reserved_pages(
 /*==================*/
 	fseg_header_t*	header,	/*!< in: segment header */
 	ulint*		used,	/*!< out: number of pages used (<= reserved) */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize
@@ -339,7 +532,7 @@ fsp_reserve_free_extents(
 	ulint	space,	/*!< in: space id */
 	ulint	n_ext,	/*!< in: number of extents to reserve */
 	ulint	alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
-	mtr_t*	mtr);	/*!< in: mtr */
+	mtr_t*	mtr);	/*!< in: mini-transaction */
 /**********************************************************************//**
 This function should be used to get information on how much we still
 will be able to insert new data to the database without running out the
@@ -360,7 +553,7 @@ fseg_free_page(
 	fseg_header_t*	seg_header, /*!< in: segment header */
 	ulint		space,	/*!< in: space id */
 	ulint		page,	/*!< in: page offset */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Frees part of a segment. This function can be used to free a segment
 by repeatedly calling this function in different mini-transactions.
@@ -375,7 +568,7 @@ fseg_free_step(
 				resides on the first page of the frag list
 				of the segment, this pointer becomes obsolete
 				after the last freeing step */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /**********************************************************************//**
 Frees part of a segment. Differs from fseg_free_step because this function
 leaves the header page unfreed.
@@ -386,7 +579,7 @@ fseg_free_step_not_header(
 /*======================*/
 	fseg_header_t*	header,	/*!< in: segment header which must reside on
 				the first fragment page of the segment */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 /***********************************************************************//**
 Checks if a page address is an extent descriptor page address.
 @return	TRUE if a descriptor page */
@@ -431,7 +624,7 @@ ibool
 fseg_validate(
 /*==========*/
 	fseg_header_t*	header, /*!< in: segment header */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 #endif /* UNIV_DEBUG */
 #ifdef UNIV_BTR_PRINT
 /*******************************************************************//**
@@ -441,14 +634,44 @@ void
 fseg_print(
 /*=======*/
 	fseg_header_t*	header, /*!< in: segment header */
-	mtr_t*		mtr);	/*!< in: mtr */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
 #endif /* UNIV_BTR_PRINT */
 
 /********************************************************************//**
+Validate and return the tablespace flags, which are stored in the
+tablespace header at offset FSP_SPACE_FLAGS.  They should be 0 for
+ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. The newer row formats,
+COMPRESSED and DYNAMIC, use a file format > Antelope so they should
+have a file format number plus the DICT_TF_COMPACT bit set.
+@return	ulint containing the validated tablespace flags. */
+UNIV_INLINE
+ulint
+fsp_flags_validate(
+/*===============*/
+	ulint	flags);		/*!< in: tablespace flags */
+/********************************************************************//**
+Determine if the tablespace is compressed from dict_table_t::flags.
+@return	TRUE if compressed, FALSE if not compressed */
+UNIV_INLINE
+ibool
+fsp_flags_is_compressed(
+/*====================*/
+	ulint	flags);	/*!< in: tablespace flags */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Extract the zip size from tablespace flags.  A tablespace has only one
+physical page size whether that page is compressed or not.
+@return	compressed page size of the file-per-table tablespace in bytes,
+or zero if the table is not compressed.  */
+UNIV_INLINE
+ulint
+fsp_flags_get_zip_size(
+/*====================*/
+	ulint	flags);	/*!< in: tablespace flags */
+/********************************************************************//**
 Extract the page size from tablespace flags.
-This feature, storing the page_ssize into the tablespace flags, is added
-to InnoDB 5.6.4.  This is here only to protect against a crash if a newer
-database is opened with this code branch.
 @return	page size of the tablespace in bytes */
 UNIV_INLINE
 ulint
@@ -456,6 +679,15 @@ fsp_flags_get_page_size(
 /*====================*/
 	ulint	flags);	/*!< in: tablespace flags */
 
+/********************************************************************//**
+Set page size */
+UNIV_INLINE
+ulint
+fsp_flags_set_page_size(
+/*====================*/
+	ulint	flags,		/*!< in: tablespace flags */
+	ulint	page_size);	/*!< in: page size in bytes */
+
 #ifndef UNIV_NONINL
 #include "fsp0fsp.ic"
 #endif
diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic
index c92111a9d89..498f9000888 100644
--- a/storage/innobase/include/fsp0fsp.ic
+++ b/storage/innobase/include/fsp0fsp.ic
@@ -23,6 +23,8 @@ File space management
 Created 12/18/1995 Heikki Tuuri
 *******************************************************/
 
+#ifndef UNIV_INNOCHECKSUM
+
 /***********************************************************************//**
 Checks if a page address is an extent descriptor page address.
 @return	TRUE if a descriptor page */
@@ -37,17 +39,111 @@ fsp_descr_page(
 	ut_ad(ut_is_2pow(zip_size));
 
 	if (!zip_size) {
-		return(UNIV_UNLIKELY((page_no & (UNIV_PAGE_SIZE - 1))
-				     == FSP_XDES_OFFSET));
+		return((page_no & (UNIV_PAGE_SIZE - 1)) == FSP_XDES_OFFSET);
 	}
 
-	return(UNIV_UNLIKELY((page_no & (zip_size - 1)) == FSP_XDES_OFFSET));
+	return((page_no & (zip_size - 1)) == FSP_XDES_OFFSET);
 }
+
+/********************************************************************//**
+Validate and return the tablespace flags, which are stored in the
+tablespace header at offset FSP_SPACE_FLAGS.  They should be 0 for
+ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. The newer row formats,
+COMPRESSED and DYNAMIC, use a file format > Antelope so they should
+have a file format number plus the DICT_TF_COMPACT bit set.
+@return	Same as input after validating it as FSP_SPACE_FLAGS.
+If there is an error, trigger assertion failure. */
+UNIV_INLINE
+ulint
+fsp_flags_validate(
+/*===============*/
+	ulint	flags)		/*!< in: tablespace flags */
+{
+	ulint	post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(flags);
+	ulint	zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+	ulint	atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags);
+	ulint	page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags);
+	ulint	unused = FSP_FLAGS_GET_UNUSED(flags);
+
+	/* Make sure there are no bits that we do not know about. */
+	ut_a(unused == 0);
+
+	/* fsp_flags is zero unless atomic_blobs is set. */
+	ut_a(flags != 1);
+	if (post_antelope) {
+		/* The Antelope row formats REDUNDANT and COMPACT did
+		not use tablespace flags, so this flag and the entire
+		4-byte field is zero for Antelope row formats. */
+		ut_a(atomic_blobs);
+	}
+
+	if (!atomic_blobs) {
+		/* Barracuda row formats COMPRESSED and DYNAMIC build on
+		the page structure introduced for the COMPACT row format
+		by allowing long fields to be broken into prefix and
+		externally stored parts. */
+		ut_a(!post_antelope);
+		ut_a(zip_ssize == 0);
+	} else {
+		ut_a(post_antelope);
+
+		/* Validate the zip shift size is within allowed range. */
+		ut_a(zip_ssize <= PAGE_ZIP_SSIZE_MAX);
+	}
+
+	/* The page size field can be used for any row type, or it may
+	be zero for an original 16k page size.
+	Validate the page shift size is within allowed range. */
+	ut_a(page_ssize <= UNIV_PAGE_SSIZE_MAX);
+	ut_a((UNIV_PAGE_SIZE == UNIV_PAGE_SIZE_ORIG) || (page_ssize));
+
+#if UNIV_FORMAT_MAX != UNIV_FORMAT_B
+# error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations."
+#endif
+
+	/* Return the flags sent in if we did not fail an assert. */
+	return(flags);
+}
+
+/********************************************************************//**
+Determine if the tablespace is compressed from dict_table_t::flags.
+@return	TRUE if compressed, FALSE if not compressed */
+UNIV_INLINE
+ibool
+fsp_flags_is_compressed(
+/*====================*/
+	ulint	flags)	/*!< in: tablespace flags */
+{
+	return(FSP_FLAGS_GET_ZIP_SSIZE(flags) != 0);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Extract the zip size from tablespace flags.
+@return	compressed page size of the file-per-table tablespace in bytes,
+or zero if the table is not compressed. */
+UNIV_INLINE
+ulint
+fsp_flags_get_zip_size(
+/*===================*/
+	ulint	flags)	/*!< in: tablespace flags */
+{
+	ulint	zip_size = 0;
+	ulint	ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+
+	/* Convert from a 'log2 minus 9' to a page size in bytes. */
+	if (ssize) {
+		zip_size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize);
+
+		ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+	}
+
+	return(zip_size);
+}
+
 /********************************************************************//**
 Extract the page size from tablespace flags.
-This feature, storing the page_ssize into the tablespace flags, is added
-to InnoDB 5.6.4.  This is here only to protect against a crash if a newer
-database is opened with this code branch.
 @return	page size of the tablespace in bytes */
 UNIV_INLINE
 ulint
@@ -60,14 +156,61 @@ fsp_flags_get_page_size(
 
 	/* Convert from a 'log2 minus 9' to a page size in bytes. */
 	if (UNIV_UNLIKELY(ssize)) {
-		page_size = (512 << ssize);
+		page_size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize);
 
-		ut_ad(page_size <= UNIV_PAGE_SIZE);
+		ut_ad(page_size <= UNIV_PAGE_SIZE_MAX);
 	} else {
 		/* If the page size was not stored, then it is the
 		original 16k. */
-		page_size = UNIV_PAGE_SIZE;
+		page_size = UNIV_PAGE_SIZE_ORIG;
 	}
 
 	return(page_size);
 }
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************************//**
+Add the page size to the tablespace flags.
+@return	tablespace flags after page size is added */
+UNIV_INLINE
+ulint
+fsp_flags_set_page_size(
+/*====================*/
+	ulint	flags,		/*!< in: tablespace flags */
+	ulint	page_size)	/*!< in: page size in bytes */
+{
+	ulint ssize = 0;
+	ulint shift;
+
+	/* Page size should be > UNIV_PAGE_SIZE_MIN */
+	ut_ad(page_size >= UNIV_PAGE_SIZE_MIN);
+	ut_ad(page_size <= UNIV_PAGE_SIZE_MAX);
+
+	if (page_size == UNIV_PAGE_SIZE_ORIG) {
+		ut_ad(0 == FSP_FLAGS_GET_PAGE_SSIZE(flags));
+		return(flags);
+	}
+
+	for (shift = UNIV_PAGE_SIZE_SHIFT_MAX;
+	     shift >= UNIV_PAGE_SIZE_SHIFT_MIN;
+	     shift--) {
+		ulint	mask = (1 << shift);
+		if (page_size & mask) {
+			ut_ad(!(page_size & ~mask));
+			ssize = shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1;
+			break;
+		}
+	}
+
+	ut_ad(ssize);
+	ut_ad(ssize <= UNIV_PAGE_SSIZE_MAX);
+
+	flags = FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize);
+
+	ut_ad(flags == fsp_flags_validate(flags));
+
+	return(flags);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h
index 496081c2346..94fd908ab0c 100644
--- a/storage/innobase/include/fsp0types.h
+++ b/storage/innobase/include/fsp0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -42,7 +42,13 @@ fseg_alloc_free_page) */
 /* @} */
 
 /** File space extent size (one megabyte) in pages */
-#define	FSP_EXTENT_SIZE		(1 << (20 - UNIV_PAGE_SIZE_SHIFT))
+#define	FSP_EXTENT_SIZE		(1048576U / UNIV_PAGE_SIZE)
+
+/** File space extent size (one megabyte) in pages for MAX page size */
+#define	FSP_EXTENT_SIZE_MAX	(1048576 / UNIV_PAGE_SIZE_MAX)
+
+/** File space extent size (one megabyte) in pages for MIN page size */
+#define	FSP_EXTENT_SIZE_MIN	(1048576 / UNIV_PAGE_SIZE_MIN)
 
 /** On a page of any file segment, data may be put starting from this
 offset */
diff --git a/storage/innobase/include/fts0ast.h b/storage/innobase/include/fts0ast.h
new file mode 100644
index 00000000000..da40e2bbc96
--- /dev/null
+++ b/storage/innobase/include/fts0ast.h
@@ -0,0 +1,257 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0ast.h
+The FTS query parser (AST) abstract syntax tree routines
+
+Created 2007/03/16/03 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FST0AST_H
+#define INNOBASE_FST0AST_H
+
+#include "mem0mem.h"
+
+/* The type of AST Node */
+enum fts_ast_type_enum {
+	FTS_AST_OPER,				/*!< Operator */
+	FTS_AST_NUMB,				/*!< Number */
+	FTS_AST_TERM,				/*!< Term (or word) */
+	FTS_AST_TEXT,				/*!< Text string */
+	FTS_AST_LIST,				/*!< Expression list */
+	FTS_AST_SUBEXP_LIST			/*!< Sub-Expression list */
+};
+
+/* The FTS query operators that we support */
+enum fts_ast_oper_enum {
+	FTS_NONE,				/*!< No operator */
+
+	FTS_IGNORE,				/*!< Ignore rows that contain
+						this word */
+
+	FTS_EXIST,				/*!< Include rows that contain
+						this word */
+
+	FTS_NEGATE,				/*!< Include rows that contain
+						this word but rank them
+						lower*/
+
+	FTS_INCR_RATING,			/*!< Increase the rank for this
+						word*/
+
+	FTS_DECR_RATING,			/*!< Decrease the rank for this
+						word*/
+
+	FTS_DISTANCE				/*!< Proximity distance */
+};
+
+/* Enum types used by the FTS parser */
+typedef enum fts_ast_type_enum fts_ast_type_t;
+typedef enum fts_ast_oper_enum fts_ast_oper_t;
+
+/* Data types used by the FTS parser */
+typedef struct fts_lexer_struct fts_lexer_t;
+typedef struct fts_ast_text_struct fts_ast_text_t;
+typedef struct fts_ast_term_struct fts_ast_term_t;
+typedef struct fts_ast_node_struct fts_ast_node_t;
+typedef struct fts_ast_list_struct fts_ast_list_t;
+typedef struct fts_ast_state_struct fts_ast_state_t;
+
+typedef ulint (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*);
+
+/********************************************************************
+Parse the string using the lexer setup within state.*/
+int
+fts_parse(
+/*======*/
+						/* out: 0 on OK, 1 on error */
+	fts_ast_state_t* state);		/*!< in: ast state instance.*/
+
+/********************************************************************
+Create an AST operator node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	fts_ast_oper_t	oper);			/*!< in: ast operator */
+/********************************************************************
+Create an AST term node, makes a copy of ptr */
+extern
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	const char*	ptr);			/*!< in: term string */
+/********************************************************************
+Create an AST text node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	const char*	ptr);			/*!< in: text string */
+/********************************************************************
+Create an AST expr list node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+	void*		arg,			/*!< in: ast state */
+	fts_ast_node_t*	expr);			/*!< in: ast expr */
+/********************************************************************
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it. */
+extern
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+						/* out: new node */
+	void*		arg,			/*!< in: ast state instance */
+	fts_ast_node_t*	expr);			/*!< in: ast expr instance */
+/********************************************************************
+Set the wildcard attribute of a term.*/
+extern
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+	fts_ast_node_t*	node);			/*!< in: term to change */
+/********************************************************************
+Set the proximity attribute of a text node. */
+
+void
+fts_ast_term_set_distance(
+/*======================*/
+	fts_ast_node_t*	node,			/*!< in/out: text node */
+	ulint		distance);		/*!< in: the text proximity
+						distance */
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+	fts_ast_node_t*	node);			/*!< in: node to free */
+/********************************************************************
+Add a sub-expression to an AST*/
+extern
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+	fts_ast_node_t*	list,			/*!< in: list node instance */
+	fts_ast_node_t*	node);			/*!< in: (sub) expr to add */
+/********************************************************************
+Print the AST node recursively.*/
+extern
+void
+fts_ast_node_print(
+/*===============*/
+	fts_ast_node_t*	node);			/*!< in: ast node to print */
+/********************************************************************
+For tracking node allocations, in case there is an during parsing.*/
+extern
+void
+fts_ast_state_add_node(
+/*===================*/
+	fts_ast_state_t*state,			/*!< in: ast state instance */
+	fts_ast_node_t*	node);			/*!< in: node to add to state */
+/********************************************************************
+Free node and expr allocations.*/
+extern
+void
+fts_ast_state_free(
+/*===============*/
+	fts_ast_state_t*state);			/*!< in: state instance
+						to free */
+/********************************************************************
+Traverse the AST.*/
+ulint
+fts_ast_visit(
+/*==========*/
+	fts_ast_oper_t		oper,		/*!< in: FTS operator */
+	fts_ast_node_t*		node,		/*!< in: instance to traverse*/
+	fts_ast_callback	visitor,	/*!< in: callback */
+	void*			arg);		/*!< in: callback arg */
+/********************************************************************
+Traverse the sub expression list.*/
+ulint
+fts_ast_visit_sub_exp(
+/*==========*/
+	fts_ast_node_t*		node,		/*!< in: instance to traverse*/
+	fts_ast_callback	visitor,	/*!< in: callback */
+	void*			arg);		/*!< in: callback arg */
+/********************************************************************
+Create a lex instance.*/
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,		/*!< in: query type */
+	const byte*	query,			/*!< in: query string */
+	ulint		query_len);		/*!< in: query string len */
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer);		/*!< in: lexer instance to
+						free */
+
+/* Query term type */
+struct fts_ast_term_struct {
+	byte*		ptr;			/*!< Pointer to term string.*/
+	ibool		wildcard;		/*!< TRUE if wild card set.*/
+};
+
+/* Query text type */
+struct fts_ast_text_struct {
+	byte*		ptr;			/*!< Pointer to term string.*/
+	ulint		distance;		/*!< > 0 if proximity distance
+						set */
+};
+
+/* The list of nodes in an expr list */
+struct fts_ast_list_struct {
+	fts_ast_node_t*	head;			/*!< Children list head */
+	fts_ast_node_t*	tail;			/*!< Children list tail */
+};
+
+/* FTS AST node to store the term, text, operator and sub-expressions.*/
+struct fts_ast_node_struct {
+	fts_ast_type_t	type;			/*!< The type of node */
+	fts_ast_text_t	text;			/*!< Text node */
+	fts_ast_term_t	term;			/*!< Term node */
+	fts_ast_oper_t	oper;			/*!< Operator value */
+	fts_ast_list_t	list;			/*!< Expression list */
+	fts_ast_node_t*	next;			/*!< Link for expr list */
+	fts_ast_node_t*	next_alloc;		/*!< For tracking allocations */
+};
+
+/* To track state during parsing */
+struct fts_ast_state_struct {
+	mem_heap_t*	heap;			/*!< Heap to use for alloc */
+	fts_ast_node_t*	root;			/*!< If all goes OK, then this
+						will point to the root.*/
+
+	fts_ast_list_t	list;			/*!< List of nodes allocated */
+
+	fts_lexer_t*	lexer;			/*!< Lexer callback + arg */
+};
+
+#endif /* INNOBASE_FSTS0AST_H */
diff --git a/storage/innobase/include/fts0blex.h b/storage/innobase/include/fts0blex.h
new file mode 100644
index 00000000000..6f8d6eaeb29
--- /dev/null
+++ b/storage/innobase/include/fts0blex.h
@@ -0,0 +1,349 @@
+#ifndef fts0bHEADER_H
+#define fts0bHEADER_H 1
+#define fts0bIN_HEADER 1
+
+#line 6 "../include/fts0blex.h"
+
+#line 8 "../include/fts0blex.h"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif	/* defined (__STDC__) */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	int yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void fts0brestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0b_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0b_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0bpop_buffer_state (yyscan_t yyscanner );
+
+YY_BUFFER_STATE fts0b_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0balloc (yy_size_t ,yyscan_t yyscanner );
+void *fts0brealloc (void *,yy_size_t ,yyscan_t yyscanner );
+void fts0bfree (void * ,yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0bwrap(n) 1
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int fts0blex_init (yyscan_t* scanner);
+
+int fts0blex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int fts0blex_destroy (yyscan_t yyscanner );
+
+int fts0bget_debug (yyscan_t yyscanner );
+
+void fts0bset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner );
+
+void fts0bset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0bget_in (yyscan_t yyscanner );
+
+void fts0bset_in  (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0bget_out (yyscan_t yyscanner );
+
+void fts0bset_out  (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0bget_leng (yyscan_t yyscanner );
+
+char *fts0bget_text (yyscan_t yyscanner );
+
+int fts0bget_lineno (yyscan_t yyscanner );
+
+void fts0bset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int fts0bwrap (yyscan_t yyscanner );
+#else
+extern int fts0bwrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0blex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0blex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#line 73 "fts0blex.l"
+
+
+#line 348 "../include/fts0blex.h"
+#undef fts0bIN_HEADER
+#endif /* fts0bHEADER_H */
diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h
new file mode 100644
index 00000000000..e515772bdbd
--- /dev/null
+++ b/storage/innobase/include/fts0fts.h
@@ -0,0 +1,1031 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0fts.h
+Full text search header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef fts0fts_h
+#define fts0fts_h
+
+#include "univ.i"
+
+#include "data0type.h"
+#include "data0types.h"
+#include "dict0types.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "ut0rbt.h"
+#include "ut0wqueue.h"
+#include "que0types.h"
+#include "ft_global.h"
+
+/** "NULL" value of a document id. */
+#define FTS_NULL_DOC_ID			0
+
+/** FTS hidden column that is used to map to and from the row */
+#define FTS_DOC_ID_COL_NAME		"FTS_DOC_ID"
+
+/** The name of the index created by FTS */
+#define FTS_DOC_ID_INDEX_NAME		"FTS_DOC_ID_INDEX"
+
+#define FTS_DOC_ID_INDEX_NAME_LEN	16
+
+/** Doc ID is a 8 byte value */
+#define FTS_DOC_ID_LEN			8
+
+/** The number of fields to sort when we build FT index with
+FIC. Three fields are sort: (word, doc_id, position) */
+#define FTS_NUM_FIELDS_SORT		3
+
+/** Maximum number of rows in a table, smaller than which, we will
+optimize using a 4 byte Doc ID for FIC merge sort to reduce sort size */
+#define MAX_DOC_ID_OPT_VAL		1073741824
+
+/** Document id type. */
+typedef ib_uint64_t doc_id_t;
+
+/** doc_id_t printf format */
+#define FTS_DOC_ID_FORMAT	IB_ID_FMT
+
+/** Convert document id to the InnoDB (BIG ENDIAN) storage format. */
+#define fts_write_doc_id(d, s)	mach_write_to_8(d, s)
+
+/** Read a document id to internal format. */
+#define fts_read_doc_id(s)	mach_read_from_8(s)
+
+/** Bind the doc id to a variable */
+#define fts_bind_doc_id(i, n, v) pars_info_bind_int8_literal(i, n, v)
+
+/** Defines for FTS query mode, they have the same values as
+those defined in mysql file ft_global.h */
+#define FTS_NL		0
+#define FTS_BOOL	1
+#define FTS_SORTED	2
+#define FTS_EXPAND	4
+#define FTS_PROXIMITY	8
+#define FTS_PHRASE	16
+
+#define FTS_INDEX_TABLE_IND_NAME	"FTS_INDEX_TABLE_IND"
+
+/** Threshold where our optimize thread automatically kicks in */
+#define FTS_OPTIMIZE_THRESHOLD		10000000
+
+#define FTS_DOC_ID_MAX_STEP		10000
+/** Variable specifying the FTS parallel sort degree */
+extern ulong		fts_sort_pll_degree;
+
+/** Variable specifying the number of word to optimize for each optimize table
+call */
+extern ulong		fts_num_word_optimize;
+
+/** Variable specifying whether we do additional FTS diagnostic printout
+in the log */
+extern char		fts_enable_diag_print;
+
+/** FTS rank type, which will be between 0 .. 1 inclusive */
+typedef float 		fts_rank_t;
+
+/** Type of a row during a transaction. FTS_NOTHING means the row can be
+forgotten from the FTS system's POV, FTS_INVALID is an internal value used
+to mark invalid states.
+
+NOTE: Do not change the order or value of these, fts_trx_row_get_new_state
+depends on them being exactly as they are. */
+typedef enum {
+	FTS_INSERT = 0,
+	FTS_MODIFY,
+	FTS_DELETE,
+	FTS_NOTHING,
+	FTS_INVALID
+} fts_row_state;
+
+/** The FTS table types. */
+enum fts_table_type_enum {
+	FTS_INDEX_TABLE,		/*!< FTS auxiliary table that is
+					specific to a particular FTS index
+					on a table */
+
+	FTS_COMMON_TABLE		/*!< FTS auxiliary table that is common
+					for all FTS index on a table */
+};
+
+typedef struct fts_struct fts_t;
+typedef struct fts_doc_struct fts_doc_t;
+typedef struct fts_trx_struct fts_trx_t;
+typedef struct fts_table_struct fts_table_t;
+typedef struct fts_cache_struct fts_cache_t;
+typedef struct fts_token_struct fts_token_t;
+typedef struct fts_string_struct fts_string_t;
+typedef	struct fts_result_struct fts_result_t;
+typedef struct fts_ranking_struct fts_ranking_t;
+typedef struct fts_trx_row_struct fts_trx_row_t;
+typedef struct fts_doc_ids_struct fts_doc_ids_t;
+typedef enum fts_table_type_enum fts_table_type_t;
+typedef struct fts_trx_table_struct fts_trx_table_t;
+typedef	struct fts_savepoint_struct fts_savepoint_t;
+typedef struct fts_index_cache_struct fts_index_cache_t;
+
+
+/** Initialize the "fts_table" for internal query into FTS auxiliary
+tables */
+#define FTS_INIT_FTS_TABLE(fts_table, m_suffix, m_type, m_table)\
+do {								\
+	(fts_table)->suffix = m_suffix;				\
+        (fts_table)->type = m_type;				\
+        (fts_table)->table_id = m_table->id;			\
+        (fts_table)->parent = m_table->name;			\
+        (fts_table)->table = m_table;				\
+} while (0);
+
+#define FTS_INIT_INDEX_TABLE(fts_table, m_suffix, m_type, m_index)\
+do {								\
+	(fts_table)->suffix = m_suffix;				\
+        (fts_table)->type = m_type;				\
+        (fts_table)->table_id = m_index->table->id;		\
+        (fts_table)->parent = m_index->table->name;		\
+        (fts_table)->table = m_index->table;			\
+        (fts_table)->index_id = m_index->id;			\
+} while (0);
+
+/** Information about changes in a single transaction affecting
+the FTS system. */
+struct fts_trx_struct {
+	trx_t*		trx;		/*!< InnoDB transaction */
+
+	ib_vector_t*	savepoints;	/*!< Active savepoints, must have at
+					least one element, the implied
+					savepoint */
+	ib_vector_t*	last_stmt;	/*!< last_stmt */
+
+	mem_heap_t*	heap;		/*!< heap */
+};
+
+/** Information required for transaction savepoint handling. */
+struct fts_savepoint_struct {
+	char*		name;		/*!< First entry is always NULL, the
+					default instance. Otherwise the name
+					of the savepoint */
+
+	ib_rbt_t*	tables;		/*!< Modified FTS tables */
+};
+
+/** Information about changed rows in a transaction for a single table. */
+struct fts_trx_table_struct {
+	dict_table_t*	table;		/*!< table */
+
+	fts_trx_t*	fts_trx;	/*!< link to parent */
+
+	ib_rbt_t*	rows;		/*!< rows changed; indexed by doc-id,
+					cells are fts_trx_row_t* */
+
+	fts_doc_ids_t*	added_doc_ids;	/*!< list of added doc ids (NULL until
+					the first addition) */
+
+					/*!< for adding doc ids */
+	que_t*		docs_added_graph;
+};
+
+/** Information about one changed row in a transaction. */
+struct fts_trx_row_struct {
+	doc_id_t	doc_id;		/*!< Id of the ins/upd/del document */
+
+	fts_row_state	state;		/*!< state of the row */
+
+	ib_vector_t*	fts_indexes;	/*!< The indexes that are affected */
+};
+
+/** List of document ids that were added during a transaction. This
+list is passed on to a background 'Add' thread and OPTIMIZE, so it
+needs its own memory heap. */
+struct fts_doc_ids_struct {
+	ib_vector_t*	doc_ids;	/*!< document ids (each element is
+					of type doc_id_t). */
+
+	ib_alloc_t*	self_heap;	/*!< Allocator used to create an
+					instance of this type and the
+					doc_ids vector */
+};
+
+// FIXME: Get rid of this if possible.
+/** Since MySQL's character set support for Unicode is woefully inadequate
+(it supports basic operations like isalpha etc. only for 8-bit characters),
+we have to implement our own. We use UTF-16 without surrogate processing
+as our in-memory format. This typedef is a single such character. */
+typedef unsigned short ib_uc_t;
+
+/** An UTF-16 ro UTF-8 string. */
+struct fts_string_struct {
+	byte*		f_str;		/*!< string, not necessary terminated in
+					any way */
+	ulint		f_len;		/*!< Length of the string in bytes */
+	ulint		f_n_char;	/*!< Number of characters */
+};
+
+/** Query ranked doc ids. */
+struct fts_ranking_struct {
+	doc_id_t	doc_id;		/*!< Document id */
+
+	fts_rank_t	rank;		/*!< Rank is between 0 .. 1 */
+
+	ib_rbt_t*	words;		/*!< RB Tree of type byte*, this
+					contains the words that were queried
+					and found in this document */
+};
+
+/** Query result. */
+struct fts_result_struct {
+	ib_rbt_node_t*	current;	/*!< Current element */
+
+	ib_rbt_t*	rankings_by_id;	/*!< RB tree of type fts_ranking_t
+					indexed by doc id */
+	ib_rbt_t*	rankings_by_rank;/*!< RB tree of type fts_ranking_t
+					indexed by rank */
+};
+
+/** This is used to generate the FTS auxiliary table name, we need the
+table id and the index id to generate the column specific FTS auxiliary
+table name. */
+struct fts_table_struct {
+	const char*	parent;		/*!< Parent table name, this is
+					required only for the database
+					name */
+
+	fts_table_type_t
+			type;		/*!< The auxiliary table type */
+
+	table_id_t	table_id;	/*!< The table id */
+
+	index_id_t	index_id;	/*!< The index id */
+
+	const char*	suffix;		/*!< The suffix of the fts auxiliary
+					table name, can be NULL, not used
+					everywhere (yet) */
+	const dict_table_t*
+			table;		/*!< Parent table */
+	CHARSET_INFO*	charset;	/*!< charset info if it is for FTS
+					index auxiliary table */
+};
+
+enum	fts_status {
+	BG_THREAD_STOP = 1,	 	/*!< TRUE if the FTS background thread
+					has finished reading the ADDED table,
+					meaning more items can be added to
+					the table. */
+
+	BG_THREAD_READY = 2,		/*!< TRUE if the FTS background thread
+					is ready */
+
+	ADD_THREAD_STARTED = 4,		/*!< TRUE if the FTS add thread
+					has started */
+
+	ADDED_TABLE_SYNCED = 8,		/*!< TRUE if the ADDED table record is
+					sync-ed after crash recovery */
+
+	TABLE_DICT_LOCKED = 16		/*!< Set if the table has
+					dict_sys->mutex */
+};
+
+typedef	enum fts_status	fts_status_t;
+
+/** The state of the FTS sub system. */
+struct fts_struct {
+					/*!< mutex protecting bg_threads* and
+					fts_add_wq. */
+	mutex_t		bg_threads_mutex;
+
+	ulint		bg_threads;	/*!< number of background threads
+					accessing this table */
+
+					/*!< TRUE if background threads running
+					should stop themselves */
+	ulint		fts_status;	/*!< Status bit regarding fts
+					running state */
+
+	ib_wqueue_t*	add_wq;		/*!< Work queue for scheduling jobs
+					for the FTS 'Add' thread, or NULL
+					if the thread has not yet been
+					created. Each work item is a
+					fts_trx_doc_ids_t*. */
+
+	fts_cache_t*	cache;		/*!< FTS memory buffer for this table,
+					or NULL if the table has no FTS
+					index. */
+
+	ulint		doc_col;	/*!< FTS doc id hidden column number
+					in the CLUSTERED index. */
+
+	ib_vector_t*	indexes;	/*!< Vector of FTS indexes, this is
+					mainly for caching purposes. */
+	mem_heap_t*	fts_heap;	/*!< heap for fts_struct allocation */
+};
+
+typedef struct fts_stopword_struct	fts_stopword_t;
+
+/** status bits for fts_stopword_t status field. */
+#define STOPWORD_NOT_INIT               0x1
+#define STOPWORD_OFF                    0x2
+#define STOPWORD_FROM_DEFAULT           0x4
+#define STOPWORD_USER_TABLE             0x8
+
+extern const char*	fts_default_stopword[];
+
+/** Variable specifying the maximum FTS cache size for each table */
+extern ulong		fts_max_cache_size;
+
+/** Variable specifying the maximum FTS max token size */
+extern ulong		fts_max_token_size;
+
+/** Variable specifying the minimum FTS max token size */
+extern ulong		fts_min_token_size;
+
+/** Maximum possible Fulltext word length */
+#define FTS_MAX_WORD_LEN	3 * HA_FT_MAXCHARLEN
+
+/** Variable specifying the table that has Fulltext index to display its
+content through information schema table */
+extern char*		fts_internal_tbl_name;
+
+#define	fts_que_graph_free(graph)			\
+do {							\
+	mutex_enter(&dict_sys->mutex);			\
+	que_graph_free(graph);				\
+	mutex_exit(&dict_sys->mutex);			\
+} while (0)
+
+/******************************************************************//**
+Create a FTS cache. */
+UNIV_INTERN
+fts_cache_t*
+fts_cache_create(
+/*=============*/
+	dict_table_t*	table);			/*!< table owns the FTS cache */
+
+/******************************************************************//**
+Create a FTS index cache.
+@return Index Cache */
+UNIV_INTERN
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+	dict_table_t*	table,			/*!< in: table with FTS index */
+	dict_index_t*	index);			/*!< in: FTS index */
+
+/******************************************************************//**
+Get the next available document id. This function creates a new
+transaction to generate the document id. */
+UNIV_INTERN
+ulint
+fts_get_next_doc_id(
+/*================*/
+						/*!< out: DB_SUCCESS if OK */
+	const dict_table_t*	table,		/*!< in: table */
+	doc_id_t*		doc_id);	/*!< out: new document id */
+
+/*********************************************************************//**
+Update the next and last Doc ID in the CONFIG table to be the input
+"doc_id" value (+ 1). We would do so after each FTS index build or
+table truncate */
+UNIV_INTERN
+void
+fts_update_next_doc_id(
+/*===================*/
+	const dict_table_t*	table,		/*!< in: table */
+	const char*		table_name,	/*!< in: table name */
+	doc_id_t		doc_id);	/*!< in: DOC ID to set */
+
+/******************************************************************//**
+Update the last document id. This function could create a new
+transaction to update the last document id. */
+UNIV_INTERN
+ulint
+fts_update_sync_doc_id(
+/*===================*/
+						/*!< out: DB_SUCCESS if OK */
+	const dict_table_t*	table,		/*!< in: table */
+	const char*		table_name,	/*!< in: table name */
+	doc_id_t		doc_id,		/*!< in: last document id */
+	trx_t*			trx);		/*!< in: update trx */
+
+/******************************************************************//**
+Create a new document id .
+@return DB_SUCCESS if all went well else error */
+UNIV_INTERN
+ulint
+fts_create_doc_id(
+/*==============*/
+	dict_table_t*	table,			/*!< in: row is of this
+						table. */
+	dtuple_t*	row,			/*!< in/out: add doc id
+						value to this row. This is the
+						current row that is being
+						inserted. */
+	mem_heap_t*	heap);			/*!< in: heap */
+
+/******************************************************************//**
+Create a new fts_doc_ids_t.
+@return new fts_doc_ids_t. */
+UNIV_INTERN
+fts_doc_ids_t*
+fts_doc_ids_create(void);
+/*=====================*/
+
+/******************************************************************//**
+Free a fts_doc_ids_t. */
+UNIV_INTERN
+void
+fts_doc_ids_free(
+/*=============*/
+	fts_doc_ids_t*	doc_ids);		/*!< in: doc_ids to free */
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+UNIV_INTERN
+void
+fts_trx_add_op(
+/*===========*/
+	trx_t*		trx,			/*!< in: InnoDB transaction */
+	dict_table_t*	table,			/*!< in: table */
+	doc_id_t	doc_id,			/*!< in: doc id */
+	fts_row_state	state,			/*!< in: state of the row */
+	ib_vector_t*	fts_indexes)		/*!< in: FTS indexes affected
+						(NULL=all) */
+	__attribute__((nonnull(1,2)));
+
+/******************************************************************//**
+Free an FTS trx. */
+UNIV_INTERN
+void
+fts_trx_free(
+/*=========*/
+	fts_trx_t*	fts_trx);		/*!< in, own: FTS trx */
+
+/******************************************************************//**
+Creates the common ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been
+called before this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_create_common_tables(
+/*=====================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const dict_table_t*
+			table,			/*!< in: table with one FTS
+						index */
+	const char*	name,			/*!< in: table name */
+	ibool		skip_doc_id_index);
+						/*!< in: Skip index on doc id */
+/******************************************************************//**
+Wrapper function of fts_create_index_tables_low(), create auxiliary
+tables for an FTS index
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_create_index_tables(
+/*====================*/
+	trx_t*			trx,		/*!< in: transaction handle */
+	const dict_index_t*	index);		/*!< in: the FTS index
+						instance */
+
+/******************************************************************//**
+Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table. row_mysql_lock_data_dictionary must have
+been called before this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_create_index_tables_low(
+/*========================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const dict_index_t*
+			index,			/*!< in: the FTS index
+						instance */
+	const char*	table_name,		/*!< in: the table name */
+	table_id_t	table_id);		/*!< in: the table id */
+
+/******************************************************************//**
+Add the FTS document id hidden column. */
+UNIV_INTERN
+void
+fts_add_doc_id_column(
+/*==================*/
+	dict_table_t*	table);			/*!< in/out: Table with
+						FTS index */
+
+/*********************************************************************//**
+Drops the ancillary tables needed for supporting an FTS index on the
+given table. row_mysql_lock_data_dictionary must have been called before
+this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_drop_tables(
+/*============*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_table_t*	table);			/*!< in: table has the FTS
+						index */
+
+/******************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_commit(
+/*=======*/
+	trx_t*		trx);			/*!< in: transaction */
+
+/*******************************************************************//**
+FTS Query entry point.
+@return DB_SUCCESS if successful otherwise error code */
+UNIV_INTERN
+ulint
+fts_query(
+/*======*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index,			/*!< in: FTS index to search */
+	uint		flags,			/*!< in: FTS search mode */
+	const byte*	query,			/*!< in: FTS query */
+	ulint		query_len,		/*!< in: FTS query string len
+						in bytes */
+	fts_result_t**	result);		/*!< out: query result, to be
+						freed by the caller.*/
+
+/******************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value. */
+UNIV_INTERN
+float
+fts_retrieve_ranking(
+/*=================*/
+	fts_result_t*	result,			/*!< in: FTS result structure */
+	doc_id_t	doc_id);		/*!< in: the interested document
+						doc_id */
+
+/******************************************************************//**
+FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
+UNIV_INTERN
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+	fts_result_t*	result);		/*!< out: result instance
+						to sort.*/
+
+/******************************************************************//**
+FTS Query free result, returned by fts_query(). */
+UNIV_INTERN
+void
+fts_query_free_result(
+/*==================*/
+	fts_result_t*	result);		/*!< in: result instance
+						to free.*/
+
+/******************************************************************//**
+Extract the doc id from the FTS hidden column. */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+	dict_table_t*	table,			/*!< in: table */
+	dtuple_t*	row);			/*!< in: row whose FTS doc id we
+						want to extract.*/
+
+/******************************************************************//**
+Extract the doc id from the FTS hidden column. */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_rec(
+/*====================*/
+	dict_table_t*	table,			/*!< in: table */
+	const rec_t*	rec,			/*!< in: rec */
+	mem_heap_t*	heap);			/*!< in: heap */
+
+/******************************************************************//**
+Update the query graph with a new document id.
+@return Doc ID used */
+UNIV_INTERN
+doc_id_t
+fts_update_doc_id(
+/*==============*/
+	dict_table_t*	table,			/*!< in: table */
+	upd_field_t*	ufield,			/*!< out: update node */
+	doc_id_t*	next_doc_id);		/*!< out: buffer for writing */
+
+/******************************************************************//**
+FTS initialize. */
+UNIV_INTERN
+void
+fts_startup(void);
+/*==============*/
+
+/******************************************************************//**
+Signal FTS threads to initiate shutdown. */
+UNIV_INTERN
+void
+fts_start_shutdown(
+/*===============*/
+	dict_table_t*	table,			/*!< in: table with FTS
+						indexes */
+	fts_t*		fts);			/*!< in: fts instance to
+						shutdown */
+
+/******************************************************************//**
+Wait for FTS threads to shutdown. */
+UNIV_INTERN
+void
+fts_shutdown(
+/*=========*/
+	dict_table_t*	table,			/*!< in: table with FTS
+						indexes */
+	fts_t*		fts);			/*!< in: fts instance to
+						shutdown */
+
+/******************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+UNIV_INTERN
+fts_t*
+fts_create(
+/*=======*/
+	dict_table_t*	table);			/*!< out: table with FTS
+						indexes */
+
+/**********************************************************************//**
+Free the FTS resources. */
+UNIV_INTERN
+void
+fts_free(
+/*=====*/
+	dict_table_t*   table);			/*!< in/out: table with
+						FTS indexes */
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+fts_optimize_table(
+/*===============*/
+	dict_table_t*	table);			/*!< in: table to optimiza */
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. */
+UNIV_INTERN
+void
+fts_optimize_init(void);
+/*====================*/
+
+/**********************************************************************//**
+Check whether the work queue is initialized.
+@return TRUE if optimze queue is initialized. */
+UNIV_INTERN
+ibool
+fts_optimize_is_init(void);
+/*======================*/
+
+/****************************************************************//**
+Drops index ancillary tables for a FTS index
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_drop_index_tables(
+/*==================*/
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index);			/*!< in: Index to drop */
+
+/******************************************************************//**
+Remove the table from the OPTIMIZER's list. We do wait for
+acknowledgement from the consumer of the message. */
+UNIV_INTERN
+void
+fts_optimize_remove_table(
+/*======================*/
+	dict_table_t*	table);			/*!< in: table to remove */
+
+/**********************************************************************//**
+Signal the optimize thread to prepare for shutdown. */
+UNIV_INTERN
+void
+fts_optimize_start_shutdown(void);
+/*==============================*/
+
+/**********************************************************************//**
+Inform optimize to clean up. */
+UNIV_INTERN
+void
+fts_optimize_end(void);
+/*===================*/
+
+/**********************************************************************//**
+Take a FTS savepoint.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+void
+fts_savepoint_take(
+/*===============*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	name);			/*!< in: savepoint name */
+
+/**********************************************************************//**
+Refresh last statement savepoint.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+	trx_t*		trx);			/*!< in: transaction */
+
+/**********************************************************************//**
+Release the savepoint data identified by  name. */
+UNIV_INTERN
+void
+fts_savepoint_release(
+/*==================*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	name);			/*!< in: savepoint name */
+
+/**********************************************************************//**
+Free the FTS cache. */
+UNIV_INTERN
+void
+fts_cache_destroy(
+/*==============*/
+	fts_cache_t*	cache);			/*!< in: cache*/
+
+/*********************************************************************//**
+Clear cache. If the shutdown flag is TRUE then the cache can contain
+data that needs to be freed. For regular clear as part of normal
+working we assume the caller has freed all resources. */
+UNIV_INTERN
+void
+fts_cache_clear(
+/*============*/
+	fts_cache_t*	cache,			/*!< in: cache */
+	ibool		free_words);		/*!< in: TRUE if free
+						in memory word cache. */
+
+/*********************************************************************//**
+Initialize things in cache. */
+UNIV_INTERN
+void
+fts_cache_init(
+/*===========*/
+	fts_cache_t*	cache);			/*!< in: cache */
+
+/*********************************************************************//**
+Rollback to and including savepoint indentified by name. */
+UNIV_INTERN
+void
+fts_savepoint_rollback(
+/*===================*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	name);			/*!< in: savepoint name */
+
+/*********************************************************************//**
+Rollback to and including savepoint indentified by name. */
+UNIV_INTERN
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+	trx_t*		trx);			/*!< in: transaction */
+
+/***********************************************************************//**
+Drop all orphaned FTS auxiliary tables, those that don't have a parent
+table or FTS index defined on them. */
+UNIV_INTERN
+void
+fts_drop_orphaned_tables(void);
+/*==========================*/
+
+/******************************************************************//**
+Since we do a horizontal split on the index table, we need to drop the
+all the split tables. */
+UNIV_INTERN
+ulint
+fts_drop_index_split_tables(
+/*========================*/
+						/*!< out: DB_SUCCESS
+						or error code */
+	trx_t*		trx,			/*!< in: transaction */
+	dict_index_t*	index);			/*!< in: fts instance */
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+fts_sync_table(
+/*===========*/
+	dict_table_t*	table);			/*!< in: table */
+
+/****************************************************************//**
+Free the query graph but check whether dict_sys->mutex is already
+held */
+UNIV_INTERN
+void
+fts_que_graph_free_check_lock(
+/*==========================*/
+	fts_table_t*		fts_table,	/*!< in: FTS table */
+	const fts_index_cache_t*index_cache,	/*!< in: FTS index cache */
+	que_t*			graph);		/*!< in: query graph */
+
+/****************************************************************//**
+Create an FTS index cache. */
+UNIV_INTERN
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+	dict_index_t*		index);		/*!< in: FTS index */
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the ADDED and the CONFIG table
+@return initial Doc ID */
+UNIV_INTERN
+doc_id_t
+fts_init_doc_id(
+/*============*/
+	const dict_table_t*		table);	/*!< in: table */
+
+/******************************************************************//**
+compare two character string according to their charset. */
+extern
+int
+innobase_fts_text_cmp(
+/*==================*/
+	const void*	cs,			/*!< in: Character set */
+	const void*	p1,			/*!< in: key */
+	const void*	p2);			/*!< in: node */
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+extern
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+        CHARSET_INFO*	cs,			/*!< in: Character set */
+	char*		src,			/*!< in: string to put in
+						lower case */
+	size_t		src_len,		/*!< in: input string length */
+	char*		dst,			/*!< in: buffer for result
+						string */
+	size_t		dst_len);		/*!< in: buffer size */
+
+
+/******************************************************************//**
+compare two character string according to their charset. */
+extern
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+	const void*	cs,			/*!< in: Character set */
+	const void*	p1,			/*!< in: key */
+	const void*	p2);			/*!< in: node */
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+extern
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+	CHARSET_INFO*	charset,		/*!< in: Character set */
+	byte*		start,			/*!< in: start of text */
+	byte*		end,			/*!< in: one character past
+						end of text */
+	fts_string_t*	token,			/*!< out: token's text */
+	ulint*		offset);		/*!< out: offset to token,
+						measured as characters from
+						'start' */
+
+/*********************************************************************//**
+Fetch COUNT(*) from specified table.
+@return the number of rows in the table */
+UNIV_INTERN
+ulint
+fts_get_rows_count(
+/*===============*/
+	fts_table_t*	fts_table);		/*!< in: fts table to read */
+
+/*************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+UNIV_INTERN
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+	dict_table_t*	table);			/*!< in: user table */
+
+/******************************************************************//**
+Check whether user supplied stopword table exists and is of
+the right format.
+@return TRUE if the table qualifies */
+UNIV_INTERN
+ibool
+fts_valid_stopword_table(
+/*=====================*/
+	const char*	stopword_table_name);	/*!< in: Stopword table
+						name */
+/****************************************************************//**
+This function loads specified stopword into FTS cache
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fts_load_stopword(
+/*==============*/
+	const dict_table_t*
+			table,			/*!< in: Table with FTS */
+	trx_t*		trx,			/*!< in: Transaction */
+	const char*	global_stopword_table,	/*!< in: Global stopword table
+						name */
+	const char*	session_stopword_table,	/*!< in: Session stopword table
+						name */
+	ibool		stopword_is_on,		/*!< in: Whether stopword
+						option is turned on/off */
+	ibool		reload);		/*!< in: Whether it is during
+						reload of FTS table */
+
+/****************************************************************//**
+Create the vector of fts_get_doc_t instances.
+@return vector of fts_get_doc_t instances */
+UNIV_INTERN
+ib_vector_t*
+fts_get_docs_create(
+/*================*/
+	fts_cache_t*	cache);			/*!< in: fts cache */
+
+/****************************************************************//**
+Read the rows from the FTS index
+@return vector of rows fetched */
+UNIV_INTERN
+ulint
+fts_table_fetch_doc_ids(
+/*====================*/
+	trx_t*		trx,			/*!< in: transaction */
+	fts_table_t*	fts_table,		/*!< in: aux table */
+	fts_doc_ids_t*	doc_ids);		/*!< in: For collecting
+						doc ids */
+/****************************************************************//**
+This function loads the documents in "ADDED" table into FTS cache,
+it also loads the stopword info into the FTS cache.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+ibool
+fts_init_index(
+/*===========*/
+	dict_table_t*	table,			/*!< in: Table with FTS */
+	ibool		has_cache_lock);	/*!< in: Whether we already
+						have cache lock */
+/*******************************************************************//**
+Add a newly create index in FTS cache */
+UNIV_INTERN
+void
+fts_add_index(
+/*==========*/
+	dict_index_t*	index,			/*!< FTS index to be added */
+	dict_table_t*	table);			/*!< table */
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fts_drop_index(
+/*===========*/
+	dict_table_t*	table,	/*!< in: Table where indexes are dropped */
+	dict_index_t*	index,	/*!< in: Index to be dropped */
+	trx_t*		trx);	/*!< in: Transaction for the drop */
+
+/*******************************************************************//**
+Check indexes in the fts->indexes is also present in index cache and
+table->indexes list
+@return TRUE if all indexes match */
+UNIV_INTERN
+ibool
+fts_check_cached_index(
+/*===================*/
+	dict_table_t*	table);  /*!< in: Table where indexes are dropped */
+#endif /*!< fts0fts.h */
+
diff --git a/storage/innobase/include/fts0opt.h b/storage/innobase/include/fts0opt.h
new file mode 100644
index 00000000000..92eaf8270d2
--- /dev/null
+++ b/storage/innobase/include/fts0opt.h
@@ -0,0 +1,37 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0opt.h
+Full Text Search optimize thread
+
+Created 2011-02-15 Jimmy Yang
+***********************************************************************/
+#ifndef INNODB_FTS0OPT_H
+#define INNODB_FTS0OPT_H
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record. */
+UNIV_INTERN
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+                                        /* out: always returns non-NULL */
+        void*           row,		/* in: sel_node_t* */
+        void*           user_arg);	/* in: pointer to ib_vector_t */
+#endif
diff --git a/storage/innobase/include/fts0pars.h b/storage/innobase/include/fts0pars.h
new file mode 100644
index 00000000000..ae5a55b2455
--- /dev/null
+++ b/storage/innobase/include/fts0pars.h
@@ -0,0 +1,74 @@
+
+/* A Bison parser, made by GNU Bison 2.4.1.  */
+
+/* Skeleton interface for Bison's Yacc-like parsers in C
+
+      Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+   Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     FTS_OPER = 258,
+     FTS_TEXT = 259,
+     FTS_TERM = 260,
+     FTS_NUMB = 261
+   };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 1676 of yacc.c  */
+#line 36 "fts0pars.y"
+
+	int		oper;
+	char*		token;
+	fts_ast_node_t*	node;
+
+
+
+/* Line 1676 of yacc.c  */
+#line 66 "fts0pars.h"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+
+
diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h
new file mode 100644
index 00000000000..8524f988e47
--- /dev/null
+++ b/storage/innobase/include/fts0priv.h
@@ -0,0 +1,613 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.h
+Full text search internal header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PRIV_H
+#define INNOBASE_FTS0PRIV_H
+
+#include "dict0dict.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "que0types.h"
+#include "fts0types.h"
+
+/* The various states of the FTS sub system pertaining to a table with
+FTS indexes defined on it. */
+enum fts_table_state_enum {
+					/* !<This must be 0 since we insert
+					a hard coded '0' at create time
+					to the config table */
+
+	FTS_TABLE_STATE_RUNNING = 0,	/*!< Auxiliary tables created OK */
+
+	FTS_TABLE_STATE_OPTIMIZING,	/*!< This is a substate of RUNNING */
+
+	FTS_TABLE_STATE_DELETED		/*!< All aux tables to be dropped when
+					it's safe to do so */
+};
+
+typedef enum fts_table_state_enum fts_table_state_t;
+
+/** The default time to wait for the background thread (in microsecnds). */
+#define FTS_MAX_BACKGROUND_THREAD_WAIT		10000
+
+/** Maximum number of iterations to wait before we complain */
+#define FTS_BACKGROUND_THREAD_WAIT_COUNT	1000
+
+/** The maximum length of the config table's value column in bytes */
+#define FTS_MAX_CONFIG_NAME_LEN			64
+
+/** The maximum length of the config table's value column in bytes */
+#define FTS_MAX_CONFIG_VALUE_LEN		1024
+
+/** Approx. upper limit of ilist length in bytes. */
+#define FTS_ILIST_MAX_SIZE			(64 * 1024)
+
+/** FTS config table name parameters */
+
+/** The number of seconds after which an OPTIMIZE run will stop */
+#define FTS_OPTIMIZE_LIMIT_IN_SECS	"optimize_checkpoint_limit"
+
+/** The next doc id */
+#define FTS_SYNCED_DOC_ID		"synced_doc_id"
+
+/** The last word that was OPTIMIZED */
+#define FTS_LAST_OPTIMIZED_WORD		"last_optimized_word"
+
+/** Total number of documents that have been deleted. The next_doc_id
+minus this count gives us the total number of documents. */
+#define FTS_TOTAL_DELETED_COUNT		"deleted_doc_count"
+
+/** Total number of words parsed from all documents */
+#define FTS_TOTAL_WORD_COUNT		"total_word_count"
+
+/** Start of optimize of an FTS index */
+#define FTS_OPTIMIZE_START_TIME		"optimize_start_time"
+
+/** End of optimize for an FTS index */
+#define FTS_OPTIMIZE_END_TIME		"optimize_end_time"
+
+/** User specified stopword table name */
+#define	FTS_STOPWORD_TABLE_NAME		"stopword_table_name"
+
+/** Whether to use (turn on/off) stopword */
+#define	FTS_USE_STOPWORD		"use_stopword"
+
+/** State of the FTS system for this table. It can be one of
+ RUNNING, OPTIMIZING, DELETED. */
+#define FTS_TABLE_STATE			"table_state"
+
+/** The minimum length of an FTS auxiliary table names's id component
+e.g., For an auxiliary table name
+
+	FTS_<TABLE_ID>_SUFFIX
+
+This constant is for the minimum length required to store the <TABLE_ID>
+component.
+*/
+#define FTS_AUX_MIN_TABLE_ID_LENGTH	48
+
+/** Maximum length of an integer stored in the config table value column. */
+#define FTS_MAX_INT_LEN			32
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+que_t*
+fts_parse_sql(
+/*==========*/
+	fts_table_t*	fts_table,	/*!< in: FTS aux table */
+	pars_info_t*	info,		/*!< in: info struct, or NULL */
+	const char*	sql);		/*!< in: SQL string to evaluate */
+/******************************************************************//**
+Evaluate a parsed SQL statement
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_eval_sql(
+/*=========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t*		graph);		/*!< in: Parsed statement */
+/******************************************************************//**
+Construct the name of an ancillary FTS table for the given table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name(
+/*===============*/
+	const fts_table_t*
+			fts_table);	/*!< in: FTS aux table info */
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated WHERE string */
+UNIV_INTERN
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+	dict_index_t*	index,		/*!< in: FTS index */
+	pars_info_t*	info,		/*!< in/out: parser info */
+	mem_heap_t*	heap);		/*!< in: memory heap */
+
+/** define for fts_doc_fetch_by_doc_id() "option" value, defines whether
+we want to get Doc whose ID is equal to or greater or smaller than supplied
+ID */
+#define	FTS_FETCH_DOC_BY_ID_EQUAL	1
+#define	FTS_FETCH_DOC_BY_ID_LARGE	2
+#define	FTS_FETCH_DOC_BY_ID_SMALL	3
+
+/*************************************************************//**
+Fetch document (= a single row's indexed text) with the given
+document id.
+@return: DB_SUCCESS if fetch is successful, else error */
+UNIV_INTERN
+ulint
+fts_doc_fetch_by_doc_id(
+/*====================*/
+	fts_get_doc_t*	get_doc,	/*!< in: state */
+	doc_id_t	doc_id,		/*!< in: id of document to fetch */
+	dict_index_t*	index_to_use,	/*!< in: caller supplied FTS index */
+	ulint		option,         /*!< in: search option, if it is
+                                        greater than doc_id or equal */
+	fts_sql_callback
+			callback,	/*!< in: callback to read
+					records */
+	void*		arg);		/*!< in: callback arg */
+
+/*******************************************************************//**
+Callback function for fetch that stores the text of an FTS document,
+converting each column to UTF-16.
+@return: always returns NULL */
+UNIV_INTERN
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg);	/*!< in: fts_doc_t* */
+/********************************************************************
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+UNIV_INTERN
+ulint
+fts_write_node(
+/*===========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: query graph */
+	fts_table_t*	fts_table,	/*!< in: the FTS aux index */
+	fts_string_t*	word,		/*!< in: word in UTF-8 */
+	fts_node_t*	node);		/*!< in: node columns */
+/*******************************************************************//**
+Tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document(
+/*==================*/
+	fts_doc_t*	doc,		/*!< in/out: document to
+					tokenize */
+	fts_doc_t*	result);	/*!< out: if provided, save
+					result tokens here */
+/*******************************************************************//**
+Continue to tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document_next(
+/*=======================*/
+	fts_doc_t*	doc,		/*!< in/out: document to
+					tokenize */
+	ulint		add_pos,	/*!< in: add this position to all
+					tokens from this tokenization */
+	fts_doc_t*	result);	/*!< out: if provided, save
+					result tokens here */
+/******************************************************************//**
+Create a new empty document.
+@return own: new document */
+UNIV_INTERN
+fts_doc_t*
+fts_doc_init(
+/*=========*/
+	fts_doc_t*	doc);		/*!< in: doc to initialize */
+/******************************************************************//**
+Do a binary search for a doc id in the array
+@return +ve index if found -ve index where it should be
+        inserted if not found */
+UNIV_INTERN
+int
+fts_bsearch(
+/*========*/
+	fts_update_t*	array,		/*!< in: array to sort */
+	int		lower,		/*!< in: lower bound of array*/
+	int		upper,		/*!< in: upper bound of array*/
+	doc_id_t	doc_id);	/*!< in: doc id to lookup */
+/******************************************************************//**
+Free document. */
+UNIV_INTERN
+void
+fts_doc_free(
+/*=========*/
+	fts_doc_t*	doc);		/*!< in: document */
+/******************************************************************//**
+Free fts_optimizer_word_t instanace.*/
+
+void
+fts_word_free(
+/*==========*/
+	fts_word_t*	word);		/*!< in: instance to free.*/
+/******************************************************************//**
+Read the rows from the FTS inde
+@return vector of rows fetched */
+UNIV_INTERN
+ulint
+fts_index_fetch_nodes(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: prepared statement */
+	fts_table_t*	fts_table,	/*!< in: FTS aux table */
+	const fts_string_t*
+			word,		/*!< in: the word to fetch */
+	fts_fetch_t*	fetch);		/*!< in: fetch callback.*/
+/******************************************************************//**
+Create a fts_optimizer_word_t instance.
+@return new instance */
+UNIV_INTERN
+fts_word_t*
+fts_word_init(
+/*==========*/
+	fts_word_t*	word,		/*!< in: word to initialize */
+	byte*		utf8,		/*!< in: UTF-8 string */
+	ulint		len);		/*!< in: length of string in bytes */
+/******************************************************************//**
+Compare two fts_trx_table_t instances, we actually compare the
+table id's here.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+	const void*	v1,		/*!< in: id1 */
+	const void*	v2);		/*!< in: id2 */
+/******************************************************************//**
+Compare a table id with a trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2);		/*!< in: id2 */
+/******************************************************************//**
+Commit a transaction.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+fts_sql_commit(
+/*===========*/
+	trx_t*		trx);		/*!< in: transaction */
+/******************************************************************//**
+Rollback a transaction.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+fts_sql_rollback(
+/*=============*/
+	trx_t*		trx);		/*!< in: transaction */
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id. Don't acquire
+the dict mutex
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql_no_dict_lock(
+/*=======================*/
+	fts_table_t*	fts_table,	/*!< in: table with FTS index */
+	pars_info_t*	info,		/*!< in: parser info */
+	const char*	sql);		/*!< in: SQL string to evaluate */
+/******************************************************************//**
+Get value from config table. The caller must ensure that enough
+space is allocated for value to hold the column contents
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_get_value(
+/*=================*/
+	trx_t*		trx,		/* transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: get config value for
+					this parameter name */
+	fts_string_t*	value);		/*!< out: value read from
+					config table */
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_get_index_value(
+/*=======================*/
+	trx_t*		trx,		/*!< transaction */
+	dict_index_t*	index,		/*!< in: index */
+	const char*	param,		/*!< in: get config value for
+					this parameter name */
+	fts_string_t*	value);		/*!< out: value read from
+					config table */
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_set_value(
+/*=================*/
+	trx_t*		trx,		/*!< transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: get config value for
+					this parameter name */
+	const fts_string_t*
+			value);		/*!< in: value to update */
+/****************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+ulint
+fts_config_set_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value);	/*!< in: value */
+
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_set_index_value(
+/*=======================*/
+	trx_t*		trx,		/*!< transaction */
+	dict_index_t*	index,		/*!< in: index */
+	const char*	param,		/*!< in: get config value for
+					this parameter name */
+	fts_string_t*	value);		/*!< out: value read from
+					config table */
+/******************************************************************//**
+Increment the value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_increment_value(
+/*=======================*/
+	trx_t*		trx,		/*!< transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: increment config value
+					for this parameter name */
+	ulint		delta);		/*!< in: increment by this much */
+/******************************************************************//**
+Increment the per index value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_increment_index_value(
+/*=============================*/
+	trx_t*		trx,		/*!< transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: increment config value
+					for this parameter name */
+	ulint		delta);		/*!< in: increment by this much */
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_get_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value);	/*!< out: value */
+/******************************************************************//**
+Set an ulint value int the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_set_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value);	/*!< in: value */
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_config_get_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value);	/*!< out: value */
+/******************************************************************//**
+Search cache for word.
+@return the word node vector if found else NULL */
+UNIV_INTERN
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+	const fts_index_cache_t*
+			index_cache,	/*!< in: cache to search */
+	const fts_string_t*
+			text);		/*!< in: word to search for */
+/******************************************************************//**
+Check cache for deleted doc id.
+@return TRUE if deleted */
+UNIV_INTERN
+ibool
+fts_cache_is_deleted_doc_id(
+/*========================*/
+	const fts_cache_t*
+			cache,		/*!< in: cache ito search */
+	doc_id_t	doc_id);	/*!< in: doc id to search for */
+/******************************************************************//**
+Append deleted doc ids to vector and sort the vector. */
+UNIV_INTERN
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+	const fts_cache_t*
+			cache,		/*!< in: cache to use */
+	ib_vector_t*	vector);	/*!< in: append to this vector */
+/******************************************************************//**
+Wait for the background thread to start. We poll to detect change
+of state, which is acceptable, since the wait should happen only
+once during startup.
+@return true if the thread started else FALSE (i.e timed out) */
+UNIV_INTERN
+ibool
+fts_wait_for_background_thread_to_start(
+/*====================================*/
+	dict_table_t*	table,		/*!< in: table to which the thread
+					is attached */
+	ulint		max_wait);	/*!< in: time in microseconds, if set
+					to 0 then it disables timeout
+					checking */
+/*********************************************************************//**
+Get the total number of documents in the FTS.
+@return estimated number of rows in the table */
+UNIV_INTERN
+ulint
+fts_get_total_document_count(
+/*=========================*/
+	dict_table_t*	table);		/*!< in: table instance */
+/******************************************************************//**
+Get the total number of words in the FTS for a particular FTS index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fts_get_total_word_count(
+/*=====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: for this index */
+	ulint*		total);		/*!< out: total words */
+/******************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache else NULL */
+UNIV_INTERN
+const fts_index_cache_t*
+fts_find_index_cache(
+/*================*/
+	const fts_cache_t*
+			cache,		/*!< in: cache to search */
+	const dict_index_t*
+			index);		/*!< in: index to search for */
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return	number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+	ib_id_t	id,		/*!< in: a table/index id */
+	char*		str);		/*!< in: buffer to write the id to */
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+	ib_id_t*	id,		/*!< out: a table id */
+	const char*	str);		/*!< in: buffer to read from */
+/******************************************************************//**
+Get the table id.
+@return number of bytes written */
+UNIV_INTERN
+int
+fts_get_table_id(
+/*=============*/
+	const fts_table_t*
+			fts_table,	/*!< in: FTS Auxiliary table */
+	char*		table_id);	/*!< out: table id, must be at least
+					FTS_AUX_MIN_TABLE_ID_LENGTH bytes
+					long */
+/******************************************************************//**
+Add the table to add to the OPTIMIZER's list. */
+UNIV_INTERN
+void
+fts_optimize_add_table(
+/*===================*/
+	dict_table_t*	table);		/*!< in: table to add */
+/******************************************************************//**
+Optimize a table. */
+UNIV_INTERN
+void
+fts_optimize_do_table(
+/*==================*/
+	dict_table_t*	table);		/*!< in: table to optimize */
+/******************************************************************//**
+Construct the prefix name of an FTS table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name_prefix(
+/*======================*/
+	const fts_table_t*
+			fts_table);	/*!< in: Auxiliary table type */
+/******************************************************************//**
+Add node positions. */
+UNIV_INTERN
+void
+fts_cache_node_add_positions(
+/*=========================*/
+	fts_cache_t*	cache,		/*!< in: cache */
+	fts_node_t*	node,		/*!< in: word node */
+	doc_id_t	doc_id,		/*!< in: doc id */
+	ib_vector_t*	positions);	/*!< in: fts_token_t::positions */
+
+/******************************************************************//**
+Create the config table name for retrieving index specific value.
+@return index config parameter name */
+UNIV_INTERN
+char*
+fts_config_create_index_param_name(
+/*===============================*/
+	const char*		param,		/*!< in: base name of param */
+	const dict_index_t*	index);		/*!< in: index for config */
+
+#ifndef UNIV_NONINL
+#include "fts0priv.ic"
+#endif
+
+#endif /* INNOBASE_FTS0PRIV_H */
diff --git a/storage/innobase/include/fts0priv.ic b/storage/innobase/include/fts0priv.ic
new file mode 100644
index 00000000000..716ea4713b5
--- /dev/null
+++ b/storage/innobase/include/fts0priv.ic
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.ic
+Full text search internal header file
+
+Created 2011/11/12 Sunny Bains
+***********************************************************************/
+
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return	number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+	ib_id_t         id,		/* in: a table/index id */
+	char*		str)		/* in: buffer to write the id to */
+{
+#ifdef __WIN__
+# define UINT64PFx	"%016I64u"
+#else
+# define UINT64PFx	"%016"PRIx64
+# endif /* __WIN__ */
+
+        // FIXME: Use ut_snprintf()
+	return(sprintf(str, UINT64PFx, id));
+}
+
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return	TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+	ib_id_t*	id,		/* out: an id */
+	const char*	str)		/* in: buffer to read from */
+{
+	return(sscanf(str, IB_ID_FMT, id) == 2);
+}
+
diff --git a/storage/innobase/include/fts0tlex.h b/storage/innobase/include/fts0tlex.h
new file mode 100644
index 00000000000..c0fed0efa71
--- /dev/null
+++ b/storage/innobase/include/fts0tlex.h
@@ -0,0 +1,349 @@
+#ifndef fts0tHEADER_H
+#define fts0tHEADER_H 1
+#define fts0tIN_HEADER 1
+
+#line 6 "../include/fts0tlex.h"
+
+#line 8 "../include/fts0tlex.h"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif	/* defined (__STDC__) */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	int yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void fts0trestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0t_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0t_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0tpop_buffer_state (yyscan_t yyscanner );
+
+YY_BUFFER_STATE fts0t_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0talloc (yy_size_t ,yyscan_t yyscanner );
+void *fts0trealloc (void *,yy_size_t ,yyscan_t yyscanner );
+void fts0tfree (void * ,yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0twrap(n) 1
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int fts0tlex_init (yyscan_t* scanner);
+
+int fts0tlex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int fts0tlex_destroy (yyscan_t yyscanner );
+
+int fts0tget_debug (yyscan_t yyscanner );
+
+void fts0tset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner );
+
+void fts0tset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0tget_in (yyscan_t yyscanner );
+
+void fts0tset_in  (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0tget_out (yyscan_t yyscanner );
+
+void fts0tset_out  (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0tget_leng (yyscan_t yyscanner );
+
+char *fts0tget_text (yyscan_t yyscanner );
+
+int fts0tget_lineno (yyscan_t yyscanner );
+
+void fts0tset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int fts0twrap (yyscan_t yyscanner );
+#else
+extern int fts0twrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0tlex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0tlex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#line 68 "fts0tlex.l"
+
+
+#line 348 "../include/fts0tlex.h"
+#undef fts0tIN_HEADER
+#endif /* fts0tHEADER_H */
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
new file mode 100644
index 00000000000..5b28f2c9473
--- /dev/null
+++ b/storage/innobase/include/fts0types.h
@@ -0,0 +1,481 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.h
+Full text search types file
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_H
+#define INNOBASE_FTS0TYPES_H
+
+#include "que0types.h"
+#include "ut0byte.h"
+#include "fut0fut.h"
+#include "ut0rbt.h"
+#include "fts0fts.h"
+
+/** Types (aliases) used within FTS. */
+typedef struct fts_que_struct fts_que_t;
+typedef struct fts_node_struct fts_node_t;
+typedef struct fts_word_struct fts_word_t;
+typedef struct fts_fetch_struct fts_fetch_t;
+typedef struct fts_update_struct fts_update_t;
+typedef struct fts_get_doc_struct fts_get_doc_t;
+typedef struct fts_utf8_str_struct fts_utf8_str_t;
+typedef struct fts_doc_stats_struct fts_doc_stats_t;
+typedef struct fts_tokenizer_word_struct fts_tokenizer_word_t;
+typedef struct fts_index_selector_struct fts_index_selector_t;
+
+/** Callbacks used within FTS. */
+typedef pars_user_func_cb_t fts_sql_callback;
+typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
+
+/** Statistics relevant to a particular document, used during retrieval. */
+struct fts_doc_stats_struct {
+	doc_id_t	doc_id;		/*!< Document id */
+	ulint		word_count;	/*!< Total words in the document */
+};
+
+/** It's main purpose is to store the SQL prepared statements that
+are required to retrieve a document from the database. */
+struct fts_get_doc_struct {
+	fts_index_cache_t*
+			index_cache;	/*!< The index cache instance */
+
+					/*!< Parsed sql statement */
+	que_t*		get_document_graph;
+};
+
+/** Since we can have multiple FTS indexes on a table, we keep a
+per index cache of words etc. */
+struct fts_index_cache_struct {
+	dict_index_t*	index;		/*!< The FTS index instance */
+
+	ib_rbt_t*	words;		/*!< Nodes; indexed by fts_string_t*,
+					cells are fts_tokenizer_word_t*.*/
+
+	ib_vector_t*	doc_stats;	/*!< Array of the fts_doc_stats_t
+					contained in the memory buffer.
+					Must be in sorted order (ascending).
+					The  ideal choice is an rb tree but
+					the rb tree imposes a space overhead
+					that we can do without */
+
+	que_t**		ins_graph;	/*!< Insert query graphs */
+
+	que_t**		sel_graph;	/*!< Select query graphs */
+	CHARSET_INFO*	charset;	/*!< charset */
+};
+
+/** For supporting the tracking of updates on multiple FTS indexes we need
+to track which FTS indexes need to be updated. For INSERT and DELETE we
+update all fts indexes. */
+struct fts_update_struct {
+	doc_id_t	doc_id;		/*!< The doc id affected */
+
+	ib_vector_t*	fts_indexes;	/*!< The FTS indexes that need to be
+					updated. A NULL value means all
+					indexes need to be updated.  This
+					vector is not allocated on the heap
+					and so must be freed explicitly,
+					when we are done with it */
+};
+
+/** Stop word control infotmation. */
+struct fts_stopword_struct {
+	ulint		status;		/*!< Status of the stopword tree */
+	ib_alloc_t*	heap;		/*!< The memory allocator to use */
+	ib_rbt_t*	cached_stopword;/*!< This stores all active stopwords */
+	CHARSET_INFO*	charset;	/*!< charset for stopword */
+};
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_struct {
+	trx_t*		trx;		/*!< The transaction used for SYNCing
+					the cache to disk */
+	dict_table_t*	table;		/*!< Table with FTS index(es) */
+	ulint		max_cache_size;	/*!< Max size in bytes of the cache */
+	ibool		cache_full;	/*!< flag, when true it indicates that
+					we need to sync the cache to disk */
+	ulint		lower_index;	/*!< the start index of the doc id
+					vector from where to start adding
+					documents to the FTS cache */
+	ulint		upper_index;	/*!< max index of the doc id vector to
+					add to the FTS cache */
+	ibool		interrupted;	/*!< TRUE if SYNC was interrupted */
+	doc_id_t	min_doc_id;	/*!< The smallest doc id added to the
+					cache. It should equal to
+					doc_ids[lower_index] */
+	doc_id_t	max_doc_id;	/*!< The doc id at which the cache was
+					noted as being full, we use this to
+					set the upper_limit field */
+        ib_time_t	start_time;	/*!< SYNC start time */
+};
+
+typedef struct fts_sync_struct	fts_sync_t;
+
+/** The cache for the FTS system. It is a memory-based inverted index
+that new entries are added to, until it grows over the configured maximum
+size, at which time its contents are written to the INDEX table. */
+struct fts_cache_struct {
+	rw_lock_t	lock;		/*!< lock protecting all access to the
+					memory buffer. FIXME: this needs to
+					be our new upgrade-capable rw-lock */
+
+	rw_lock_t	init_lock;	/*!< lock used for the cache
+					intialization, it has different
+					SYNC level as above cache lock */
+
+	mutex_t		optimize_lock;	/*!< Lock for OPTIMIZE */
+
+	mutex_t		deleted_lock;	/*!< Lock covering deleted_doc_ids */
+
+	mutex_t		doc_id_lock;	/*!< Lock covering Doc ID */
+
+	ib_vector_t*	deleted_doc_ids;/*!< Array of deleted doc ids, each
+					element is of type fts_update_t */
+
+	ib_vector_t*	indexes;	/*!< We store the stats and inverted
+					index for the individual FTS indexes
+					in this vector. Each element is
+					an instance of fts_index_cache_t */
+
+	ib_vector_t*	get_docs;	/*!< information required to read
+					the document from the table. Each
+					element is of type fts_doc_t */
+
+	ulint		total_size;	/*!< total size consumed by the ilist
+					field of all nodes. SYNC is run
+					whenever this gets too big */
+	fts_sync_t*	sync;		/*!< sync structure to sync data to
+					disk */
+	ib_alloc_t*	sync_heap;	/*!< The heap allocator, for indexes
+					and deleted_doc_ids, ie. transient
+					objects, they are recreated after
+					a SYNC is completed */
+
+
+	ib_alloc_t*	self_heap;	/*!< This heap is the heap out of
+					which an instance of the cache itself
+					was created. Objects created using
+					this heap will last for the lifetime
+					of the cache */
+
+	doc_id_t	next_doc_id;	/*!< Next doc id */
+
+	doc_id_t	synced_doc_id;	/*!< Doc ID sync-ed to CONFIG table */
+
+	doc_id_t	first_doc_id;	/*!< first doc id since this table
+					was opened */
+
+	ulint		deleted;	/*!< Number of doc ids deleted since
+					last optimized. This variable is
+					covered by deleted_lock */
+
+	ulint		added;		/*!< Number of doc ids added since last
+					optimized. This variable is covered by
+					the deleted lock */
+
+	fts_stopword_t	stopword_info;	/*!< Cached stopwords for the FTS */
+	mem_heap_t*	cache_heap;	/*!< Cache Heap */
+};
+
+/** Columns of the FTS auxiliary INDEX table */
+struct fts_node_struct {
+	doc_id_t	first_doc_id;	/*!< First document id in ilist. */
+
+	doc_id_t	last_doc_id;	/*!< Last document id in ilist. */
+
+	byte*		ilist;		/*!< Binary list of documents & word
+					positions the token appears in.
+					TODO: For now, these are simply
+					ut_malloc'd, but if testing shows
+					that they waste memory unacceptably, a
+					special memory allocator will have
+					to be written */
+
+	ulint		doc_count;	/*!< Number of doc ids in ilist */
+
+	ulint		ilist_size;	/*!< Used size of ilist in bytes. */
+
+	ulint		ilist_size_alloc;
+					/*!< Allocated size of ilist in
+					bytes */
+};
+
+/** A tokenizer word. Contains information about one word. */
+struct fts_tokenizer_word_struct {
+	fts_string_t	text;		/*!< Token text. */
+
+	ib_vector_t*	nodes;		/*!< Word node ilists, each element is
+					of type fts_node_t */
+};
+
+/** Word text plus it's array of nodes as on disk in FTS index */
+struct fts_word_struct {
+	fts_string_t	text;		/*!< Word value in UTF-8 */
+	ib_vector_t*	nodes;		/*!< Nodes read from disk */
+
+	ib_alloc_t*	heap_alloc;	/*!< For handling all allocations */
+};
+
+/** Callback for reading and filtering nodes that are read from FTS index */
+struct fts_fetch_struct {
+	void*		read_arg;	/*!< Arg for the sql_callback */
+
+	fts_sql_callback
+			read_record;	/*!< Callback for reading index
+					record */
+};
+
+/** For horizontally splitting an FTS auxiliary index */
+struct fts_index_selector_struct {
+	ulint		value;		/*!< Character value at which
+					to split */
+
+	const char*	suffix;		/*!< FTS aux index suffix */
+};
+
+/** This type represents a single document. */
+struct fts_doc_struct {
+	fts_string_t	text;		/*!< document text */
+
+	ibool		found;		/*!< TRUE if the document was found
+					successfully in the database */
+
+	ib_rbt_t*	tokens;		/*!< This is filled when the document
+					is tokenized. Tokens; indexed by
+					fts_string_t*, cells are of type
+					fts_token_t* */
+
+	ib_alloc_t*	self_heap;	/*!< An instance of this type is
+					allocated from this heap along
+					with any objects that have the
+					same lifespan, most notably
+					the vector of token positions */
+	CHARSET_INFO*	charset;	/*!< Document's charset info */
+};
+
+/** A token and its positions within a document. */
+struct fts_token_struct {
+	fts_string_t	text;		/*!< token text */
+
+	ib_vector_t*	positions;	/*!< an array of the positions the
+					token is found in; each item is
+					actually an ulint. */
+};
+
+/** It's defined in fts/fts0fts.c */
+extern const fts_index_selector_t fts_index_selector[];
+
+/******************************************************************//**
+Compare two UTF-8 strings. */
+UNIV_INLINE
+int
+fts_utf8_string_cmp(
+/*================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,			/*!< in: key */
+	const void*	p2);			/*!< in: node */
+
+/******************************************************************//**
+Compare two UTF-8 strings, and return match (0) if
+passed in "key" value equals or is the prefix of the "node" value. */
+UNIV_INLINE
+int
+fts_utf8_string_cmp_prefix(
+/*=======================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,			/*!< in: key */
+	const void*	p2);			/*!< in: node */
+
+/******************************************************************//**
+Compare two fts_trx_row_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2);			/*!< in: id2 */
+
+/******************************************************************//**
+Compare two fts_ranking_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2);			/*!< in: id2 */
+
+/******************************************************************//**
+Compare two fts_update_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_update_doc_id_cmp(
+/*==================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2);			/*!< in: id2 */
+
+/******************************************************************//**
+Decode and return the integer that was encoded using our VLC scheme.*/
+UNIV_INLINE
+ulint
+fts_decode_vlc(
+/*===========*/
+			/*!< out: value decoded */
+	byte**	ptr);	/*!< in: ptr to decode from, this ptr is
+			incremented by the number of bytes decoded */
+
+/******************************************************************//**
+Duplicate an UTF-8 string. */
+UNIV_INLINE
+void
+fts_utf8_string_dup(
+/*================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	fts_string_t*		dst,		/*!< in: dup to here */
+	const fts_string_t*	src,		/*!< in: src string */
+	mem_heap_t*		heap);		/*!< in: heap to use */
+
+/******************************************************************//**
+Return length of val if it were encoded using our VLC scheme. */
+UNIV_INLINE
+ulint
+fts_get_encoded_len(
+/*================*/
+						/*!< out: length of value
+						 encoded, in bytes */
+	ulint		val);			/*!< in: value to encode */
+
+/******************************************************************//**
+Encode an integer using our VLC scheme and return the length in bytes. */
+UNIV_INLINE
+ulint
+fts_encode_int(
+/*===========*/
+						/*!< out: length of value
+						encoded, in bytes */
+	ulint		val,			/*!< in: value to encode */
+	byte*		buf);			/*!< in: buffer, must have
+						enough space */
+
+/******************************************************************//**
+Decode a UTF-8 character.
+
+http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf:
+
+ Scalar Value              1st Byte 2nd Byte 3rd Byte 4th Byte
+00000000 0xxxxxxx          0xxxxxxx
+00000yyy yyxxxxxx          110yyyyy 10xxxxxx
+zzzzyyyy yyxxxxxx          1110zzzz 10yyyyyy 10xxxxxx
+000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
+
+This function decodes UTF-8 sequences up to 6 bytes (31 bits).
+
+On error *ptr will point to the first byte that was not correctly
+decoded. This will hopefully help in resyncing the input. */
+UNIV_INLINE
+ulint
+fts_utf8_decode(
+/*============*/
+						/*!< out: UTF8_ERROR if *ptr
+						did not point to a valid
+						UTF-8 sequence, or the
+						Unicode code point. */
+	const byte**	ptr);			/*!< in/out: pointer to
+						UTF-8 string. The
+						pointer is advanced to
+						the start of the next
+						character. */
+
+/******************************************************************//**
+Lowercase an UTF-8 string. */
+UNIV_INLINE
+void
+fts_utf8_tolower(
+/*=============*/
+	fts_string_t*	str);			/*!< in: string */
+
+/******************************************************************//**
+Get the selected FTS aux INDEX suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+	ulint		selected);		/*!< in: selected index */
+
+/********************************************************************
+Get the number of index selectors. */
+UNIV_INLINE
+ulint
+fts_get_n_selectors(void);
+/*=====================*/
+
+/******************************************************************//**
+Select the FTS auxiliary index for the given string.
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+/*=============*/
+	const CHARSET_INFO*	cs,		/*!< Charset */
+	const byte*		str,		/*!< in: word string */
+	ulint			len);		/*!< in: string length */
+
+/********************************************************************
+Select the next FTS auxiliary index for the given character.
+@return the next index to use for character */
+UNIV_INLINE
+ulint
+fts_select_next_index(
+/*==================*/
+	const CHARSET_INFO*	cs,		/*!< Charset */
+	const byte*		str,		/*!< in: string */
+	ulint			len);		/*!< in: string length */
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+#endif /* INNOBASE_FTS0TYPES_H */
diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic
new file mode 100644
index 00000000000..2734a331a86
--- /dev/null
+++ b/storage/innobase/include/fts0types.ic
@@ -0,0 +1,427 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.ic
+Full text search types.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_IC
+#define INNOBASE_FTS0TYPES_IC
+
+#include <ctype.h>
+
+#include "rem0cmp.h"
+#include "ha_prototypes.h"
+
+extern const ulint UTF8_ERROR;
+
+/* Determine if a UTF-8 continuation byte is valid. */
+#define fts_utf8_is_valid(b) (((b) & 0xC0) == 0x80)
+
+/******************************************************************//**
+Compare two fts_trx_table_t instances.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2  */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2)			/*!< in: id2 */
+{
+	const dict_table_t* table1 = (*(const fts_trx_table_t**) p1)->table;
+	const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table;
+
+	return((table1->id > table2->id)
+	       ? 1
+	       : (table1->id == table2->id)
+		  ? 0
+		  : -1);
+}
+
+/******************************************************************//**
+Compare a table id with a fts_trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2,> 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2)			/*!< in: id2 */
+{
+	const ullint* table_id = (const ullint*) p1;
+	const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table;
+
+	return((*table_id > table2->id)
+	       ? 1
+	       : (*table_id == table2->id)
+		  ? 0
+		  : -1);
+}
+
+/******************************************************************//**
+Duplicate an UTF-8 string.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+void
+fts_utf8_string_dup(
+/*================*/
+	fts_string_t*		dst,		/*!< in: dup to here */
+	const fts_string_t*	src,		/*!< in: src string */
+	mem_heap_t*		heap)		/*!< in: heap to use */
+{
+	dst->f_str = (byte*) mem_heap_dup(heap, src->f_str, src->f_len + 1);
+
+	dst->f_len = src->f_len;
+	dst->f_str[src->f_len] = 0;
+	dst->f_n_char = src->f_n_char;
+}
+
+/******************************************************************//**
+Compare two fts_trx_row_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2)			/*!< in: id2 */
+{
+	const fts_trx_row_t*	tr1 = (const fts_trx_row_t*) p1;
+	const fts_trx_row_t*	tr2 = (const fts_trx_row_t*) p2;
+
+	return((int)(tr1->doc_id - tr2->doc_id));
+}
+
+/******************************************************************//**
+Compare two fts_ranking_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2)			/*!< in: id2 */
+{
+	const fts_ranking_t*	rk1 = (const fts_ranking_t*) p1;
+	const fts_ranking_t*	rk2 = (const fts_ranking_t*) p2;
+
+	return((int)(rk1->doc_id - rk2->doc_id));
+}
+
+/******************************************************************//**
+Compare two fts_update_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_update_doc_id_cmp(
+/*==================*/
+	const void*	p1,			/*!< in: id1 */
+	const void*	p2)			/*!< in: id2 */
+{
+	const fts_update_t*	up1 = (const fts_update_t*) p1;
+	const fts_update_t*	up2 = (const fts_update_t*) p2;
+
+	return((int)(up1->doc_id - up2->doc_id));
+}
+
+
+/******************************************************************//**
+Lowercase an UTF-8 string. */
+UNIV_INLINE
+void
+fts_utf8_tolower(
+/*=============*/
+	fts_string_t*	str)			/*!< in: string */
+{
+	innobase_casedn_str((char*) str->f_str);
+}
+
+/******************************************************************//**
+Compare two UTF-8 strings.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_utf8_string_cmp(
+/*================*/
+	const void*	p1,			/*!< in: key */
+	const void*	p2)			/*!< in: node */
+{
+	const fts_string_t* s1 = (const fts_string_t*) p1;
+	const fts_string_t* s2 = (const fts_string_t*) p2;
+
+	return(cmp_data_data_slow_varchar(
+		s1->f_str, s1->f_len, s2->f_str, s2->f_len));
+}
+
+/******************************************************************//**
+Compare two UTF-8 strings, and return match (0) if
+passed in "key" value equals or is the prefix of the "node" value.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_utf8_string_cmp_prefix(
+/*=======================*/
+	const void*	p1,			/*!< in: key */
+	const void*	p2)			/*!< in: node */
+{
+	int	result;
+	ulint	len;
+
+	const fts_string_t* s1 = (const fts_string_t*) p1;
+	const fts_string_t* s2 = (const fts_string_t*) p2;
+
+	len = ut_min(s1->f_len, s2->f_len);
+
+	result = cmp_data_data_slow_varchar(s1->f_str, len, s2->f_str, len);
+
+	if (result) {
+		return(result);
+	}
+
+	if (s1->f_len > s2->f_len) {
+		return(1);
+	}
+
+	return(0);
+}
+
+/******************************************************************//**
+Decode a UTF-8 character.
+
+http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf:
+
+ Scalar Value              1st Byte 2nd Byte 3rd Byte 4th Byte
+00000000 0xxxxxxx          0xxxxxxx
+00000yyy yyxxxxxx          110yyyyy 10xxxxxx
+zzzzyyyy yyxxxxxx          1110zzzz 10yyyyyy 10xxxxxx
+000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
+
+This function decodes UTF-8 sequences up to 6 bytes (31 bits).
+
+On error *ptr will point to the first byte that was not correctly
+decoded. This will hopefully help in resyncing the input.
+@return UTF8_ERROR if *ptr did not point to a valid
+UTF-8 sequence, or the Unicode code point. */
+UNIV_INLINE
+ulint
+fts_utf8_decode(
+/*============*/
+	const byte**	ptr)			/*!< in/out: pointer to
+						UTF-8 string. The
+						pointer is advanced to
+						the start of the next
+						character. */
+{
+	const byte*	p = *ptr;
+	ulint		ch = *p++;
+#ifdef UNIV_DEBUG
+	ulint		min_ch;
+#endif /* UNIV_DEBUG */
+
+	if (UNIV_LIKELY(ch < 0x80)) {
+		/* 0xxxxxxx */
+	} else if (UNIV_UNLIKELY(ch < 0xC0)) {
+		/* A continuation byte cannot start a code. */
+		goto err_exit;
+	} else if (ch < 0xE0) {
+		/* 110yyyyy 10xxxxxx */
+		ch &= 0x1F;
+		ut_d(min_ch = 0x80);
+		goto get1;
+	} else if (ch < 0xF0) {
+		/* 1110zzzz 10yyyyyy 10xxxxxx */
+		ch &= 0x0F;
+		ut_d(min_ch = 0x800);
+		goto get2;
+	} else if (ch < 0xF8) {
+		/* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
+		ch &= 0x07;
+		ut_d(min_ch = 0x10000);
+		goto get3;
+	} else if (ch < 0xFC) {
+		/* 111110tt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
+		ch &= 0x03;
+		ut_d(min_ch = 0x200000);
+		goto get4;
+	} else if (ch < 0xFE) {
+		/* 1111110s 10tttttt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
+		ut_d(min_ch = 0x4000000);
+		if (!fts_utf8_is_valid(*p)) {
+			goto err_exit;
+		}
+		ch <<= 6;
+		ch |= (*p++) & 0x3F;
+get4:
+		if (!fts_utf8_is_valid(*p)) {
+			goto err_exit;
+		}
+		ch <<= 6;
+		ch |= (*p++) & 0x3F;
+get3:
+		if (!fts_utf8_is_valid(*p)) {
+			goto err_exit;
+		}
+		ch <<= 6;
+		ch |= (*p++) & 0x3F;
+get2:
+		if (!fts_utf8_is_valid(*p)) {
+			goto err_exit;
+		}
+		ch <<= 6;
+		ch |= (*p++) & 0x3F;
+get1:
+		if (!fts_utf8_is_valid(*p)) {
+			goto err_exit;
+		}
+		ch <<= 6;
+		ch |= (*p++) & 0x3F;
+
+		/* The following is needed in the 6-byte case
+		when ulint is wider than 32 bits. */
+		ch &= 0xFFFFFFFF;
+
+		/* The code positions U+D800 to U+DFFF (UTF-16 surrogate pairs)
+		and U+FFFE and U+FFFF cannot occur in valid UTF-8. */
+
+		if ( (ch >= 0xD800 && ch <= 0xDFFF)
+#ifdef UNIV_DEBUG
+		     || ch < min_ch
+#endif /* UNIV_DEBUG */
+		     || ch == 0xFFFE || ch == 0xFFFF) {
+
+			ch = UTF8_ERROR;
+		}
+	} else {
+err_exit:
+		ch = UTF8_ERROR;
+	}
+
+	*ptr = p;
+
+	return(ch);
+}
+
+/******************************************************************//**
+Get the first character's code position for FTS index partition */
+extern
+ulint
+innobase_strnxfrm(
+/*==============*/
+        const CHARSET_INFO*	cs,	/*!< in: Character set */
+        const uchar*		p2,	/*!< in: string */
+        const ulint		len2);	/*!< in: string length */
+
+/******************************************************************//**
+Select the FTS auxiliary index for the given character.
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+/*=============*/
+	const CHARSET_INFO*	cs,	/*!< in: Charset */
+	const byte*		str,	/*!< in: string */
+	ulint			len)	/*!< in: string length */
+{
+	ulint			selected = 0;
+	ulint			value = innobase_strnxfrm(cs, str, len);
+
+	while (fts_index_selector[selected].value != 0) {
+
+		if (fts_index_selector[selected].value == value) {
+
+			return(selected);
+
+		} else if (fts_index_selector[selected].value > value) {
+
+			return(selected > 0 ? selected - 1 : 0);
+		}
+
+		++selected;
+	}
+
+	ut_ad(selected > 1);
+
+	return(selected - 1);
+}
+
+/******************************************************************//**
+Select the next FTS auxiliary index for the given character.
+@return the next index to use for character */
+UNIV_INLINE
+ulint
+fts_select_next_index(
+/*==================*/
+	const CHARSET_INFO*	cs,	/*!< in: Charset */
+	const byte*		str,	/*!< in: string */
+	ulint			len)	/*!< in: string length */
+{
+	ulint		selected = 0;
+	ulint		value = innobase_strnxfrm(cs, str, len);
+
+	while (fts_index_selector[selected].value != 0) {
+
+		if (fts_index_selector[selected].value == value) {
+
+			return(selected + 1);
+
+		} else if (fts_index_selector[selected].value > value) {
+
+			return(selected);
+		}
+
+		++selected;
+	}
+
+	ut_ad(selected > 0);
+
+	return((ulint) selected);
+}
+
+/******************************************************************//**
+Return the selected FTS aux index suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+	ulint		selected)	/*!< in: selected index */
+{
+	return(fts_index_selector[selected].suffix);
+}
+
+/******************************************************************//**
+Get the number of index selectors.
+@return The number of selectors */
+UNIV_INLINE
+ulint
+fts_get_n_selectors(void)
+/*=====================*/
+{
+	ulint	i = 0;
+
+	// FIXME: This is a hack
+	while (fts_index_selector[i].value != 0) {
+		++i;
+	}
+
+	return(i);
+}
+
+#endif /* INNOBASE_FTS0TYPES_IC */
diff --git a/storage/innobase/include/fts0vlc.ic b/storage/innobase/include/fts0vlc.ic
new file mode 100644
index 00000000000..e79bcf59347
--- /dev/null
+++ b/storage/innobase/include/fts0vlc.ic
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0vlc.ic
+Full text variable length integer encoding/decoding.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0VLC_IC
+#define INNOBASE_FTS0VLC_IC
+
+#include "fts0types.h"
+
+/******************************************************************//**
+Return length of val if it were encoded using our VLC scheme.
+FIXME: We will need to be able encode 8 bytes value
+@return length of value encoded, in bytes */
+UNIV_INLINE
+ulint
+fts_get_encoded_len(
+/*================*/
+	ulint	val)	/* in: value to encode */
+{
+	if (val <= 127) {
+		return(1);
+	} else if (val <= 16383) {
+		return(2);
+	} else if (val <= 2097151) {
+		return(3);
+	} else if (val <= 268435455) {
+		return(4);
+	} else {
+		/* Possibly we should care that on 64-bit machines ulint can
+		contain values that we can't encode in 5 bytes, but
+		fts_encode_int doesn't handle them either so it doesn't much
+		matter. */
+
+		return(5);
+	}
+}
+
+/******************************************************************//**
+Encode an integer using our VLC scheme and return the length in bytes.
+@return length of value encoded, in bytes */
+UNIV_INLINE
+ulint
+fts_encode_int(
+/*===========*/
+	ulint	val,	/* in: value to encode */
+	byte*	buf)	/* in: buffer, must have enough space */
+{
+	ulint	len;
+
+	if (val <= 127) {
+		*buf = (byte) val;
+
+		len = 1;
+	} else if (val <= 16383) {
+		*buf++ = (byte)(val >> 7);
+		*buf = (byte)(val & 0x7F);
+
+		len = 2;
+	} else if (val <= 2097151) {
+		*buf++ = (byte)(val >> 14);
+		*buf++ = (byte)((val >> 7) & 0x7F);
+		*buf = (byte)(val & 0x7F);
+
+		len = 3;
+	} else if (val <= 268435455) {
+		*buf++ = (byte)(val >> 21);
+		*buf++ = (byte)((val >> 14) & 0x7F);
+		*buf++ = (byte)((val >> 7) & 0x7F);
+		*buf = (byte)(val & 0x7F);
+
+		len = 4;
+	} else {
+		/* Best to keep the limitations of the 32/64 bit versions
+		identical, at least for the time being. */
+		ut_ad(val <= 4294967295u);
+
+		*buf++ = (byte)(val >> 28);
+		*buf++ = (byte)((val >> 21) & 0x7F);
+		*buf++ = (byte)((val >> 14) & 0x7F);
+		*buf++ = (byte)((val >> 7) & 0x7F);
+		*buf = (byte)(val & 0x7F);
+
+		len = 5;
+	}
+
+	/* High-bit on means "last byte in the encoded integer". */
+	*buf |= 0x80;
+
+	return(len);
+}
+
+/******************************************************************//**
+Decode and return the integer that was encoded using our VLC scheme.
+@return value decoded */
+UNIV_INLINE
+ulint
+fts_decode_vlc(
+/*===========*/
+	byte**	ptr)	/* in: ptr to decode from, this ptr is
+			incremented by the number of bytes decoded */
+{
+	ulint	val = 0;
+
+	for (;;) {
+		byte	b = **ptr;
+
+		++*ptr;
+		val |= (b & 0x7F);
+
+		/* High-bit on means "last byte in the encoded integer". */
+		if (b & 0x80) {
+			break;
+		} else {
+			val <<= 7;
+		}
+	}
+
+	return(val);
+}
+
+#endif
diff --git a/storage/innobase/include/fut0fut.h b/storage/innobase/include/fut0fut.h
index dce20b3bad6..851cdb44cdf 100644
--- a/storage/innobase/include/fut0fut.h
+++ b/storage/innobase/include/fut0fut.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/fut0fut.ic b/storage/innobase/include/fut0fut.ic
index 0b52719a055..b065b10b9ca 100644
--- a/storage/innobase/include/fut0fut.ic
+++ b/storage/innobase/include/fut0fut.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
index fe024c2498f..90f9a65d4fa 100644
--- a/storage/innobase/include/fut0lst.h
+++ b/storage/innobase/include/fut0lst.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/fut0lst.ic b/storage/innobase/include/fut0lst.ic
index dcd13c61871..d18cf21378f 100644
--- a/storage/innobase/include/fut0lst.ic
+++ b/storage/innobase/include/fut0lst.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h
index 83a7394123f..1a2b8dac014 100644
--- a/storage/innobase/include/ha0ha.h
+++ b/storage/innobase/include/ha0ha.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -45,9 +45,10 @@ ha_search_and_get_data(
 	ulint		fold);	/*!< in: folded value of the searched data */
 /*********************************************************//**
 Looks for an element when we know the pointer to the data and updates
-the pointer to data if found. */
+the pointer to data if found.
+@return TRUE if found */
 UNIV_INTERN
-void
+ibool
 ha_search_and_update_if_found_func(
 /*===============================*/
 	hash_table_t*	table,	/*!< in/out: hash table */
@@ -92,8 +93,12 @@ ha_create_func(
 	ulint	mutex_level,	/*!< in: level of the mutexes in the latching
 				order: this is used in the debug version */
 #endif /* UNIV_SYNC_DEBUG */
-	ulint	n_mutexes);	/*!< in: number of mutexes to protect the
+	ulint	n_mutexes,	/*!< in: number of mutexes to protect the
 				hash table: must be a power of 2, or 0 */
+	ulint	type);		/*!< in: type of datastructure for which
+				the memory heap is going to be used e.g.:
+				MEM_HEAP_FOR_BTR_SEARCH or
+				MEM_HEAP_FOR_PAGE_HASH */
 #ifdef UNIV_SYNC_DEBUG
 /** Creates a hash table.
 @return		own: created table
@@ -102,7 +107,7 @@ chosen to be a slightly bigger prime number.
 @param level	in: level of the mutexes in the latching order
 @param n_m	in: number of mutexes to protect the hash table;
 		must be a power of 2, or 0 */
-# define ha_create(n_c,n_m,level) ha_create_func(n_c,level,n_m)
+# define ha_create(n_c,n_m,type,level) ha_create_func(n_c,level,n_m,type)
 #else /* UNIV_SYNC_DEBUG */
 /** Creates a hash table.
 @return		own: created table
@@ -111,10 +116,18 @@ chosen to be a slightly bigger prime number.
 @param level	in: level of the mutexes in the latching order
 @param n_m	in: number of mutexes to protect the hash table;
 		must be a power of 2, or 0 */
-# define ha_create(n_c,n_m,level) ha_create_func(n_c,n_m)
+# define ha_create(n_c,n_m,type,level) ha_create_func(n_c,n_m,type)
 #endif /* UNIV_SYNC_DEBUG */
 
 /*************************************************************//**
+Empties a hash table and frees the memory heaps. */
+UNIV_INTERN
+void
+ha_clear(
+/*=====*/
+	hash_table_t*	table);	/*!< in, own: hash table */
+
+/*************************************************************//**
 Inserts an entry into a hash table. If an entry with the same fold number
 is found, its node is updated to point to the new data, and no new node
 is inserted.
@@ -131,7 +144,7 @@ ha_insert_for_fold_func(
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 	buf_block_t*	block,	/*!< in: buffer block containing the data */
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	rec_t*	data);	/*!< in: data, must not be NULL */
+	const rec_t*	data);	/*!< in: data, must not be NULL */
 
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 /**
@@ -143,7 +156,10 @@ is inserted.
 @param f	in: folded value of data
 @param b	in: buffer block containing the data
 @param d	in: data, must not be NULL */
-# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,b,d)
+# define ha_insert_for_fold(t,f,b,d) 	do {		\
+	ha_insert_for_fold_func(t,f,b,d);		\
+	MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);	\
+} while(0)
 #else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 /**
 Inserts an entry into a hash table. If an entry with the same fold number
@@ -154,7 +170,10 @@ is inserted.
 @param f	in: folded value of data
 @param b	ignored: buffer block containing the data
 @param d	in: data, must not be NULL */
-# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,d)
+# define ha_insert_for_fold(t,f,b,d)	do {		\
+	ha_insert_for_fold_func(t,f,d);			\
+	MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);	\
+} while (0)
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
 
 /*********************************************************//**
@@ -214,20 +233,33 @@ struct ha_node_struct {
 	ulint		fold;	/*!< fold value for the data */
 };
 
-#ifndef UNIV_HOTBACKUP
-/** Assert that the current thread is holding the mutex protecting a
-hash bucket corresponding to a fold value.
-@param table	in: hash table
-@param fold	in: fold value */
-# define ASSERT_HASH_MUTEX_OWN(table, fold)				\
-	ut_ad(!(table)->mutexes || mutex_own(hash_get_mutex(table, fold)))
-#else /* !UNIV_HOTBACKUP */
-/** Assert that the current thread is holding the mutex protecting a
-hash bucket corresponding to a fold value.
-@param table	in: hash table
-@param fold	in: fold value */
-# define ASSERT_HASH_MUTEX_OWN(table, fold) ((void) 0)
-#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Assert that the synchronization object in a hash operation involving
+possible change in the hash table is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held in exclusive mode. */
+UNIV_INLINE
+void
+hash_assert_can_modify(
+/*===================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold value */
+/********************************************************************//**
+Assert that the synchronization object in a hash search operation is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held either in x-mode or s-mode. */
+UNIV_INLINE
+void
+hash_assert_can_search(
+/*===================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold value */
+#else /* UNIV_DEBUG */
+#define hash_assert_can_modify(t, f)
+#define hash_assert_can_search(t, f)
+#endif /* UNIV_DEBUG */
+
 
 #ifndef UNIV_NONINL
 #include "ha0ha.ic"
diff --git a/storage/innobase/include/ha0ha.ic b/storage/innobase/include/ha0ha.ic
index aec28398b5d..91794e8f1fc 100644
--- a/storage/innobase/include/ha0ha.ic
+++ b/storage/innobase/include/ha0ha.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -106,6 +106,56 @@ ha_chain_get_first(
 	       hash_get_nth_cell(table, hash_calc_hash(fold, table))->node);
 }
 
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Assert that the synchronization object in a hash operation involving
+possible change in the hash table is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held in exclusive mode. */
+UNIV_INLINE
+void
+hash_assert_can_modify(
+/*===================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold value */
+{
+	if (table->type == HASH_TABLE_SYNC_MUTEX) {
+		ut_ad(mutex_own(hash_get_mutex(table, fold)));
+	} else if (table->type == HASH_TABLE_SYNC_RW_LOCK) {
+# ifdef UNIV_SYNC_DEBUG
+		rw_lock_t* lock = hash_get_lock(table, fold);
+		ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+# endif
+	} else {
+		ut_ad(table->type == HASH_TABLE_SYNC_NONE);
+	}
+}
+
+/********************************************************************//**
+Assert that the synchronization object in a hash search operation is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held either in x-mode or s-mode. */
+UNIV_INLINE
+void
+hash_assert_can_search(
+/*===================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold value */
+{
+	if (table->type == HASH_TABLE_SYNC_MUTEX) {
+		ut_ad(mutex_own(hash_get_mutex(table, fold)));
+	} else if (table->type == HASH_TABLE_SYNC_RW_LOCK) {
+# ifdef UNIV_SYNC_DEBUG
+		rw_lock_t* lock = hash_get_lock(table, fold);
+		ut_ad(rw_lock_own(lock, RW_LOCK_EX)
+		      || rw_lock_own(lock, RW_LOCK_SHARED));
+# endif
+	} else {
+		ut_ad(table->type == HASH_TABLE_SYNC_NONE);
+	}
+}
+#endif /* UNIV_DEBUG */
+
 /*************************************************************//**
 Looks for an element in a hash table.
 @return pointer to the data of the first hash table node in chain
@@ -119,10 +169,7 @@ ha_search_and_get_data(
 {
 	ha_node_t*	node;
 
-	ASSERT_HASH_MUTEX_OWN(table, fold);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
+	hash_assert_can_search(table, fold);
 	ut_ad(btr_search_enabled);
 
 	node = ha_chain_get_first(table, fold);
@@ -152,7 +199,7 @@ ha_search_with_data(
 {
 	ha_node_t*	node;
 
-	ASSERT_HASH_MUTEX_OWN(table, fold);
+	hash_assert_can_search(table, fold);
 
 	ut_ad(btr_search_enabled);
 
@@ -184,10 +231,7 @@ ha_search_and_delete_if_found(
 {
 	ha_node_t*	node;
 
-	ASSERT_HASH_MUTEX_OWN(table, fold);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
-#endif /* UNIV_SYNC_DEBUG */
+	hash_assert_can_modify(table, fold);
 	ut_ad(btr_search_enabled);
 
 	node = ha_search_with_data(table, fold, data);
diff --git a/storage/innobase/include/ha0storage.h b/storage/innobase/include/ha0storage.h
index c30bd840579..caf42abfcfe 100644
--- a/storage/innobase/include/ha0storage.h
+++ b/storage/innobase/include/ha0storage.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/ha0storage.ic b/storage/innobase/include/ha0storage.ic
index 5acbf82f005..ce6e7406b43 100644
--- a/storage/innobase/include/ha0storage.ic
+++ b/storage/innobase/include/ha0storage.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
index edf7a1a28c1..f2317054c7f 100644
--- a/storage/innobase/include/ha_prototypes.h
+++ b/storage/innobase/include/ha_prototypes.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,9 +27,17 @@ Created 5/11/2006 Osku Salerma
 #ifndef HA_INNODB_PROTOTYPES_H
 #define HA_INNODB_PROTOTYPES_H
 
+#include "my_dbug.h"
+#include "my_compare.h"
+#include "my_sys.h"
+#include "m_string.h"
+
 #include "trx0types.h"
 #include "m_ctype.h" /* CHARSET_INFO */
 
+// Forward declaration
+typedef struct fts_string_struct fts_string_t;
+
 /*********************************************************************//**
 Wrapper around MySQL's copy_and_convert function.
 @return	number of bytes copied to 'to' */
@@ -43,7 +51,8 @@ innobase_convert_string(
 	CHARSET_INFO*	to_cs,		/*!< in: character set to convert to */
 	const void*	from,		/*!< in: string to convert */
 	ulint		from_length,	/*!< in: number of bytes to convert */
-	CHARSET_INFO*	from_cs,	/*!< in: character set to convert from */
+	CHARSET_INFO*	from_cs,	/*!< in: character set to convert
+					from */
 	uint*		errors);	/*!< out: number of errors encountered
 					during the conversion */
 
@@ -136,6 +145,23 @@ innobase_mysql_print_thd(
 	uint	max_query_len);	/*!< in: max query length to print, or 0 to
 				   use the default max length */
 
+/*************************************************************//**
+InnoDB uses this function to compare two data fields for which the data type
+is such that we must use MySQL code to compare them.
+@return	1, 0, -1, if a is greater, equal, less than b, respectively */
+UNIV_INTERN
+int
+innobase_mysql_cmp(
+/*===============*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number,	/*!< in: number of the charset */
+	const unsigned char* a,		/*!< in: data field */
+	unsigned int	a_length,	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+	const unsigned char* b,		/*!< in: data field */
+	unsigned int	b_length)	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+	__attribute__((nonnull, warn_unused_result));
 /**************************************************************//**
 Converts a MySQL type to an InnoDB type. Note that this function returns
 the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
@@ -174,6 +200,17 @@ innobase_strcasecmp(
 	const char*	b);	/*!< in: second string to compare */
 
 /******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively. The
+second string contains wildcards.
+@return 0 if a match is found, 1 if not */
+UNIV_INTERN
+int
+innobase_wildcasecmp(
+/*=================*/
+	const char*	a,	/*!< in: string to compare */
+	const char*	b);	/*!< in: wildcard string to compare */
+
+/******************************************************************//**
 Strip dir name from a full path name and return only its file name.
 @return file name or "null" if no file name */
 UNIV_INTERN
@@ -211,8 +248,8 @@ innobase_convert_from_id(
 	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
 	char*			to,	/*!< out: converted identifier */
 	const char*		from,	/*!< in: identifier to convert */
-	ulint			len);	/*!< in: length of 'to', in bytes; should
-					be at least 3 * strlen(to) + 1 */
+	ulint			len);	/*!< in: length of 'to', in bytes;
+					should be at least 3 * strlen(to) + 1 */
 /******************************************************************//**
 Makes all characters in a NUL-terminated UTF-8 string lower case. */
 UNIV_INTERN
@@ -256,11 +293,20 @@ innobase_get_at_most_n_mbchars(
 	ulint data_len,		/*!< in: length of the string in bytes */
 	const char* str);	/*!< in: character string */
 
+/*************************************************************//**
+InnoDB index push-down condition check
+@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
+UNIV_INTERN
+enum icp_result
+innobase_index_cond(
+/*================*/
+	void*	file)	/*!< in/out: pointer to ha_innobase */
+	__attribute__((nonnull, warn_unused_result));
 /******************************************************************//**
 Returns true if the thread supports XA,
 global value of innodb_supports_xa if thd is NULL.
 @return	true if thd supports XA */
-
+UNIV_INTERN
 ibool
 thd_supports_xa(
 /*============*/
@@ -282,8 +328,18 @@ UNIV_INTERN
 void
 thd_set_lock_wait_time(
 /*===================*/
-        void*   thd,	/*!< in: thread handle (THD*) */
-        ulint   value);	/*!< in: time waited for the lock */
+	void*	thd,	/*!< in: thread handle (THD*) */
+	ulint	value);	/*!< in: time waited for the lock */
+
+/**********************************************************************//**
+Get the current setting of the table_cache_size global parameter. We do
+a dirty read because for one there is no synchronization object and
+secondly there is little harm in doing so even if we get a torn read.
+@return	SQL statement string */
+UNIV_INTERN
+ulint
+innobase_get_table_cache_size(void);
+/*===============================*/
 
 /**********************************************************************//**
 Get the current setting of the lower_case_table_names global parameter from
@@ -296,4 +352,66 @@ ulint
 innobase_get_lower_case_table_names(void);
 /*=====================================*/
 
-#endif
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+UNIV_INTERN
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+	CHARSET_INFO*	charset,	/*!< in: Character set */
+	byte*		start,		/*!< in: start of text */
+	byte*		end,		/*!< in: one character past end of
+					text */
+	fts_string_t*	token,		/*!< out: token's text */
+	ulint*		offset);	/*!< out: offset to token,
+					measured as characters from
+					'start' */
+
+/******************************************************************//**
+compare two character string case insensitively according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*	p1,		/*!< in: key */
+	const void*	p2);		/*!< in: node */
+
+/******************************************************************//**
+compare two character string according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_string_cmp(
+/*====================*/
+	const void*	cs,		/*!< in: Character set */
+	const void*	p1,		/*!< in: key */
+	const void*	p2);		/*!< in: node */
+
+/****************************************************************//**
+Get FTS field charset info from the field's prtype
+@return charset info */
+UNIV_INTERN
+CHARSET_INFO*
+innobase_get_fts_charset(
+/*=====================*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number);/*!< in: number of the charset */
+/******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return	true if the thd is marked as read-only */
+UNIV_INTERN
+ibool
+thd_trx_is_read_only(
+/*=================*/
+	void*	thd);	/*!< in: thread handle (THD*) */
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return	true if the transaction is an auto commit read-only transaction. */
+UNIV_INTERN
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+	void*	thd);	/*!< in: thread handle (THD*) can be NULL */
+#endif /* HA_INNODB_PROTOTYPES_H */
diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h
index 017fe88d533..c5d439ef21b 100644
--- a/storage/innobase/include/handler0alter.h
+++ b/storage/innobase/include/handler0alter.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
index b17c21a45ef..1c19ea53a23 100644
--- a/storage/innobase/include/hash0hash.h
+++ b/storage/innobase/include/hash0hash.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -30,6 +30,7 @@ Created 5/20/1997 Heikki Tuuri
 #include "mem0mem.h"
 #ifndef UNIV_HOTBACKUP
 # include "sync0sync.h"
+# include "sync0rw.h"
 #endif /* !UNIV_HOTBACKUP */
 
 typedef struct hash_table_struct hash_table_t;
@@ -40,6 +41,18 @@ typedef void*	hash_node_t;
 /* Fix Bug #13859: symbol collision between imap/mysql */
 #define hash_create hash0_create
 
+/* Differnt types of hash_table based on the synchronization
+method used for it. */
+enum hash_table_sync_t {
+	HASH_TABLE_SYNC_NONE = 0,	/*!< Don't use any internal
+					synchronization objects for
+					this hash_table. */
+	HASH_TABLE_SYNC_MUTEX,		/*!< Use mutexes to control
+					access to this hash_table. */
+	HASH_TABLE_SYNC_RW_LOCK		/*!< Use rw_locks to control
+					access to this hash_table. */
+};
+
 /*************************************************************//**
 Creates a hash table with >= n array cells. The actual number
 of cells is chosen to be a prime number slightly bigger than n.
@@ -51,21 +64,29 @@ hash_create(
 	ulint	n);	/*!< in: number of array cells */
 #ifndef UNIV_HOTBACKUP
 /*************************************************************//**
-Creates a mutex array to protect a hash table. */
+Creates a sync object array array to protect a hash table.
+::sync_obj can be mutexes or rw_locks depening on the type of
+hash table. */
 UNIV_INTERN
 void
-hash_create_mutexes_func(
-/*=====================*/
-	hash_table_t*	table,		/*!< in: hash table */
+hash_create_sync_obj_func(
+/*======================*/
+	hash_table_t*		table,	/*!< in: hash table */
+	enum hash_table_sync_t	type,	/*!< in: HASH_TABLE_SYNC_MUTEX
+					or HASH_TABLE_SYNC_RW_LOCK */
 #ifdef UNIV_SYNC_DEBUG
-	ulint		sync_level,	/*!< in: latching order level of the
-					mutexes: used in the debug version */
+	ulint			sync_level,/*!< in: latching order level
+					of the mutexes: used in the
+					debug version */
 #endif /* UNIV_SYNC_DEBUG */
-	ulint		n_mutexes);	/*!< in: number of mutexes */
+	ulint			n_sync_obj);/*!< in: number of sync objects,
+					must be a power of 2 */
 #ifdef UNIV_SYNC_DEBUG
-# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,level,n)
+# define hash_create_sync_obj(t, s, n, level)			\
+			hash_create_sync_obj_func(t, s, level, n)
 #else /* UNIV_SYNC_DEBUG */
-# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,n)
+# define hash_create_sync_obj(t, s, n, level)			\
+			hash_create_sync_obj_func(t, s, n)
 #endif /* UNIV_SYNC_DEBUG */
 #endif /* !UNIV_HOTBACKUP */
 
@@ -87,11 +108,12 @@ hash_calc_hash(
 	hash_table_t*	table);	/*!< in: hash table */
 #ifndef UNIV_HOTBACKUP
 /********************************************************************//**
-Assert that the mutex for the table in a hash operation is owned. */
-# define HASH_ASSERT_OWNED(TABLE, FOLD)					\
-ut_ad(!(TABLE)->mutexes || mutex_own(hash_get_mutex(TABLE, FOLD)));
+Assert that the mutex for the table is held */
+# define HASH_ASSERT_OWN(TABLE, FOLD)				\
+	ut_ad((TABLE)->type != HASH_TABLE_SYNC_MUTEX		\
+	      || (mutex_own(hash_get_mutex((TABLE), FOLD))));
 #else /* !UNIV_HOTBACKUP */
-# define HASH_ASSERT_OWNED(TABLE, FOLD)
+# define HASH_ASSERT_OWN(TABLE, FOLD)
 #endif /* !UNIV_HOTBACKUP */
 
 /*******************************************************************//**
@@ -102,7 +124,7 @@ do {\
 	hash_cell_t*	cell3333;\
 	TYPE*		struct3333;\
 \
-	HASH_ASSERT_OWNED(TABLE, FOLD)\
+	HASH_ASSERT_OWN(TABLE, FOLD)\
 \
 	(DATA)->NAME = NULL;\
 \
@@ -124,7 +146,7 @@ do {\
 
 #ifdef UNIV_HASH_DEBUG
 # define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
-# define HASH_INVALIDATE(DATA, NAME) DATA->NAME = (void*) -1
+# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1
 #else
 # define HASH_ASSERT_VALID(DATA) do {} while (0)
 # define HASH_INVALIDATE(DATA, NAME) do {} while (0)
@@ -138,7 +160,7 @@ do {\
 	hash_cell_t*	cell3333;\
 	TYPE*		struct3333;\
 \
-	HASH_ASSERT_OWNED(TABLE, FOLD)\
+	HASH_ASSERT_OWN(TABLE, FOLD)\
 \
 	cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
 \
@@ -175,7 +197,7 @@ Looks for a struct in a hash table. */
 #define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
 {\
 \
-	HASH_ASSERT_OWNED(TABLE, FOLD)\
+	HASH_ASSERT_OWN(TABLE, FOLD)\
 \
 	(DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\
 	HASH_ASSERT_VALID(DATA);\
@@ -259,7 +281,7 @@ do {\
 \
 	HASH_DELETE(TYPE, NAME, TABLE, fold111, NODE);\
 \
-	top_node111 = (TYPE*)mem_heap_get_top(\
+	top_node111 = (TYPE*) mem_heap_get_top(\
 				hash_get_heap(TABLE, fold111),\
 							sizeof(TYPE));\
 \
@@ -284,11 +306,12 @@ do {\
 		} else {\
 			/* We have to look for the predecessor of the top\
 			node */\
-			node111 = cell111->node;\
+			node111 = static_cast<TYPE*>(cell111->node);\
 \
 			while (top_node111 != HASH_GET_NEXT(NAME, node111)) {\
 \
-				node111 = HASH_GET_NEXT(NAME, node111);\
+				node111 = static_cast<TYPE*>(\
+					HASH_GET_NEXT(NAME, node111));\
 			}\
 \
 			/* Now we have the predecessor node */\
@@ -329,12 +352,12 @@ do {\
 } while (0)
 
 /************************************************************//**
-Gets the mutex index for a fold value in a hash table.
-@return	mutex number */
+Gets the sync object index for a fold value in a hash table.
+@return	index */
 UNIV_INLINE
 ulint
-hash_get_mutex_no(
-/*==============*/
+hash_get_sync_obj_index(
+/*====================*/
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		fold);	/*!< in: fold */
 /************************************************************//**
@@ -365,6 +388,15 @@ hash_get_nth_mutex(
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		i);	/*!< in: index of the mutex */
 /************************************************************//**
+Gets the nth rw_lock in a hash table.
+@return	rw_lock */
+UNIV_INLINE
+rw_lock_t*
+hash_get_nth_lock(
+/*==============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		i);	/*!< in: index of the rw_lock */
+/************************************************************//**
 Gets the mutex for a fold value in a hash table.
 @return	mutex */
 UNIV_INLINE
@@ -374,6 +406,15 @@ hash_get_mutex(
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		fold);	/*!< in: fold */
 /************************************************************//**
+Gets the rw_lock for a fold value in a hash table.
+@return	rw_lock */
+UNIV_INLINE
+rw_lock_t*
+hash_get_lock(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
 Reserves the mutex for a fold value in a hash table. */
 UNIV_INTERN
 void
@@ -403,10 +444,84 @@ void
 hash_mutex_exit_all(
 /*================*/
 	hash_table_t*	table);	/*!< in: hash table */
+/************************************************************//**
+Releases all but the passed in mutex of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all_but(
+/*====================*/
+	hash_table_t*	table,		/*!< in: hash table */
+	mutex_t*	keep_mutex);	/*!< in: mutex to keep */
+/************************************************************//**
+s-lock a lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_s(
+/*========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+x-lock a lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_x(
+/*========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+unlock an s-lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_s(
+/*==========*/
+
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+unlock x-lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_x(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+Reserves all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_lock_x_all(
+/*============*/
+	hash_table_t*	table);	/*!< in: hash table */
+/************************************************************//**
+Releases all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_unlock_x_all(
+/*==============*/
+	hash_table_t*	table);	/*!< in: hash table */
+/************************************************************//**
+Releases all but passed in lock of a hash table, */
+UNIV_INTERN
+void
+hash_unlock_x_all_but(
+/*==================*/
+	hash_table_t*	table,		/*!< in: hash table */
+	rw_lock_t*	keep_lock);	/*!< in: lock to keep */
+
 #else /* !UNIV_HOTBACKUP */
 # define hash_get_heap(table, fold)	((table)->heap)
 # define hash_mutex_enter(table, fold)	((void) 0)
 # define hash_mutex_exit(table, fold)	((void) 0)
+# define hash_mutex_enter_all(table)	((void) 0)
+# define hash_mutex_exit_all(table)	((void) 0)
+# define hash_mutex_exit_all_but(t, m)	((void) 0)
+# define hash_lock_s(t, f)		((void) 0)
+# define hash_lock_x(t, f)		((void) 0)
+# define hash_unlock_s(t, f)		((void) 0)
+# define hash_unlock_x(t, f)		((void) 0)
+# define hash_lock_x_all(t)		((void) 0)
+# define hash_unlock_x_all(t)		((void) 0)
+# define hash_unlock_x_all_but(t, l)	((void) 0)
 #endif /* !UNIV_HOTBACKUP */
 
 struct hash_cell_struct{
@@ -415,27 +530,40 @@ struct hash_cell_struct{
 
 /* The hash table structure */
 struct hash_table_struct {
+	enum hash_table_sync_t	type;	/*<! type of hash_table. */
 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 # ifndef UNIV_HOTBACKUP
-	ibool		adaptive;/* TRUE if this is the hash table of the
-				adaptive hash index */
+	ibool			adaptive;/* TRUE if this is the hash
+					table of the adaptive hash
+					index */
 # endif /* !UNIV_HOTBACKUP */
 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
-	ulint		n_cells;/* number of cells in the hash table */
-	hash_cell_t*	array;	/*!< pointer to cell array */
+	ulint			n_cells;/* number of cells in the hash table */
+	hash_cell_t*		array;	/*!< pointer to cell array */
 #ifndef UNIV_HOTBACKUP
-	ulint		n_mutexes;/* if mutexes != NULL, then the number of
-				mutexes, must be a power of 2 */
-	mutex_t*	mutexes;/* NULL, or an array of mutexes used to
-				protect segments of the hash table */
-	mem_heap_t**	heaps;	/*!< if this is non-NULL, hash chain nodes for
-				external chaining can be allocated from these
-				memory heaps; there are then n_mutexes many of
-				these heaps */
+	ulint			n_sync_obj;/* if sync_objs != NULL, then
+					the number of either the number
+					of mutexes or the number of
+					rw_locks depending on the type.
+					Must be a power of 2 */
+	union {
+		mutex_t*	mutexes;/* NULL, or an array of mutexes
+					used to protect segments of the
+					hash table */
+		rw_lock_t*	rw_locks;/* NULL, or an array of rw_lcoks
+					used to protect segments of the
+					hash table */
+	} sync_obj;
+
+	mem_heap_t**		heaps;	/*!< if this is non-NULL, hash
+					chain nodes for external chaining
+					can be allocated from these memory
+					heaps; there are then n_mutexes
+					many of these heaps */
 #endif /* !UNIV_HOTBACKUP */
-	mem_heap_t*	heap;
+	mem_heap_t*		heap;
 #ifdef UNIV_DEBUG
-	ulint		magic_n;
+	ulint			magic_n;
 # define HASH_TABLE_MAGIC_N	76561114
 #endif /* UNIV_DEBUG */
 };
diff --git a/storage/innobase/include/hash0hash.ic b/storage/innobase/include/hash0hash.ic
index 0b437894e2e..1e5474601d5 100644
--- a/storage/innobase/include/hash0hash.ic
+++ b/storage/innobase/include/hash0hash.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -87,20 +87,21 @@ hash_calc_hash(
 
 #ifndef UNIV_HOTBACKUP
 /************************************************************//**
-Gets the mutex index for a fold value in a hash table.
-@return	mutex number */
+Gets the sync object index for a fold value in a hash table.
+@return	index */
 UNIV_INLINE
 ulint
-hash_get_mutex_no(
-/*==============*/
+hash_get_sync_obj_index(
+/*====================*/
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		fold)	/*!< in: fold */
 {
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_ad(ut_is_2pow(table->n_mutexes));
+	ut_ad(table->type != HASH_TABLE_SYNC_NONE);
+	ut_ad(ut_is_2pow(table->n_sync_obj));
 	return(ut_2pow_remainder(hash_calc_hash(fold, table),
-				 table->n_mutexes));
+				 table->n_sync_obj));
 }
 
 /************************************************************//**
@@ -115,7 +116,8 @@ hash_get_nth_heap(
 {
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_ad(i < table->n_mutexes);
+	ut_ad(table->type != HASH_TABLE_SYNC_NONE);
+	ut_ad(i < table->n_sync_obj);
 
 	return(table->heaps[i]);
 }
@@ -139,7 +141,7 @@ hash_get_heap(
 		return(table->heap);
 	}
 
-	i = hash_get_mutex_no(table, fold);
+	i = hash_get_sync_obj_index(table, fold);
 
 	return(hash_get_nth_heap(table, i));
 }
@@ -156,9 +158,10 @@ hash_get_nth_mutex(
 {
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
-	ut_ad(i < table->n_mutexes);
+	ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+	ut_ad(i < table->n_sync_obj);
 
-	return(table->mutexes + i);
+	return(table->sync_obj.mutexes + i);
 }
 
 /************************************************************//**
@@ -176,8 +179,47 @@ hash_get_mutex(
 	ut_ad(table);
 	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
 
-	i = hash_get_mutex_no(table, fold);
+	i = hash_get_sync_obj_index(table, fold);
 
 	return(hash_get_nth_mutex(table, i));
 }
+
+/************************************************************//**
+Gets the nth rw_lock in a hash table.
+@return	rw_lock */
+UNIV_INLINE
+rw_lock_t*
+hash_get_nth_lock(
+/*==============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		i)	/*!< in: index of the rw_lock */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(i < table->n_sync_obj);
+
+	return(table->sync_obj.rw_locks + i);
+}
+
+/************************************************************//**
+Gets the rw_lock for a fold value in a hash table.
+@return	rw_lock */
+UNIV_INLINE
+rw_lock_t*
+hash_get_lock(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+	ulint	i;
+
+	ut_ad(table);
+	ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+	i = hash_get_sync_obj_index(table, fold);
+
+	return(hash_get_nth_lock(table, i));
+}
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
index 28c97fd609f..f405ebf8d11 100644
--- a/storage/innobase/include/ibuf0ibuf.h
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -35,6 +35,10 @@ Created 7/19/1997 Heikki Tuuri
 #ifndef UNIV_HOTBACKUP
 # include "ibuf0types.h"
 
+/** Default value for maximum on-disk size of change buffer in terms
+of percentage of the buffer pool. */
+#define CHANGE_BUFFER_DEFAULT_SIZE	(25)
+
 /* Possible operations buffered in the insert/whatever buffer. See
 ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */
 typedef enum {
@@ -98,6 +102,14 @@ void
 ibuf_init_at_db_start(void);
 /*=======================*/
 /*********************************************************************//**
+Updates the max_size value for ibuf. */
+UNIV_INTERN
+void
+ibuf_max_size_update(
+/*=================*/
+	ulint	new_val);	/*!< in: new value in terms of
+				percentage of the buffer pool size */
+/*********************************************************************//**
 Reads the biggest tablespace id from the high end of the insert buffer
 tree and updates the counter in fil_system. */
 UNIV_INTERN
@@ -376,14 +388,12 @@ will be merged from ibuf trees to the pages read, 0 if ibuf is
 empty */
 UNIV_INTERN
 ulint
-ibuf_contract_for_n_pages(
-/*======================*/
-	ibool	sync,	/*!< in: TRUE if the caller wants to wait for the
-			issued read with the highest tablespace address
-			to complete */
-	ulint	n_pages);/*!< in: try to read at least this many pages to
-			the buffer pool and merge the ibuf contents to
-			them */
+ibuf_contract_in_background(
+/*========================*/
+	ibool	full);	/*!< in: TRUE if the caller wants to do a full
+			contract based on PCT_IO(100). If FALSE then
+			the size of contract batch is determined based
+			on the current size of the ibuf tree. */
 #endif /* !UNIV_HOTBACKUP */
 /*********************************************************************//**
 Parses a redo log record of an ibuf bitmap page init.
@@ -452,6 +462,36 @@ for the file segment from which the pages for the ibuf tree are allocated */
 /* The insert buffer tree itself is always located in space 0. */
 #define IBUF_SPACE_ID		0
 
+/** Insert buffer struct */
+struct ibuf_struct{
+	ulint		size;		/*!< current size of the ibuf index
+					tree, in pages */
+	ulint		max_size;	/*!< recommended maximum size of the
+					ibuf index tree, in pages */
+	ulint		seg_size;	/*!< allocated pages of the file
+					segment containing ibuf header and
+					tree */
+	ibool		empty;		/*!< Protected by the page
+					latch of the root page of the
+					insert buffer tree
+					(FSP_IBUF_TREE_ROOT_PAGE_NO). TRUE
+					if and only if the insert
+					buffer tree is empty. */
+	ulint		free_list_len;	/*!< length of the free list */
+	ulint		height;		/*!< tree height */
+	dict_index_t*	index;		/*!< insert buffer index */
+
+	ulint		n_merges;	/*!< number of pages merged */
+	ulint		n_merged_ops[IBUF_OP_COUNT];
+					/*!< number of operations of each type
+					merged to index pages */
+	ulint		n_discarded_ops[IBUF_OP_COUNT];
+					/*!< number of operations of each type
+					discarded without merging due to the
+					tablespace being deleted or the
+					index being dropped */
+};
+
 #ifndef UNIV_NONINL
 #include "ibuf0ibuf.ic"
 #endif
diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic
index 0a22667a260..8a4ec633b01 100644
--- a/storage/innobase/include/ibuf0ibuf.ic
+++ b/storage/innobase/include/ibuf0ibuf.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -28,9 +28,6 @@ Created 7/19/1997 Heikki Tuuri
 #ifndef UNIV_HOTBACKUP
 #include "buf0lru.h"
 
-/** Counter for ibuf_should_try() */
-extern ulint	ibuf_flush_count;
-
 /** An index page must contain at least UNIV_PAGE_SIZE /
 IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
 buffer inserts to this page.  If there is this much of free space, the
@@ -61,36 +58,6 @@ ibuf_mtr_commit(
 	mtr_commit(mtr);
 }
 
-/** Insert buffer struct */
-struct ibuf_struct{
-	ulint		size;		/*!< current size of the ibuf index
-					tree, in pages */
-	ulint		max_size;	/*!< recommended maximum size of the
-					ibuf index tree, in pages */
-	ulint		seg_size;	/*!< allocated pages of the file
-					segment containing ibuf header and
-					tree */
-	ibool		empty;		/*!< Protected by the page
-					latch of the root page of the
-					insert buffer tree
-					(FSP_IBUF_TREE_ROOT_PAGE_NO). TRUE
-					if and only if the insert
-					buffer tree is empty. */
-	ulint		free_list_len;	/*!< length of the free list */
-	ulint		height;		/*!< tree height */
-	dict_index_t*	index;		/*!< insert buffer index */
-
-	ulint		n_merges;	/*!< number of pages merged */
-	ulint		n_merged_ops[IBUF_OP_COUNT];
-					/*!< number of operations of each type
-					merged to index pages */
-	ulint		n_discarded_ops[IBUF_OP_COUNT];
-					/*!< number of operations of each type
-					discarded without merging due to the
-					tablespace being deleted or the
-					index being dropped */
-};
-
 /************************************************************************//**
 Sets the free bit of the page in the ibuf bitmap. This is done in a separate
 mini-transaction, hence this operation does not restrict further work to only
@@ -127,21 +94,10 @@ ibuf_should_try(
 						a secondary index when we
 						decide */
 {
-	if (ibuf_use != IBUF_USE_NONE
-	    && !dict_index_is_clust(index)
-	    && (ignore_sec_unique || !dict_index_is_unique(index))) {
-
-		ibuf_flush_count++;
-
-		if (ibuf_flush_count % 4 == 0) {
-
-			buf_LRU_try_free_flushed_blocks(NULL);
-		}
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(ibuf_use != IBUF_USE_NONE
+	       && ibuf->max_size != 0
+	       && !dict_index_is_clust(index)
+	       && (ignore_sec_unique || !dict_index_is_unique(index)));
 }
 
 /******************************************************************//**
@@ -174,12 +130,11 @@ ibuf_bitmap_page(
 	ut_ad(ut_is_2pow(zip_size));
 
 	if (!zip_size) {
-		return(UNIV_UNLIKELY((page_no & (UNIV_PAGE_SIZE - 1))
-				     == FSP_IBUF_BITMAP_OFFSET));
+		return((page_no & (UNIV_PAGE_SIZE - 1))
+			== FSP_IBUF_BITMAP_OFFSET);
 	}
 
-	return(UNIV_UNLIKELY((page_no & (zip_size - 1))
-			     == FSP_IBUF_BITMAP_OFFSET));
+	return((page_no & (zip_size - 1)) == FSP_IBUF_BITMAP_OFFSET);
 }
 
 /*********************************************************************//**
@@ -197,7 +152,7 @@ ibuf_index_page_calc_free_bits(
 	ulint	n;
 	ut_ad(ut_is_2pow(zip_size));
 	ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 
 	if (zip_size) {
 		n = max_ins_size
@@ -232,7 +187,7 @@ ibuf_index_page_calc_free_from_bits(
 	ut_ad(bits < 4);
 	ut_ad(ut_is_2pow(zip_size));
 	ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 
 	if (zip_size) {
 		if (bits == 3) {
@@ -274,9 +229,9 @@ ibuf_index_page_calc_free_zip(
 	zip_max_ins = page_zip_max_ins_size(page_zip,
 					    FALSE/* not clustered */);
 
-	if (UNIV_UNLIKELY(zip_max_ins < 0)) {
+	if (zip_max_ins < 0) {
 		return(0);
-	} else if (UNIV_LIKELY(max_ins_size > (ulint) zip_max_ins)) {
+	} else if (max_ins_size > (ulint) zip_max_ins) {
 		max_ins_size = (ulint) zip_max_ins;
 	}
 
diff --git a/storage/innobase/include/ibuf0types.h b/storage/innobase/include/ibuf0types.h
index 55944f879b2..e404b62a011 100644
--- a/storage/innobase/include/ibuf0types.h
+++ b/storage/innobase/include/ibuf0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/lock0iter.h b/storage/innobase/include/lock0iter.h
index 25a57c9740c..42b4f7281e4 100644
--- a/storage/innobase/include/lock0iter.h
+++ b/storage/innobase/include/lock0iter.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 29fdc3bbe97..fa1e6d50224 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -36,13 +36,12 @@ Created 5/7/1996 Heikki Tuuri
 #include "lock0types.h"
 #include "read0types.h"
 #include "hash0hash.h"
+#include "srv0srv.h"
 #include "ut0vec.h"
 
 #ifdef UNIV_DEBUG
 extern ibool	lock_print_waits;
 #endif /* UNIV_DEBUG */
-/* Buffer for storing information about the most recent deadlock error */
-extern FILE*	lock_latest_err_file;
 
 /*********************************************************************//**
 Gets the size of a lock struct.
@@ -65,18 +64,6 @@ void
 lock_sys_close(void);
 /*================*/
 /*********************************************************************//**
-Checks if some transaction has an implicit x-lock on a record in a clustered
-index.
-@return	transaction which has the x-lock, or NULL */
-UNIV_INLINE
-trx_t*
-lock_clust_rec_some_has_impl(
-/*=========================*/
-	const rec_t*		rec,	/*!< in: user record */
-	const dict_index_t*	index,	/*!< in: clustered index */
-	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
-	__attribute__((nonnull, warn_unused_result));
-/*********************************************************************//**
 Gets the heap_no of the smallest user record on a page.
 @return	heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
 UNIV_INLINE
@@ -271,14 +258,15 @@ lock_rec_restore_from_page_infimum(
 					state; lock bits are reset on
 					the infimum */
 /*********************************************************************//**
-Returns TRUE if there are explicit record locks on a page.
-@return	TRUE if there are explicit record locks on the page */
+Determines if there are explicit record locks on a page.
+@return	an explicit record lock on the page, or NULL if there are none */
 UNIV_INTERN
-ibool
+lock_t*
 lock_rec_expl_exist_on_page(
 /*========================*/
 	ulint	space,	/*!< in: space id */
-	ulint	page_no);/*!< in: page number */
+	ulint	page_no)/*!< in: page number */
+	__attribute__((warn_unused_result));
 /*********************************************************************//**
 Checks if locks of other transactions prevent an immediate insert of
 a record. If they do, first tests if the query thread should anyway
@@ -467,7 +455,8 @@ lock_table(
 /*=======*/
 	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is set,
 				does nothing */
-	dict_table_t*	table,	/*!< in: database table in dictionary cache */
+	dict_table_t*	table,	/*!< in/out: database table
+				in dictionary cache */
 	enum lock_mode	mode,	/*!< in: lock mode */
 	que_thr_t*	thr);	/*!< in: query thread */
 /*************************************************************//**
@@ -478,28 +467,20 @@ UNIV_INTERN
 void
 lock_rec_unlock(
 /*============*/
-	trx_t*			trx,	/*!< in: transaction that has
+	trx_t*			trx,	/*!< in/out: transaction that has
 					set a record lock */
 	const buf_block_t*	block,	/*!< in: buffer block containing rec */
 	const rec_t*		rec,	/*!< in: record */
 	enum lock_mode		lock_mode);/*!< in: LOCK_S or LOCK_X */
 /*********************************************************************//**
-Releases transaction locks, and releases possible other transactions waiting
-because of these locks. */
+Releases a transaction's locks, and releases possible other transactions
+waiting because of these locks. Change the state of the transaction to
+TRX_STATE_COMMITTED_IN_MEMORY. */
 UNIV_INTERN
 void
-lock_release_off_kernel(
-/*====================*/
-	trx_t*	trx);	/*!< in: transaction */
-/*********************************************************************//**
-Cancels a waiting lock request and releases possible other transactions
-waiting behind it. */
-UNIV_INTERN
-void
-lock_cancel_waiting_and_release(
-/*============================*/
-	lock_t*	lock);	/*!< in: waiting lock request */
-
+lock_trx_release_locks(
+/*===================*/
+	trx_t*	trx);	/*!< in/out: transaction */
 /*********************************************************************//**
 Removes locks on a table to be dropped or truncated.
 If remove_also_table_sx_locks is TRUE then table-level S and X locks are
@@ -572,8 +553,9 @@ UNIV_INTERN
 ibool
 lock_is_table_exclusive(
 /*====================*/
-	dict_table_t*	table,	/*!< in: table */
-	trx_t*		trx);	/*!< in: transaction */
+	const dict_table_t*	table,	/*!< in: table */
+	const trx_t*		trx)	/*!< in: transaction */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Checks if a lock request lock1 has to wait for request lock2.
 @return	TRUE if lock1 has to wait for lock2 to be removed */
@@ -587,18 +569,17 @@ lock_has_to_wait(
 				on the same record as in lock1 if the
 				locks are record locks */
 /*********************************************************************//**
-Checks that a transaction id is sensible, i.e., not in the future.
-@return	TRUE if ok */
+Reports that a transaction id is insensible, i.e., in the future. */
 UNIV_INTERN
-ibool
-lock_check_trx_id_sanity(
-/*=====================*/
+void
+lock_report_trx_id_insanity(
+/*========================*/
 	trx_id_t	trx_id,		/*!< in: trx id */
 	const rec_t*	rec,		/*!< in: user record */
-	dict_index_t*	index,		/*!< in: clustered index */
+	dict_index_t*	index,		/*!< in: index */
 	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
-	ibool		has_kernel_mutex);/*!< in: TRUE if the caller owns the
-					kernel mutex */
+	trx_id_t	max_trx_id)	/*!< in: trx_sys_get_max_trx_id() */
+	__attribute__((nonnull));
 /*********************************************************************//**
 Prints info of a table lock. */
 UNIV_INTERN
@@ -617,16 +598,19 @@ lock_rec_print(
 	const lock_t*	lock);	/*!< in: record type lock */
 /*********************************************************************//**
 Prints info of locks for all transactions.
-@return FALSE if not able to obtain kernel mutex
-and exits without printing info */
+@return FALSE if not able to obtain lock mutex and exits without
+printing info */
 UNIV_INTERN
 ibool
 lock_print_info_summary(
 /*====================*/
 	FILE*	file,	/*!< in: file where to print */
-	ibool   nowait);/*!< in: whether to wait for the kernel mutex */
-/*************************************************************************
-Prints info of locks for each transaction. */
+	ibool   nowait)	/*!< in: whether to wait for the lock mutex */
+	__attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Prints info of locks for each transaction. This function assumes that the
+caller holds the lock mutex and more importantly it will release the lock
+mutex on behalf of the caller. (This should be fixed in the future). */
 UNIV_INTERN
 void
 lock_print_info_all_transactions(
@@ -635,27 +619,14 @@ lock_print_info_all_transactions(
 /*********************************************************************//**
 Return approximate number or record locks (bits set in the bitmap) for
 this transaction. Since delete-marked records may be removed, the
-record count will not be precise. */
+record count will not be precise.
+The caller must be holding lock_sys->mutex. */
 UNIV_INTERN
 ulint
 lock_number_of_rows_locked(
 /*=======================*/
-	const trx_t*	trx);	/*!< in: transaction */
-/*******************************************************************//**
-Check if a transaction holds any autoinc locks.
-@return TRUE if the transaction holds any AUTOINC locks. */
-UNIV_INTERN
-ibool
-lock_trx_holds_autoinc_locks(
-/*=========================*/
-	const trx_t*	trx);		/*!< in: transaction */
-/*******************************************************************//**
-Release all the transaction's autoinc locks. */
-UNIV_INTERN
-void
-lock_release_autoinc_locks(
-/*=======================*/
-	trx_t*		trx);		/*!< in/out: transaction */
+	const trx_lock_t*	trx_lock)	/*!< in: transaction locks */
+	__attribute__((nonnull, warn_unused_result));
 
 /*******************************************************************//**
 Gets the type of a lock. Non-inline version for using outside of the
@@ -751,7 +722,78 @@ ulint
 lock_rec_get_page_no(
 /*=================*/
 	const lock_t*	lock);	/*!< in: lock */
+/*******************************************************************//**
+Check if there are any locks (table or rec) against table.
+@return	TRUE if locks exist */
+UNIV_INTERN
+ibool
+lock_table_has_locks(
+/*=================*/
+	const dict_table_t*	table);	/*!< in: check if there are any locks
+					held on records in this table or on the
+					table itself */
 
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(lock_wait_timeout_thread)(
+/*=====================================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+
+/********************************************************************//**
+Releases a user OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+UNIV_INTERN
+void
+lock_wait_release_thread_if_suspended(
+/*==================================*/
+	que_thr_t*	thr);	/*!< in: query thread associated with the
+				user OS thread	 */
+
+/***************************************************************//**
+Puts a user OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+UNIV_INTERN
+void
+lock_wait_suspend_thread(
+/*=====================*/
+	que_thr_t*	thr);	/*!< in: query thread associated with the
+				user OS thread */
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+UNIV_INTERN
+void
+lock_unlock_table_autoinc(
+/*======================*/
+	trx_t*	trx);			/*!< in/out: transaction */
+/*********************************************************************//**
+Check whether the transaction has already been rolled back because it
+was selected as a deadlock victim, or if it has to wait then cancel
+the wait lock.
+@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
+UNIV_INTERN
+enum db_err
+lock_trx_handle_wait(
+/*=================*/
+	trx_t*	trx)	/*!< in/out: trx lock state */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Get the number of locks on a table.
+@return number of locks */
+UNIV_INTERN
+ulint
+lock_table_get_n_locks(
+/*===================*/
+	const dict_table_t*	table)	/*!< in: table */
+	__attribute__((nonnull));
 /** Lock modes and types */
 /* @{ */
 #define LOCK_MODE_MASK	0xFUL	/*!< mask used to extract mode from the
@@ -815,12 +857,66 @@ struct lock_op_struct{
 
 /** The lock system struct */
 struct lock_sys_struct{
-	hash_table_t*	rec_hash;	/*!< hash table of the record locks */
+	mutex_t		mutex;			/*!< Mutex protecting the
+						locks */
+	hash_table_t*	rec_hash;		/*!< hash table of the record
+						locks */
+	mutex_t		wait_mutex;		/*!< Mutex protecting the
+						next two fields */
+	srv_slot_t*	waiting_threads;	/*!< Array  of user threads
+						suspended while waiting for
+						locks within InnoDB, protected
+						by the lock_sys->wait_mutex */
+	srv_slot_t*	last_slot;		/*!< highest slot ever used
+						in the waiting_threads array,
+						protected by
+						lock_sys->wait_mutex */
+	ibool		rollback_complete;
+						/*!< TRUE if rollback of all
+						recovered transactions is
+						complete. Protected by
+						lock_sys->mutex */
 };
 
 /** The lock system */
 extern lock_sys_t*	lock_sys;
 
+/** Test if lock_sys->mutex can be acquired without waiting. */
+#define lock_mutex_enter_nowait() mutex_enter_nowait(&lock_sys->mutex)
+
+/** Test if lock_sys->mutex is owned. */
+#define lock_mutex_own() mutex_own(&lock_sys->mutex)
+
+/** Acquire the lock_sys->mutex. */
+#define lock_mutex_enter() do {			\
+	mutex_enter(&lock_sys->mutex);		\
+} while (0)
+
+/** Release the lock_sys->mutex. */
+#define lock_mutex_exit() do {			\
+	mutex_exit(&lock_sys->mutex);		\
+} while (0)
+
+/** Test if lock_sys->wait_mutex is owned. */
+#define lock_wait_mutex_own() mutex_own(&lock_sys->wait_mutex)
+
+/** Acquire the lock_sys->wait_mutex. */
+#define lock_wait_mutex_enter() do {		\
+	mutex_enter(&lock_sys->wait_mutex);	\
+} while (0)
+
+/** Release the lock_sys->wait_mutex. */
+#define lock_wait_mutex_exit() do {		\
+	mutex_exit(&lock_sys->wait_mutex);	\
+} while (0)
+
+// FIXME: Move these to lock_sys_t
+extern	ibool		srv_lock_timeout_active;
+extern	ulint		srv_n_lock_wait_count;
+extern	ulint		srv_n_lock_wait_current_count;
+extern	ib_int64_t	srv_n_lock_wait_time;
+extern	ulint		srv_n_lock_max_wait_time;
+extern	os_event_t	srv_lock_timeout_thread_event;
 
 #ifndef UNIV_NONINL
 #include "lock0lock.ic"
diff --git a/storage/innobase/include/lock0lock.ic b/storage/innobase/include/lock0lock.ic
index 1d740a5fa43..736936954cb 100644
--- a/storage/innobase/include/lock0lock.ic
+++ b/storage/innobase/include/lock0lock.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -68,35 +68,6 @@ lock_rec_hash(
 }
 
 /*********************************************************************//**
-Checks if some transaction has an implicit x-lock on a record in a clustered
-index.
-@return	transaction which has the x-lock, or NULL */
-UNIV_INLINE
-trx_t*
-lock_clust_rec_some_has_impl(
-/*=========================*/
-	const rec_t*		rec,	/*!< in: user record */
-	const dict_index_t*	index,	/*!< in: clustered index */
-	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
-{
-	trx_id_t	trx_id;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(dict_index_is_clust(index));
-	ut_ad(page_rec_is_user_rec(rec));
-
-	trx_id = row_get_rec_trx_id(rec, index, offsets);
-
-	if (trx_is_active(trx_id)) {
-		/* The modifying or inserting transaction is active */
-
-		return(trx_get_on_id(trx_id));
-	}
-
-	return(NULL);
-}
-
-/*********************************************************************//**
 Gets the heap_no of the smallest user record on a page.
 @return	heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
 UNIV_INLINE
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
index 287c151b19f..d516289e1f2 100644
--- a/storage/innobase/include/lock0priv.h
+++ b/storage/innobase/include/lock0priv.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -62,7 +62,7 @@ struct lock_rec_struct {
 					lock struct */
 };
 
-/** Lock struct */
+/** Lock struct; protected by lock_sys->mutex */
 struct lock_struct {
 	trx_t*		trx;		/*!< transaction owning the
 					lock */
@@ -101,6 +101,28 @@ lock_rec_get_prev(
 	const lock_t*	in_lock,/*!< in: record lock */
 	ulint		heap_no);/*!< in: heap number of the record */
 
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+	lock_t*	lock);	/*!< in/out: waiting lock request */
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return	transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+	const rec_t*		rec,	/*!< in: user record */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
+	__attribute__((nonnull, warn_unused_result));
+
 #ifndef UNIV_NONINL
 #include "lock0priv.ic"
 #endif
diff --git a/storage/innobase/include/lock0priv.ic b/storage/innobase/include/lock0priv.ic
index 30447c99848..6b70dc33d3c 100644
--- a/storage/innobase/include/lock0priv.ic
+++ b/storage/innobase/include/lock0priv.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -24,8 +24,8 @@ Created July 16, 2007 Vasil Dimov
 *******************************************************/
 
 /* This file contains only methods which are used in
-lock/lock0* files, other than lock/lock0lock.c.
-I.e. lock/lock0lock.c contains more internal inline
+lock/lock0* files, other than lock/lock0lock.cc.
+I.e. lock/lock0lock.cc contains more internal inline
 methods but they are used only in that file. */
 
 #ifndef LOCK_MODULE_IMPLEMENTATION
@@ -46,4 +46,22 @@ lock_get_type_low(
 	return(lock->type_mode & LOCK_TYPE_MASK);
 }
 
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return	transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+	const rec_t*		rec,	/*!< in: user record */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(page_rec_is_user_rec(rec));
+
+	return(row_get_rec_trx_id(rec, index, offsets));
+}
+
 /* vim: set filetype=c: */
diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h
index 45f29e90fe9..16e6b2e0113 100644
--- a/storage/innobase/include/lock0types.h
+++ b/storage/innobase/include/lock0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -39,7 +39,9 @@ enum lock_mode {
 	LOCK_AUTO_INC,	/* locks the auto-inc counter of a table
 			in an exclusive mode */
 	LOCK_NONE,	/* this is used elsewhere to note consistent read */
-	LOCK_NUM = LOCK_NONE/* number of lock modes */
+	LOCK_NUM = LOCK_NONE, /* number of lock modes */
+	LOCK_NONE_UNSET = 255
 };
 
+
 #endif
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index f2ab6a9898d..6d27d9d4f10 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All rights reserved.
 Copyright (c) 2009, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -41,6 +41,12 @@ Created 12/9/1995 Heikki Tuuri
 #include "sync0rw.h"
 #endif /* !UNIV_HOTBACKUP */
 
+/* Type used for all log sequence number storage and arithmetics */
+typedef	ib_uint64_t		lsn_t;
+#define LSN_MAX			IB_UINT64_MAX
+
+#define LSN_PF			UINT64PF
+
 /** Redo log buffer */
 typedef struct log_struct	log_t;
 /** Redo log group */
@@ -64,17 +70,6 @@ extern	ibool	log_debug_writes;
 /** Maximum number of log groups in log_group_struct::checkpoint_buf */
 #define LOG_MAX_N_GROUPS	32
 
-#ifndef UNIV_HOTBACKUP
-/****************************************************************//**
-Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint,
-so that we know that the limit has been written to a log checkpoint field
-on disk. */
-UNIV_INTERN
-void
-log_fsp_current_free_limit_set_and_checkpoint(
-/*==========================================*/
-	ulint	limit);	/*!< in: limit to set */
-#endif /* !UNIV_HOTBACKUP */
 /*******************************************************************//**
 Calculates where in log files we find a specified lsn.
 @return	log file number */
@@ -98,12 +93,12 @@ Writes to the log the string given. The log must be released with
 log_release.
 @return	end lsn of the log record, zero if did not succeed */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 log_reserve_and_write_fast(
 /*=======================*/
 	const void*	str,	/*!< in: string */
 	ulint		len,	/*!< in: string length */
-	ib_uint64_t*	start_lsn);/*!< out: start lsn of the log record */
+	lsn_t*		start_lsn);/*!< out: start lsn of the log record */
 /***********************************************************************//**
 Releases the log mutex. */
 UNIV_INLINE
@@ -124,7 +119,7 @@ Opens the log for log_write_low. The log must be closed with log_close and
 released with log_release.
 @return	start lsn of the log record */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 log_reserve_and_open(
 /*=================*/
 	ulint	len);	/*!< in: length of data to be catenated */
@@ -141,14 +136,14 @@ log_write_low(
 Closes the log.
 @return	lsn */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 log_close(void);
 /*===========*/
 /************************************************************//**
 Gets the current lsn.
 @return	current lsn */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 log_get_lsn(void);
 /*=============*/
 /****************************************************************
@@ -156,7 +151,7 @@ Gets the log group capacity. It is OK to read the value without
 holding log_sys->mutex because it is constant.
 @return	log group capacity */
 UNIV_INLINE
-ulint
+lsn_t
 log_get_capacity(void);
 /*==================*/
 /******************************************************//**
@@ -173,7 +168,7 @@ log_group_init(
 /*===========*/
 	ulint	id,			/*!< in: group id */
 	ulint	n_files,		/*!< in: number of log files */
-	ulint	file_size,		/*!< in: log file size in bytes */
+	lsn_t	file_size,		/*!< in: log file size in bytes */
 	ulint	space_id,		/*!< in: space id of the file space
 					which contains the log files of this
 					group */
@@ -198,14 +193,13 @@ UNIV_INTERN
 void
 log_write_up_to(
 /*============*/
-	ib_uint64_t	lsn,	/*!< in: log sequence number up to which
-				the log should be written,
-				IB_ULONGLONG_MAX if not specified */
-	ulint		wait,	/*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
-				or LOG_WAIT_ALL_GROUPS */
-	ibool		flush_to_disk);
-				/*!< in: TRUE if we want the written log
-				also to be flushed to disk */
+	lsn_t	lsn,	/*!< in: log sequence number up to which
+			the log should be written, LSN_MAX if not specified */
+	ulint	wait,	/*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+			or LOG_WAIT_ALL_GROUPS */
+	ibool	flush_to_disk);
+			/*!< in: TRUE if we want the written log
+			also to be flushed to disk */
 /****************************************************************//**
 Does a syncronous flush of the log buffer to disk. */
 UNIV_INTERN
@@ -223,20 +217,14 @@ log_buffer_sync_in_background(
 /*==========================*/
 	ibool	flush);	/*<! in: flush the logs to disk */
 /****************************************************************//**
-Advances the smallest lsn for which there are unflushed dirty blocks in the
-buffer pool and also may make a new checkpoint. NOTE: this function may only
-be called if the calling thread owns no synchronization objects!
-@return FALSE if there was a flush batch of the same type running,
-which means that we could not start this flush batch */
-UNIV_INTERN
-ibool
-log_preflush_pool_modified_pages(
-/*=============================*/
-	ib_uint64_t	new_oldest,	/*!< in: try to advance
-					oldest_modified_lsn at least
-					to this lsn */
-	ibool		sync);		/*!< in: TRUE if synchronous
-					operation is desired */
+Checks if an asynchronous flushing of dirty pages is required in the
+background. This function is only called from the page cleaner thread.
+@return lsn to which the flushing should happen or LSN_MAX
+if flushing is not required */
+UNIV_INTERN
+lsn_t
+log_async_flush_lsn(void);
+/*=====================*/
 /******************************************************//**
 Makes a checkpoint. Note that this function does not flush dirty
 blocks from the buffer pool: it only checks what is lsn of the oldest
@@ -261,16 +249,16 @@ UNIV_INTERN
 void
 log_make_checkpoint_at(
 /*===================*/
-	ib_uint64_t	lsn,		/*!< in: make a checkpoint at this or a
-					later lsn, if IB_ULONGLONG_MAX, makes
-					a checkpoint at the latest lsn */
-	ibool		write_always);	/*!< in: the function normally checks if
-					the new checkpoint would have a
-					greater lsn than the previous one: if
-					not, then no physical write is done;
-					by setting this parameter TRUE, a
-					physical write will always be made to
-					log files */
+	lsn_t	lsn,		/*!< in: make a checkpoint at this or a
+				later lsn, if LSN_MAX, makes
+				a checkpoint at the latest lsn */
+	ibool	write_always);	/*!< in: the function normally checks if
+				the new checkpoint would have a
+				greater lsn than the previous one: if
+				not, then no physical write is done;
+				by setting this parameter TRUE, a
+				physical write will always be made to
+				log files */
 /****************************************************************//**
 Makes a checkpoint at the latest lsn and writes it to first page of each
 data file in the database, so that we know that the file spaces contain
@@ -388,8 +376,8 @@ log_group_read_log_seg(
 	ulint		type,		/*!< in: LOG_ARCHIVE or LOG_RECOVER */
 	byte*		buf,		/*!< in: buffer where to read */
 	log_group_t*	group,		/*!< in: log group */
-	ib_uint64_t	start_lsn,	/*!< in: read area start */
-	ib_uint64_t	end_lsn);	/*!< in: read area end */
+	lsn_t		start_lsn,	/*!< in: read area start */
+	lsn_t		end_lsn);	/*!< in: read area end */
 /******************************************************//**
 Writes a buffer to a log file group. */
 UNIV_INTERN
@@ -400,7 +388,7 @@ log_group_write_buf(
 	byte*		buf,		/*!< in: buffer */
 	ulint		len,		/*!< in: buffer len; must be divisible
 					by OS_FILE_LOG_BLOCK_SIZE */
-	ib_uint64_t	start_lsn,	/*!< in: start lsn of the buffer; must
+	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
 					be divisible by
 					OS_FILE_LOG_BLOCK_SIZE */
 	ulint		new_data_offset);/*!< in: start offset of new data in
@@ -416,14 +404,14 @@ void
 log_group_set_fields(
 /*=================*/
 	log_group_t*	group,	/*!< in/out: group */
-	ib_uint64_t	lsn);	/*!< in: lsn for which the values should be
+	lsn_t		lsn);	/*!< in: lsn for which the values should be
 				set */
 /******************************************************//**
 Calculates the data capacity of a log group, when the log file headers are not
 included.
 @return	capacity in bytes */
 UNIV_INTERN
-ulint
+lsn_t
 log_group_get_capacity(
 /*===================*/
 	const log_group_t*	group);	/*!< in: log group */
@@ -515,8 +503,8 @@ UNIV_INLINE
 void
 log_block_init(
 /*===========*/
-	byte*		log_block,	/*!< in: pointer to the log buffer */
-	ib_uint64_t	lsn);		/*!< in: lsn within the log block */
+	byte*	log_block,	/*!< in: pointer to the log buffer */
+	lsn_t	lsn);		/*!< in: lsn within the log block */
 /************************************************************//**
 Initializes a log block in the log buffer in the old, < 3.23.52 format, where
 there was no checksum yet. */
@@ -524,8 +512,8 @@ UNIV_INLINE
 void
 log_block_init_in_old_format(
 /*=========================*/
-	byte*		log_block,	/*!< in: pointer to the log buffer */
-	ib_uint64_t	lsn);		/*!< in: lsn within the log block */
+	byte*	log_block,	/*!< in: pointer to the log buffer */
+	lsn_t	lsn);		/*!< in: lsn within the log block */
 /************************************************************//**
 Converts a lsn to a log block number.
 @return	log block number, it is > 0 and <= 1G */
@@ -533,7 +521,7 @@ UNIV_INLINE
 ulint
 log_block_convert_lsn_to_no(
 /*========================*/
-	ib_uint64_t	lsn);	/*!< in: lsn of a byte within the block */
+	lsn_t	lsn);	/*!< in: lsn of a byte within the block */
 /******************************************************//**
 Prints info of the log. */
 UNIV_INTERN
@@ -548,7 +536,7 @@ UNIV_INTERN
 ibool
 log_peek_lsn(
 /*=========*/
-	ib_uint64_t*	lsn);	/*!< out: if returns TRUE, current lsn is here */
+	lsn_t*	lsn);	/*!< out: if returns TRUE, current lsn is here */
 /**********************************************************************//**
 Refreshes the statistics used to print per-second averages. */
 UNIV_INTERN
@@ -579,7 +567,7 @@ extern log_t*	log_sys;
 #define LOG_RECOVER	98887331
 
 /* The counting of lsn's starts from this value: this must be non-zero */
-#define LOG_START_LSN		((ib_uint64_t) (16 * OS_FILE_LOG_BLOCK_SIZE))
+#define LOG_START_LSN		((lsn_t) (16 * OS_FILE_LOG_BLOCK_SIZE))
 
 #define LOG_BUFFER_SIZE		(srv_log_buffer_size * UNIV_PAGE_SIZE)
 #define LOG_ARCHIVE_BUF_SIZE	(srv_log_buffer_size * UNIV_PAGE_SIZE / 4)
@@ -626,7 +614,7 @@ extern log_t*	log_sys;
 /* Offsets for a checkpoint field */
 #define LOG_CHECKPOINT_NO		0
 #define LOG_CHECKPOINT_LSN		8
-#define LOG_CHECKPOINT_OFFSET		16
+#define LOG_CHECKPOINT_OFFSET_LOW32	16
 #define LOG_CHECKPOINT_LOG_BUF_SIZE	20
 #define	LOG_CHECKPOINT_ARCHIVED_LSN	24
 #define	LOG_CHECKPOINT_GROUP_ARRAY	32
@@ -640,22 +628,38 @@ extern log_t*	log_sys;
 							+ LOG_MAX_N_GROUPS * 8)
 #define LOG_CHECKPOINT_CHECKSUM_1	LOG_CHECKPOINT_ARRAY_END
 #define LOG_CHECKPOINT_CHECKSUM_2	(4 + LOG_CHECKPOINT_ARRAY_END)
+#if 0
 #define LOG_CHECKPOINT_FSP_FREE_LIMIT	(8 + LOG_CHECKPOINT_ARRAY_END)
-					/* current fsp free limit in
+					/*!< Not used (0);
+					This used to contain the
+					current fsp free limit in
 					tablespace 0, in units of one
-					megabyte; this information is only used
-					by ibbackup to decide if it can
-					truncate unused ends of
-					non-auto-extending data files in space
-					0 */
+					megabyte.
+
+					This information might have been used
+					since ibbackup version 0.35 but
+					before 1.41 to decide if unused ends of
+					non-auto-extending data files
+					in space 0 can be truncated.
+
+					This information was made obsolete
+					by ibbackup --compress. */
 #define LOG_CHECKPOINT_FSP_MAGIC_N	(12 + LOG_CHECKPOINT_ARRAY_END)
-					/* this magic number tells if the
+					/*!< Not used (0);
+					This magic number tells if the
 					checkpoint contains the above field:
 					the field was added to
-					InnoDB-3.23.50 */
-#define LOG_CHECKPOINT_SIZE		(16 + LOG_CHECKPOINT_ARRAY_END)
-
+					InnoDB-3.23.50 and
+					removed from MySQL 5.6 */
 #define LOG_CHECKPOINT_FSP_MAGIC_N_VAL	1441231243
+					/*!< if LOG_CHECKPOINT_FSP_MAGIC_N
+					contains this value, then
+					LOG_CHECKPOINT_FSP_FREE_LIMIT
+					is valid */
+#endif
+#define LOG_CHECKPOINT_OFFSET_HIGH32	(16 + LOG_CHECKPOINT_ARRAY_END)
+#define LOG_CHECKPOINT_SIZE		(20 + LOG_CHECKPOINT_ARRAY_END)
+
 
 /* Offsets of a log file header */
 #define LOG_GROUP_ID		0	/* log group number */
@@ -705,15 +709,15 @@ struct log_group_struct{
 	/* The following fields are protected by log_sys->mutex */
 	ulint		id;		/*!< log group id */
 	ulint		n_files;	/*!< number of files in the group */
-	ulint		file_size;	/*!< individual log file size in bytes,
+	lsn_t		file_size;	/*!< individual log file size in bytes,
 					including the log file header */
 	ulint		space_id;	/*!< file space which implements the log
 					group */
 	ulint		state;		/*!< LOG_GROUP_OK or
 					LOG_GROUP_CORRUPTED */
-	ib_uint64_t	lsn;		/*!< lsn used to fix coordinates within
+	lsn_t		lsn;		/*!< lsn used to fix coordinates within
 					the log group */
-	ulint		lsn_offset;	/*!< the offset of the above lsn */
+	lsn_t		lsn_offset;	/*!< the offset of the above lsn */
 	ulint		n_pending_writes;/*!< number of currently pending flush
 					writes for this log group */
 	byte**		file_header_bufs_ptr;/*!< unaligned buffers */
@@ -742,7 +746,7 @@ struct log_group_struct{
 	ulint		next_archived_offset; /*!< like the preceding field */
 #endif /* UNIV_LOG_ARCHIVE */
 	/*-----------------------------*/
-	ib_uint64_t	scanned_lsn;	/*!< used only in recovery: recovery scan
+	lsn_t		scanned_lsn;	/*!< used only in recovery: recovery scan
 					succeeded up to this lsn in this log
 					group */
 	byte*		checkpoint_buf_ptr;/*!< unaligned checkpoint header */
@@ -757,12 +761,11 @@ struct log_struct{
 	byte		pad[64];	/*!< padding to prevent other memory
 					update hotspots from residing on the
 					same memory cache line */
-	ib_uint64_t	lsn;		/*!< log sequence number */
+	lsn_t		lsn;		/*!< log sequence number */
 	ulint		buf_free;	/*!< first free offset within the log
 					buffer */
 #ifndef UNIV_HOTBACKUP
 	mutex_t		mutex;		/*!< mutex protecting the log */
-#endif /* !UNIV_HOTBACKUP */
 
 	mutex_t		log_flush_order_mutex;/*!< mutex to serialize access to
 					the flush list when we are putting
@@ -772,6 +775,7 @@ struct log_struct{
 					mtr_commit and still ensure that
 					insertions in the flush_list happen
 					in the LSN order. */
+#endif /* !UNIV_HOTBACKUP */
 	byte*		buf_ptr;	/* unaligned log buffer */
 	byte*		buf;		/*!< log buffer */
 	ulint		buf_size;	/*!< log buffer size in bytes */
@@ -806,13 +810,13 @@ struct log_struct{
 					later; this is advanced when a flush
 					operation is completed to all the log
 					groups */
-	ib_uint64_t	written_to_some_lsn;
+	lsn_t		written_to_some_lsn;
 					/*!< first log sequence number not yet
 					written to any log group; for this to
 					be advanced, it is enough that the
 					write i/o has been completed for any
 					one log group */
-	ib_uint64_t	written_to_all_lsn;
+	lsn_t		written_to_all_lsn;
 					/*!< first log sequence number not yet
 					written to some log group; for this to
 					be advanced, it is enough that the
@@ -828,16 +832,16 @@ struct log_struct{
 					flushed_to_disk_lsn or
 					write_lsn which are always
 					up-to-date and accurate. */
-	ib_uint64_t	write_lsn;	/*!< end lsn for the current running
+	lsn_t		write_lsn;	/*!< end lsn for the current running
 					write */
 	ulint		write_end_offset;/*!< the data in buffer has
 					been written up to this offset
 					when the current write ends:
 					this field will then be copied
 					to buf_next_to_write */
-	ib_uint64_t	current_flush_lsn;/*!< end lsn for the current running
+	lsn_t		current_flush_lsn;/*!< end lsn for the current running
 					write + flush operation */
-	ib_uint64_t	flushed_to_disk_lsn;
+	lsn_t		flushed_to_disk_lsn;
 					/*!< how far we have written the log
 					AND flushed to disk */
 	ulint		n_pending_writes;/*!< number of currently
@@ -874,42 +878,37 @@ struct log_struct{
 	/* @} */
 
 	/** Fields involved in checkpoints @{ */
-	ulint		log_group_capacity; /*!< capacity of the log group; if
+	lsn_t		log_group_capacity; /*!< capacity of the log group; if
 					the checkpoint age exceeds this, it is
 					a serious error because it is possible
 					we will then overwrite log and spoil
 					crash recovery */
-	ulint		max_modified_age_async;
+	lsn_t		max_modified_age_async;
 					/*!< when this recommended
 					value for lsn -
 					buf_pool_get_oldest_modification()
 					is exceeded, we start an
 					asynchronous preflush of pool pages */
-	ulint		max_modified_age_sync;
+	lsn_t		max_modified_age_sync;
 					/*!< when this recommended
 					value for lsn -
 					buf_pool_get_oldest_modification()
 					is exceeded, we start a
 					synchronous preflush of pool pages */
-	ulint		adm_checkpoint_interval;
-					/*!< administrator-specified checkpoint
-					interval in terms of log growth in
-					bytes; the interval actually used by
-					the database can be smaller */
-	ulint		max_checkpoint_age_async;
+	lsn_t		max_checkpoint_age_async;
 					/*!< when this checkpoint age
 					is exceeded we start an
 					asynchronous writing of a new
 					checkpoint */
-	ulint		max_checkpoint_age;
+	lsn_t		max_checkpoint_age;
 					/*!< this is the maximum allowed value
 					for lsn - last_checkpoint_lsn when a
 					new query step is started */
 	ib_uint64_t	next_checkpoint_no;
 					/*!< next checkpoint number */
-	ib_uint64_t	last_checkpoint_lsn;
+	lsn_t		last_checkpoint_lsn;
 					/*!< latest checkpoint lsn */
-	ib_uint64_t	next_checkpoint_lsn;
+	lsn_t		next_checkpoint_lsn;
 					/*!< next checkpoint lsn */
 	ulint		n_pending_checkpoint_writes;
 					/*!< number of currently pending
@@ -927,16 +926,16 @@ struct log_struct{
 	/** Fields involved in archiving @{ */
 	ulint		archiving_state;/*!< LOG_ARCH_ON, LOG_ARCH_STOPPING
 					LOG_ARCH_STOPPED, LOG_ARCH_OFF */
-	ib_uint64_t	archived_lsn;	/*!< archiving has advanced to this
+	lsn_t		archived_lsn;	/*!< archiving has advanced to this
 					lsn */
-	ulint		max_archived_lsn_age_async;
+	lsn_t		max_archived_lsn_age_async;
 					/*!< recommended maximum age of
 					archived_lsn, before we start
 					asynchronous copying to the archive */
-	ulint		max_archived_lsn_age;
+	lsn_t		max_archived_lsn_age;
 					/*!< maximum allowed age for
 					archived_lsn */
-	ib_uint64_t	next_archived_lsn;/*!< during an archive write,
+	lsn_t		next_archived_lsn;/*!< during an archive write,
 					until the write is completed, we
 					store the next value for
 					archived_lsn here: the write
diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic
index 67db6695cab..5ecd7b85a26 100644
--- a/storage/innobase/include/log0log.ic
+++ b/storage/innobase/include/log0log.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,6 +26,7 @@ Created 12/9/1995 Heikki Tuuri
 #include "os0file.h"
 #include "mach0data.h"
 #include "mtr0mtr.h"
+#include "srv0mon.h"
 
 #ifdef UNIV_LOG_DEBUG
 /******************************************************//**
@@ -192,7 +193,7 @@ UNIV_INLINE
 ulint
 log_block_convert_lsn_to_no(
 /*========================*/
-	ib_uint64_t	lsn)	/*!< in: lsn of a byte within the block */
+	lsn_t	lsn)	/*!< in: lsn of a byte within the block */
 {
 	return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) & 0x3FFFFFFFUL) + 1);
 }
@@ -260,8 +261,8 @@ UNIV_INLINE
 void
 log_block_init(
 /*===========*/
-	byte*		log_block,	/*!< in: pointer to the log buffer */
-	ib_uint64_t	lsn)		/*!< in: lsn within the log block */
+	byte*	log_block,	/*!< in: pointer to the log buffer */
+	lsn_t	lsn)		/*!< in: lsn within the log block */
 {
 	ulint	no;
 
@@ -282,8 +283,8 @@ UNIV_INLINE
 void
 log_block_init_in_old_format(
 /*=========================*/
-	byte*		log_block,	/*!< in: pointer to the log buffer */
-	ib_uint64_t	lsn)		/*!< in: lsn within the log block */
+	byte*	log_block,	/*!< in: pointer to the log buffer */
+	lsn_t	lsn)		/*!< in: lsn within the log block */
 {
 	ulint	no;
 
@@ -304,12 +305,12 @@ Writes to the log the string given. The log must be released with
 log_release.
 @return	end lsn of the log record, zero if did not succeed */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 log_reserve_and_write_fast(
 /*=======================*/
 	const void*	str,	/*!< in: string */
 	ulint		len,	/*!< in: string length */
-	ib_uint64_t*	start_lsn)/*!< out: start lsn of the log record */
+	lsn_t*		start_lsn)/*!< out: start lsn of the log record */
 {
 	ulint		data_len;
 #ifdef UNIV_LOG_LSN_DEBUG
@@ -374,6 +375,9 @@ log_reserve_and_write_fast(
 
 	log_sys->lsn += len;
 
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    log_sys->lsn - log_sys->last_checkpoint_lsn);
+
 #ifdef UNIV_LOG_DEBUG
 	log_check_log_recs(log_sys->buf + log_sys->old_buf_free,
 			   log_sys->buf_free - log_sys->old_buf_free,
@@ -396,11 +400,11 @@ log_release(void)
 Gets the current lsn.
 @return	current lsn */
 UNIV_INLINE
-ib_uint64_t
+lsn_t
 log_get_lsn(void)
 /*=============*/
 {
-	ib_uint64_t	lsn;
+	lsn_t	lsn;
 
 	mutex_enter(&(log_sys->mutex));
 
@@ -416,7 +420,7 @@ Gets the log group capacity. It is OK to read the value without
 holding log_sys->mutex because it is constant.
 @return	log group capacity */
 UNIV_INLINE
-ulint
+lsn_t
 log_get_capacity(void)
 /*==================*/
 {
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
index 9f334a34b44..218298a1698 100644
--- a/storage/innobase/include/log0recv.h
+++ b/storage/innobase/include/log0recv.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -40,19 +40,17 @@ Reads the checkpoint info needed in hot backup.
 @return	TRUE if success */
 UNIV_INTERN
 ibool
-recv_read_cp_info_for_backup(
-/*=========================*/
+recv_read_checkpoint_info_for_backup(
+/*=================================*/
 	const byte*	hdr,	/*!< in: buffer containing the log group
 				header */
-	ib_uint64_t*	lsn,	/*!< out: checkpoint lsn */
-	ulint*		offset,	/*!< out: checkpoint offset in the log group */
-	ulint*		fsp_limit,/*!< out: fsp limit of space 0,
-				1000000000 if the database is running
-				with < version 3.23.50 of InnoDB */
-	ib_uint64_t*	cp_no,	/*!< out: checkpoint number */
-	ib_uint64_t*	first_header_lsn);
+	lsn_t*		lsn,	/*!< out: checkpoint lsn */
+	lsn_t*		offset,	/*!< out: checkpoint offset in the log group */
+	lsn_t*		cp_no,	/*!< out: checkpoint number */
+	lsn_t*		first_header_lsn)
 				/*!< out: lsn of of the start of the
 				first log file */
+	__attribute__((nonnull));
 /*******************************************************************//**
 Scans the log segment and n_bytes_scanned is set to the length of valid
 log scanned. */
@@ -62,7 +60,7 @@ recv_scan_log_seg_for_backup(
 /*=========================*/
 	byte*		buf,		/*!< in: buffer containing log data */
 	ulint		buf_len,	/*!< in: data length in that buffer */
-	ib_uint64_t*	scanned_lsn,	/*!< in/out: lsn of buffer start,
+	lsn_t*		scanned_lsn,	/*!< in/out: lsn of buffer start,
 					we return scanned lsn */
 	ulint*		scanned_checkpoint_no,
 					/*!< in/out: 4 lowest bytes of the
@@ -136,12 +134,12 @@ recv_recovery_from_checkpoint_start_func(
 #ifdef UNIV_LOG_ARCHIVE
 	ulint		type,		/*!< in: LOG_CHECKPOINT or
 					LOG_ARCHIVE */
-	ib_uint64_t	limit_lsn,	/*!< in: recover up to this lsn
+	lsn_t		limit_lsn,	/*!< in: recover up to this lsn
 					if possible */
 #endif /* UNIV_LOG_ARCHIVE */
-	ib_uint64_t	min_flushed_lsn,/*!< in: min flushed lsn from
+	lsn_t		min_flushed_lsn,/*!< in: min flushed lsn from
 					data files */
-	ib_uint64_t	max_flushed_lsn);/*!< in: max flushed lsn from
+	lsn_t		max_flushed_lsn);/*!< in: max flushed lsn from
 					 data files */
 #ifdef UNIV_LOG_ARCHIVE
 /** Wrapper for recv_recovery_from_checkpoint_start_func().
@@ -202,11 +200,11 @@ recv_scan_log_recs(
 	const byte*	buf,		/*!< in: buffer containing a log
 					segment or garbage */
 	ulint		len,		/*!< in: buffer length */
-	ib_uint64_t	start_lsn,	/*!< in: buffer start lsn */
-	ib_uint64_t*	contiguous_lsn,	/*!< in/out: it is known that all log
+	lsn_t		start_lsn,	/*!< in: buffer start lsn */
+	lsn_t*		contiguous_lsn,	/*!< in/out: it is known that all log
 					groups contain contiguous log data up
 					to this lsn */
-	ib_uint64_t*	group_scanned_lsn);/*!< out: scanning succeeded up to
+	lsn_t*		group_scanned_lsn);/*!< out: scanning succeeded up to
 					this lsn */
 /******************************************************//**
 Resets the logs. The contents of log files will be lost! */
@@ -214,7 +212,7 @@ UNIV_INTERN
 void
 recv_reset_logs(
 /*============*/
-	ib_uint64_t	lsn,		/*!< in: reset to this lsn
+	lsn_t		lsn,		/*!< in: reset to this lsn
 					rounded up to be divisible by
 					OS_FILE_LOG_BLOCK_SIZE, after
 					which we add
@@ -235,8 +233,8 @@ recv_reset_log_files_for_backup(
 /*============================*/
 	const char*	log_dir,	/*!< in: log file directory path */
 	ulint		n_log_files,	/*!< in: number of log files */
-	ulint		log_file_size,	/*!< in: log file size */
-	ib_uint64_t	lsn);		/*!< in: new start lsn, must be
+	lsn_t		log_file_size,	/*!< in: log file size */
+	lsn_t		lsn);		/*!< in: new start lsn, must be
 					divisible by OS_FILE_LOG_BLOCK_SIZE */
 #endif /* UNIV_HOTBACKUP */
 /********************************************************//**
@@ -302,9 +300,9 @@ UNIV_INTERN
 ulint
 recv_recovery_from_archive_start(
 /*=============================*/
-	ib_uint64_t	min_flushed_lsn,/*!< in: min flushed lsn field from the
+	lsn_t		min_flushed_lsn,/*!< in: min flushed lsn field from the
 					data files */
-	ib_uint64_t	limit_lsn,	/*!< in: recover up to this lsn if
+	lsn_t		limit_lsn,	/*!< in: recover up to this lsn if
 					possible */
 	ulint		first_log_no);	/*!< in: number of the first archived
 					log file to use in the recovery; the
@@ -337,11 +335,11 @@ struct recv_struct{
 	ulint		len;	/*!< log record body length in bytes */
 	recv_data_t*	data;	/*!< chain of blocks containing the log record
 				body */
-	ib_uint64_t	start_lsn;/*!< start lsn of the log segment written by
+	lsn_t		start_lsn;/*!< start lsn of the log segment written by
 				the mtr which generated this log record: NOTE
 				that this is not necessarily the start lsn of
 				this log record */
-	ib_uint64_t	end_lsn;/*!< end lsn of the log segment written by
+	lsn_t		end_lsn;/*!< end lsn of the log segment written by
 				the mtr which generated this log record: NOTE
 				that this is not necessarily the end lsn of
 				this log record */
@@ -392,7 +390,7 @@ struct recv_sys_struct{
 	ibool		apply_batch_on;
 				/*!< this is TRUE when a log rec application
 				batch is running */
-	ib_uint64_t	lsn;	/*!< log sequence number */
+	lsn_t		lsn;	/*!< log sequence number */
 	ulint		last_log_buf_size;
 				/*!< size of the log buffer when the database
 				last time wrote to the log */
@@ -404,12 +402,12 @@ struct recv_sys_struct{
 				preceding buffer */
 	byte*		buf;	/*!< buffer for parsing log records */
 	ulint		len;	/*!< amount of data in buf */
-	ib_uint64_t	parse_start_lsn;
+	lsn_t		parse_start_lsn;
 				/*!< this is the lsn from which we were able to
 				start parsing log records and adding them to
 				the hash table; zero if a suitable
 				start point not found yet */
-	ib_uint64_t	scanned_lsn;
+	lsn_t		scanned_lsn;
 				/*!< the log data has been scanned up to this
 				lsn */
 	ulint		scanned_checkpoint_no;
@@ -418,10 +416,10 @@ struct recv_sys_struct{
 	ulint		recovered_offset;
 				/*!< start offset of non-parsed log records in
 				buf */
-	ib_uint64_t	recovered_lsn;
+	lsn_t		recovered_lsn;
 				/*!< the log records have been parsed up to
 				this lsn */
-	ib_uint64_t	limit_lsn;/*!< recovery should be made at most
+	lsn_t		limit_lsn;/*!< recovery should be made at most
 				up to this lsn */
 	ibool		found_corrupt_log;
 				/*!< this is set to TRUE if we during log
diff --git a/storage/innobase/include/log0recv.ic b/storage/innobase/include/log0recv.ic
index 0a8e55b96fa..32c28dd03e6 100644
--- a/storage/innobase/include/log0recv.ic
+++ b/storage/innobase/include/log0recv.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -33,7 +33,7 @@ ibool
 recv_recovery_is_on(void)
 /*=====================*/
 {
-	return(UNIV_UNLIKELY(recv_recovery_on));
+	return(recv_recovery_on);
 }
 
 #ifdef UNIV_LOG_ARCHIVE
diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h
index 8434bc73586..3066070ef39 100644
--- a/storage/innobase/include/mach0data.h
+++ b/storage/innobase/include/mach0data.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,6 +27,8 @@ Created 11/28/1995 Heikki Tuuri
 #ifndef mach0data_h
 #define mach0data_h
 
+#ifndef UNIV_INNOCHECKSUM
+
 #include "univ.i"
 #include "ut0byte.h"
 
@@ -204,7 +206,7 @@ UNIV_INLINE
 void
 mach_write_to_8(
 /*============*/
-	byte*		b,	/*!< in: pointer to 8 bytes where to store */
+	void*		b,	/*!< in: pointer to 8 bytes where to store */
 	ib_uint64_t	n);	/*!< in: 64-bit integer to be stored */
 /********************************************************//**
 The following function is used to fetch data from 8 consecutive
@@ -361,19 +363,19 @@ mach_write_to_2_little_endian(
 /*==========================*/
 	byte*	dest,		/*!< in: where to write */
 	ulint	n);		/*!< in: unsigned long int to write */
-
 /*********************************************************//**
 Convert integral type from storage byte order (big endian) to
 host byte order.
 @return	integer value */
 UNIV_INLINE
-ullint
+ib_uint64_t
 mach_read_int_type(
 /*===============*/
 	const byte*	src,		/*!< in: where to read from */
 	ulint		len,		/*!< in: length of src */
 	ibool		unsigned_type);	/*!< in: signed or unsigned flag */
 #endif /* !UNIV_HOTBACKUP */
+#endif /* !UNIV_INNOCHECKSUM */
 
 #ifndef UNIV_NONINL
 #include "mach0data.ic"
diff --git a/storage/innobase/include/mach0data.ic b/storage/innobase/include/mach0data.ic
index b1e5991d39e..ec1a28bca47 100644
--- a/storage/innobase/include/mach0data.ic
+++ b/storage/innobase/include/mach0data.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -24,6 +24,8 @@ to the machine format.
 Created 11/28/1995 Heikki Tuuri
 ***********************************************************************/
 
+#ifndef UNIV_INNOCHECKSUM
+
 #include "ut0mem.h"
 
 /*******************************************************//**
@@ -38,7 +40,7 @@ mach_write_to_1(
 	ut_ad(b);
 	ut_ad((n | 0xFFUL) <= 0xFFUL);
 
-	b[0] = (byte)n;
+	b[0] = (byte) n;
 }
 
 /********************************************************//**
@@ -165,9 +167,11 @@ mach_write_to_4(
 	b[0] = (byte)(n >> 24);
 	b[1] = (byte)(n >> 16);
 	b[2] = (byte)(n >> 8);
-	b[3] = (byte)n;
+	b[3] = (byte) n;
 }
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 /********************************************************//**
 The following function is used to fetch data from 4 consecutive
 bytes. The most significant byte is at the lowest address.
@@ -186,6 +190,8 @@ mach_read_from_4(
 		);
 }
 
+#ifndef UNIV_INNOCHECKSUM
+
 /*********************************************************//**
 Writes a ulint in a compressed form where the first byte codes the
 length of the stored ulint. We look at the most significant bits of
@@ -280,13 +286,13 @@ UNIV_INLINE
 void
 mach_write_to_8(
 /*============*/
-	byte*		b,	/*!< in: pointer to 8 bytes where to store */
+	void*		b,	/*!< in: pointer to 8 bytes where to store */
 	ib_uint64_t	n)	/*!< in: 64-bit integer to be stored */
 {
 	ut_ad(b);
 
-	mach_write_to_4(b, (ulint) (n >> 32));
-	mach_write_to_4(b + 4, (ulint) n);
+	mach_write_to_4(static_cast<byte*>(b), (ulint) (n >> 32));
+	mach_write_to_4(static_cast<byte*>(b) + 4, (ulint) n);
 }
 
 /********************************************************//**
@@ -550,7 +556,7 @@ mach_double_read(
 	ulint	i;
 	byte*	ptr;
 
-	ptr = (byte*)&d;
+	ptr = (byte*) &d;
 
 	for (i = 0; i < sizeof(double); i++) {
 #ifdef WORDS_BIGENDIAN
@@ -575,7 +581,7 @@ mach_double_write(
 	ulint	i;
 	byte*	ptr;
 
-	ptr = (byte*)&d;
+	ptr = (byte*) &d;
 
 	for (i = 0; i < sizeof(double); i++) {
 #ifdef WORDS_BIGENDIAN
@@ -599,7 +605,7 @@ mach_float_read(
 	ulint	i;
 	byte*	ptr;
 
-	ptr = (byte*)&d;
+	ptr = (byte*) &d;
 
 	for (i = 0; i < sizeof(float); i++) {
 #ifdef WORDS_BIGENDIAN
@@ -624,7 +630,7 @@ mach_float_write(
 	ulint	i;
 	byte*	ptr;
 
-	ptr = (byte*)&d;
+	ptr = (byte*) &d;
 
 	for (i = 0; i < sizeof(float); i++) {
 #ifdef WORDS_BIGENDIAN
@@ -648,7 +654,6 @@ mach_read_from_n_little_endian(
 	ulint	n	= 0;
 	const byte*	ptr;
 
-	ut_ad(buf_size <= sizeof(ulint));
 	ut_ad(buf_size > 0);
 
 	ptr = buf + buf_size;
@@ -736,7 +741,7 @@ Convert integral type from storage byte order (big endian) to
 host byte order.
 @return	integer value */
 UNIV_INLINE
-ullint
+ib_uint64_t
 mach_read_int_type(
 /*===============*/
 	const byte*	src,		/*!< in: where to read from */
@@ -772,3 +777,4 @@ mach_read_int_type(
 	return(ret);
 }
 #endif /* !UNIV_HOTBACKUP */
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/mem0dbg.h b/storage/innobase/include/mem0dbg.h
index d81e1418b2b..9f95e84c81e 100644
--- a/storage/innobase/include/mem0dbg.h
+++ b/storage/innobase/include/mem0dbg.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -31,7 +31,7 @@ check fields whose sizes are given below */
 # ifndef UNIV_HOTBACKUP
 /* The mutex which protects in the debug version the hash table
 containing the list of live memory heaps, and also the global
-variables in mem0dbg.c. */
+variables in mem0dbg.cc. */
 extern mutex_t	mem_hash_mutex;
 # endif /* !UNIV_HOTBACKUP */
 
diff --git a/storage/innobase/include/mem0dbg.ic b/storage/innobase/include/mem0dbg.ic
index b0c8178a623..ec60ed35337 100644
--- a/storage/innobase/include/mem0dbg.ic
+++ b/storage/innobase/include/mem0dbg.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h
index 5181bb4c9f7..6851a5bc01b 100644
--- a/storage/innobase/include/mem0mem.h
+++ b/storage/innobase/include/mem0mem.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -62,6 +62,12 @@ buffer pool; the latter method is used for very big heaps */
 					allocation functions can return
 					NULL. */
 
+/* Different type of heaps in terms of which datastructure is using them */
+#define MEM_HEAP_FOR_BTR_SEARCH		(MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER)
+#define MEM_HEAP_FOR_PAGE_HASH		(MEM_HEAP_DYNAMIC)
+#define MEM_HEAP_FOR_RECV_SYS		(MEM_HEAP_BUFFER)
+#define MEM_HEAP_FOR_LOCK_HEAP		(MEM_HEAP_BUFFER)
+
 /* The following start size is used for the first block in the memory heap if
 the size is not specified, i.e., 0 is given as the parameter in the call of
 create. The standard size is the maximum (payload) size of the blocks used for
@@ -99,16 +105,8 @@ heap creation. */
 Use this macro instead of the corresponding function! Macro for memory
 heap creation. */
 
-#define mem_heap_create_in_buffer(N)	mem_heap_create_func(\
-		(N), MEM_HEAP_BUFFER, __FILE__, __LINE__)
-/**************************************************************//**
-Use this macro instead of the corresponding function! Macro for memory
-heap creation. */
-
-#define mem_heap_create_in_btr_search(N)	mem_heap_create_func(\
-		(N), MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER,\
-		__FILE__, __LINE__)
-
+#define mem_heap_create_typed(N, T)	mem_heap_create_func(\
+		(N), (T), __FILE__, __LINE__)
 /**************************************************************//**
 Use this macro instead of the corresponding function! Macro for memory
 heap freeing. */
@@ -221,7 +219,7 @@ mem_heap_get_size(
 Use this macro instead of the corresponding function!
 Macro for memory buffer allocation */
 
-#define mem_zalloc(N)	memset(mem_alloc(N), 0, (N));
+#define mem_zalloc(N)	memset(mem_alloc(N), 0, (N))
 
 #define mem_alloc(N)	mem_alloc_func((N), NULL, __FILE__, __LINE__)
 #define mem_alloc2(N,S)	mem_alloc_func((N), (S), __FILE__, __LINE__)
@@ -320,7 +318,7 @@ mem_heap_dup(
 	ulint		len);	/*!< in: length of data, in bytes */
 
 /****************************************************************//**
-A simple (s)printf replacement that dynamically allocates the space for the
+A simple sprintf replacement that dynamically allocates the space for the
 formatted string from the given heap. This supports a very limited set of
 the printf syntax: types 's' and 'u' and length modifier 'l' (which is
 required for the 'u' type).
diff --git a/storage/innobase/include/mem0mem.ic b/storage/innobase/include/mem0mem.ic
index c70615e1ca9..eee3806dd52 100644
--- a/storage/innobase/include/mem0mem.ic
+++ b/storage/innobase/include/mem0mem.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -193,7 +193,7 @@ mem_heap_alloc(
 
 	free = mem_block_get_free(block);
 
-	buf = (byte*)block + free;
+	buf = (byte*) block + free;
 
 	mem_block_set_free(block, free + MEM_SPACE_NEEDED(n));
 
@@ -202,11 +202,11 @@ mem_heap_alloc(
 		       n + MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE);
 
 	/* In the debug version write debugging info to the field */
-	mem_field_init((byte*)buf, n);
+	mem_field_init((byte*) buf, n);
 
 	/* Advance buf to point at the storage which will be given to the
 	caller */
-	buf = (byte*)buf + MEM_FIELD_HEADER_SIZE;
+	buf = (byte*) buf + MEM_FIELD_HEADER_SIZE;
 
 #endif
 	UNIV_MEM_ALLOC(buf, n);
@@ -229,7 +229,7 @@ mem_heap_get_heap_top(
 
 	block = UT_LIST_GET_LAST(heap->base);
 
-	buf = (byte*)block + mem_block_get_free(block);
+	buf = (byte*) block + mem_block_get_free(block);
 
 	return(buf);
 }
@@ -272,8 +272,8 @@ mem_heap_free_heap_top(
 	block = UT_LIST_GET_LAST(heap->base);
 
 	while (block != NULL) {
-		if (((byte*)block + mem_block_get_free(block) >= old_top)
-		    && ((byte*)block <= old_top)) {
+		if (((byte*) block + mem_block_get_free(block) >= old_top)
+		    && ((byte*) block <= old_top)) {
 			/* Found the right block */
 
 			break;
@@ -292,22 +292,22 @@ mem_heap_free_heap_top(
 	ut_ad(block);
 
 	/* Set the free field of block */
-	mem_block_set_free(block, old_top - (byte*)block);
+	mem_block_set_free(block, old_top - (byte*) block);
 
 #ifdef UNIV_MEM_DEBUG
 	ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
 
 	/* In the debug version erase block from top up */
-	mem_erase_buf(old_top, (byte*)block + block->len - old_top);
+	mem_erase_buf(old_top, (byte*) block + block->len - old_top);
 
 	/* Update allocated memory count */
 	mutex_enter(&mem_hash_mutex);
 	mem_current_allocated_memory -= (total_size - size);
 	mutex_exit(&mem_hash_mutex);
 #else /* UNIV_MEM_DEBUG */
-	UNIV_MEM_ASSERT_W(old_top, (byte*)block + block->len - old_top);
+	UNIV_MEM_ASSERT_W(old_top, (byte*) block + block->len - old_top);
 #endif /* UNIV_MEM_DEBUG */
-	UNIV_MEM_ALLOC(old_top, (byte*)block + block->len - old_top);
+	UNIV_MEM_ALLOC(old_top, (byte*) block + block->len - old_top);
 
 	/* If free == start, we may free the block if it is not the first
 	one */
@@ -326,7 +326,7 @@ mem_heap_empty(
 /*===========*/
 	mem_heap_t*	heap)	/*!< in: heap to empty */
 {
-	mem_heap_free_heap_top(heap, (byte*)heap + mem_block_get_start(heap));
+	mem_heap_free_heap_top(heap, (byte*) heap + mem_block_get_start(heap));
 #ifndef UNIV_HOTBACKUP
 	if (heap->free_block) {
 		mem_heap_free_block_free(heap);
@@ -394,7 +394,7 @@ mem_heap_free_top(
 	ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
 
 	/* In the debug version check the consistency, and erase field */
-	mem_field_erase((byte*)block + mem_block_get_free(block), n);
+	mem_field_erase((byte*) block + mem_block_get_free(block), n);
 #endif
 
 	/* If free == start, we may free the block if it is not the first
@@ -529,7 +529,7 @@ mem_alloc_func(
 	first block and thus we can calculate the pointer to the heap from
 	the pointer to the buffer when we free the memory buffer. */
 
-	if (UNIV_LIKELY_NULL(size)) {
+	if (size) {
 		/* Adjust the allocation to the actual size of the
 		memory block. */
 		ulint	m = mem_block_get_len(heap)
@@ -538,12 +538,13 @@ mem_alloc_func(
 		m -= MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE;
 #endif /* UNIV_MEM_DEBUG */
 		ut_ad(m >= n);
-		*size = n = m;
+		n = m;
+		*size = m;
 	}
 
 	buf = mem_heap_alloc(heap, n);
 
-	ut_a((byte*)heap == (byte*)buf - MEM_BLOCK_HEADER_SIZE
+	ut_a((byte*) heap == (byte*) buf - MEM_BLOCK_HEADER_SIZE
 	     - MEM_FIELD_HEADER_SIZE);
 	return(buf);
 }
@@ -562,7 +563,7 @@ mem_free_func(
 {
 	mem_heap_t*   heap;
 
-	heap = (mem_heap_t*)((byte*)ptr - MEM_BLOCK_HEADER_SIZE
+	heap = (mem_heap_t*)((byte*) ptr - MEM_BLOCK_HEADER_SIZE
 			     - MEM_FIELD_HEADER_SIZE);
 	mem_heap_free_func(heap, file_name, line);
 }
diff --git a/storage/innobase/include/mem0pool.h b/storage/innobase/include/mem0pool.h
index fa8be296ec9..451055e857f 100644
--- a/storage/innobase/include/mem0pool.h
+++ b/storage/innobase/include/mem0pool.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/mem0pool.ic b/storage/innobase/include/mem0pool.ic
index b891dd6dea0..f4bafb8ba63 100644
--- a/storage/innobase/include/mem0pool.ic
+++ b/storage/innobase/include/mem0pool.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
index d271002a5fe..1427a981bef 100644
--- a/storage/innobase/include/mtr0log.h
+++ b/storage/innobase/include/mtr0log.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -168,7 +168,7 @@ mlog_write_initial_log_record_fast(
 	mtr_t*		mtr);	/*!< in: mtr */
 #else /* !UNIV_HOTBACKUP */
 # define mlog_write_initial_log_record(ptr,type,mtr) ((void) 0)
-# define mlog_write_initial_log_record_fast(ptr,type,log_ptr,mtr) ((byte *) 0)
+# define mlog_write_initial_log_record_fast(ptr,type,log_ptr,mtr) ((byte*) 0)
 #endif /* !UNIV_HOTBACKUP */
 /********************************************************//**
 Parses an initial log record written by mlog_write_initial_log_record.
diff --git a/storage/innobase/include/mtr0log.ic b/storage/innobase/include/mtr0log.ic
index 6f871170099..3ed4876eeab 100644
--- a/storage/innobase/include/mtr0log.ic
+++ b/storage/innobase/include/mtr0log.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,6 +26,7 @@ Created 12/7/1995 Heikki Tuuri
 #include "mach0data.h"
 #include "ut0lst.h"
 #include "buf0buf.h"
+#include "buf0dblwr.h"
 #include "fsp0types.h"
 #include "trx0sys.h"
 
@@ -203,7 +204,7 @@ mlog_write_initial_log_record_fast(
 	system tablespace */
 	if (space == TRX_SYS_SPACE
 	    && offset >= FSP_EXTENT_SIZE && offset < 3 * FSP_EXTENT_SIZE) {
-		if (trx_doublewrite_buf_is_being_created) {
+		if (buf_dblwr_being_created) {
 			/* Do nothing: we only come to this branch in an
 			InnoDB database creation. We do not redo log
 			anything for the doublewrite buffer pages. */
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
index 46f1ff9310c..fd84f1119cc 100644
--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -191,6 +191,9 @@ functions).  The page number parameter was originally written as 0. @{ */
 					MLOG_FILE_CREATE, MLOG_FILE_CREATE2 */
 /* @} */
 
+/* included here because it needs MLOG_LSN defined */
+#include "log0log.h"
+
 /***************************************************************//**
 Starts a mini-transaction. */
 UNIV_INLINE
@@ -355,7 +358,6 @@ mtr_memo_push(
 	void*	object,	/*!< in: object */
 	ulint	type);	/*!< in: object type: MTR_MEMO_S_LOCK, ... */
 
-
 /* Type definition of a mini-transaction memo stack slot. */
 typedef	struct mtr_memo_slot_struct	mtr_memo_slot_t;
 struct mtr_memo_slot_struct{
@@ -370,11 +372,14 @@ struct mtr_struct{
 #endif
 	dyn_array_t	memo;	/*!< memo stack for locks etc. */
 	dyn_array_t	log;	/*!< mini-transaction log */
-	ibool		inside_ibuf;
+	unsigned	inside_ibuf:1;
 				/*!< TRUE if inside ibuf changes */
-	ibool		modifications;
-				/* TRUE if the mtr made modifications to
-				buffer pool pages */
+	unsigned	modifications:1;
+				/*!< TRUE if the mini-transaction
+				modified buffer pool pages */
+	unsigned	made_dirty:1;
+				/*!< TRUE if mtr has made at least
+				one buffer pool page dirty */
 	ulint		n_log_recs;
 				/* count of how many page initial log records
 				have been written to the mtr log */
@@ -383,9 +388,9 @@ struct mtr_struct{
 				this mini-transaction */
 	ulint		log_mode; /* specifies which operations should be
 				logged; default value MTR_LOG_ALL */
-	ib_uint64_t	start_lsn;/* start lsn of the possible log entry for
+	lsn_t		start_lsn;/* start lsn of the possible log entry for
 				this mtr */
-	ib_uint64_t	end_lsn;/* end lsn of the possible log entry for
+	lsn_t		end_lsn;/* end lsn of the possible log entry for
 				this mtr */
 #ifdef UNIV_DEBUG
 	ulint		magic_n;
diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic
index a03a0271535..dcd9826b380 100644
--- a/storage/innobase/include/mtr0mtr.ic
+++ b/storage/innobase/include/mtr0mtr.ic
@@ -29,6 +29,16 @@ Created 11/26/1995 Heikki Tuuri
 #endif /* !UNIV_HOTBACKUP */
 #include "mach0data.h"
 
+/***************************************************//**
+Checks if a mini-transaction is dirtying a clean page.
+@return TRUE if the mtr is dirtying a clean page. */
+UNIV_INTERN
+ibool
+mtr_block_dirtied(
+/*==============*/
+	const buf_block_t*	block)	/*!< in: block being x-fixed */
+	__attribute__((nonnull,warn_unused_result));
+
 /***************************************************************//**
 Starts a mini-transaction. */
 UNIV_INLINE
@@ -43,8 +53,9 @@ mtr_start(
 	dyn_array_create(&(mtr->log));
 
 	mtr->log_mode = MTR_LOG_ALL;
-	mtr->modifications = FALSE;
 	mtr->inside_ibuf = FALSE;
+	mtr->modifications = FALSE;
+	mtr->made_dirty = FALSE;
 	mtr->n_log_recs = 0;
 	mtr->n_freed_pages = 0;
 
@@ -72,6 +83,15 @@ mtr_memo_push(
 	ut_ad(mtr->magic_n == MTR_MAGIC_N);
 	ut_ad(mtr->state == MTR_ACTIVE);
 
+	/* If this mtr has x-fixed a clean page then we set
+	the made_dirty flag. This tells us if we need to
+	grab log_flush_order_mutex at mtr_commit so that we
+	can insert the dirtied page to the flush list. */
+	if (type == MTR_MEMO_PAGE_X_FIX && !mtr->made_dirty) {
+		mtr->made_dirty =
+			mtr_block_dirtied((const buf_block_t*) object);
+	}
+
 	memo = &(mtr->memo);
 
 	slot = (mtr_memo_slot_t*) dyn_array_push(memo, sizeof *slot);
@@ -249,7 +269,7 @@ mtr_s_lock_func(
 	ut_ad(mtr);
 	ut_ad(lock);
 
-	rw_lock_s_lock_inline(lock, 0, file, line);
+	rw_lock_s_lock_func(lock, 0, file, line);
 
 	mtr_memo_push(mtr, lock, MTR_MEMO_S_LOCK);
 }
@@ -268,7 +288,7 @@ mtr_x_lock_func(
 	ut_ad(mtr);
 	ut_ad(lock);
 
-	rw_lock_x_lock_inline(lock, 0, file, line);
+	rw_lock_x_lock_func(lock, 0, file, line);
 
 	mtr_memo_push(mtr, lock, MTR_MEMO_X_LOCK);
 }
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
index 83a7aaf3839..7a2bcefadb9 100644
--- a/storage/innobase/include/mtr0types.h
+++ b/storage/innobase/include/mtr0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
index fb13120a481..8f84193cb0f 100644
--- a/storage/innobase/include/os0file.h
+++ b/storage/innobase/include/os0file.h
@@ -1,6 +1,6 @@
 /***********************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2009, Percona Inc.
 
 Portions of this file contain modifications contributed and copyrighted
@@ -19,9 +19,9 @@ WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 Public License for more details.
 
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 ***********************************************************************/
 
@@ -46,9 +46,6 @@ Created 10/21/1995 Heikki Tuuri
 /** File node of a tablespace or the log data space */
 typedef	struct fil_node_struct	fil_node_t;
 
-#ifdef UNIV_DO_FLUSH
-extern ibool	os_do_not_call_flush_at_each_write;
-#endif /* UNIV_DO_FLUSH */
 extern ibool	os_has_said_disk_full;
 /** Flag: enable debug printout for asynchronous i/o */
 extern ibool	os_aio_print_debug;
@@ -74,6 +71,8 @@ extern ulint	os_n_pending_writes;
 
 #endif
 
+/** File offset in bytes */
+typedef ib_uint64_t os_offset_t;
 #ifdef __WIN__
 /** File handle */
 # define os_file_t	HANDLE
@@ -102,14 +101,28 @@ log. */
 
 #define OS_FILE_LOG_BLOCK_SIZE		512
 
-/** Options for file_create @{ */
-#define	OS_FILE_OPEN			51
-#define	OS_FILE_CREATE			52
-#define OS_FILE_OVERWRITE		53
-#define OS_FILE_OPEN_RAW		54
-#define	OS_FILE_CREATE_PATH		55
-#define	OS_FILE_OPEN_RETRY		56	/* for os_file_create() on
-						the first ibdata file */
+/** Options for os_file_create_func @{ */
+typedef enum os_file_create_enum {
+	OS_FILE_OPEN = 51,		/*!< to open an existing file (if
+					doesn't exist, error) */
+	OS_FILE_CREATE,			/*!< to create new file (if
+					exists, error) */
+	OS_FILE_OVERWRITE,		/*!< to create a new file, if exists
+					the overwrite old file */
+	OS_FILE_OPEN_RAW,		/*!< to open a raw device or disk
+					partition */
+	OS_FILE_CREATE_PATH,		/*!< to create the directories */
+	OS_FILE_OPEN_RETRY,		/*!< open with retry */
+
+	/** Flags that can be combined with the above values. Please ensure
+	that the above values stay below 128. */
+
+	OS_FILE_ON_ERROR_NO_EXIT = 128,	/*!< do not exit on unknown errors */
+	OS_FILE_ON_ERROR_SILENT = 256	/*!< don't print diagnostic messages to
+					the log unless it is a fatal error,
+					this flag is only used if
+					ON_ERROR_NO_EXIT is set */
+} os_file_create_t;
 
 #define OS_FILE_READ_ONLY		333
 #define	OS_FILE_READ_WRITE		444
@@ -204,20 +217,18 @@ used to register actual file read, write and flush */
 # define register_pfs_file_open_begin(state, locker, key, op, name,	\
 				      src_file, src_line)		\
 do {									\
-	if (PSI_server) {						\
-		locker = PSI_server->get_thread_file_name_locker(	\
-			state, key, op, name, &locker);			\
-		if (locker) {						\
-			PSI_server->start_file_open_wait(		\
-				locker, src_file, src_line);		\
-		}							\
+	locker = PSI_CALL(get_thread_file_name_locker)(			\
+		state, key, op, name, &locker);				\
+	if (UNIV_LIKELY(locker != NULL)) {				\
+		PSI_CALL(start_file_open_wait)(				\
+			locker, src_file, src_line);			\
 	}								\
 } while (0)
 
 # define register_pfs_file_open_end(locker, file)			\
 do {									\
-	if (locker) {							\
-		PSI_server->end_file_open_wait_and_bind_to_descriptor(	\
+	if (UNIV_LIKELY(locker != NULL)) {				\
+		PSI_CALL(end_file_open_wait_and_bind_to_descriptor)(	\
 			locker, file);					\
 	}								\
 } while (0)
@@ -225,20 +236,18 @@ do {									\
 # define register_pfs_file_io_begin(state, locker, file, count, op,	\
 				    src_file, src_line)			\
 do {									\
-	if (PSI_server) {						\
-		locker = PSI_server->get_thread_file_descriptor_locker(	\
-			state, file, op);				\
-		if (locker) {						\
-			PSI_server->start_file_wait(			\
-				locker, count, src_file, src_line);	\
-		}							\
+	locker = PSI_CALL(get_thread_file_descriptor_locker)(		\
+		state, file, op);					\
+	if (UNIV_LIKELY(locker != NULL)) {				\
+		PSI_CALL(start_file_wait)(				\
+			locker, count, src_file, src_line);		\
 	}								\
 } while (0)
 
 # define register_pfs_file_io_end(locker, count)			\
 do {									\
-	if (locker) {							\
-		PSI_server->end_file_wait(locker, count);		\
+	if (UNIV_LIKELY(locker != NULL)) {				\
+		PSI_CALL(end_file_wait)(locker, count);			\
 	}								\
 } while (0)
 #endif /* UNIV_PFS_IO  */
@@ -276,24 +285,20 @@ The wrapper functions have the prefix of "innodb_". */
 # define os_file_close(file)						\
 	pfs_os_file_close_func(file, __FILE__, __LINE__)
 
-# define os_aio(type, mode, name, file, buf, offset, offset_high,	\
+# define os_aio(type, mode, name, file, buf, offset,			\
 		n, message1, message2)					\
 	pfs_os_aio_func(type, mode, name, file, buf, offset,		\
-			offset_high, n, message1, message2,		\
-			__FILE__, __LINE__)
+			n, message1, message2, __FILE__, __LINE__)
 
-# define os_file_read(file, buf, offset, offset_high, n)		\
-	pfs_os_file_read_func(file, buf, offset, offset_high, n,	\
-			      __FILE__, __LINE__)
+# define os_file_read(file, buf, offset, n)				\
+	pfs_os_file_read_func(file, buf, offset, n, __FILE__, __LINE__)
 
-# define os_file_read_no_error_handling(file, buf, offset,		\
-					offset_high, n)			\
-	pfs_os_file_read_no_error_handling_func(file, buf, offset,	\
-						offset_high, n,		\
+# define os_file_read_no_error_handling(file, buf, offset, n)		\
+	pfs_os_file_read_no_error_handling_func(file, buf, offset, n,	\
 						__FILE__, __LINE__)
 
-# define os_file_write(name, file, buf, offset, offset_high, n)		\
-	pfs_os_file_write_func(name, file, buf, offset, offset_high,	\
+# define os_file_write(name, file, buf, offset, n)	\
+	pfs_os_file_write_func(name, file, buf, offset,	\
 			       n, __FILE__, __LINE__)
 
 # define os_file_flush(file)						\
@@ -308,7 +313,7 @@ to original un-instrumented file I/O APIs */
 # define os_file_create(key, name, create, purpose, type, success)	\
 	os_file_create_func(name, create, purpose, type, success)
 
-# define os_file_create_simple(key, name, create, access, success)	\
+# define os_file_create_simple(key, name, create_mode, access, success)	\
 	os_file_create_simple_func(name, create_mode, access, success)
 
 # define os_file_create_simple_no_error_handling(			\
@@ -318,20 +323,18 @@ to original un-instrumented file I/O APIs */
 
 # define os_file_close(file)	os_file_close_func(file)
 
-# define os_aio(type, mode, name, file, buf, offset, offset_high,	\
-	       n, message1, message2)					\
-	os_aio_func(type, mode, name, file, buf, offset, offset_high, n,\
+# define os_aio(type, mode, name, file, buf, offset, n, message1, message2) \
+	os_aio_func(type, mode, name, file, buf, offset, n,		\
 		    message1, message2)
 
-# define os_file_read(file, buf, offset, offset_high, n)		\
-	os_file_read_func(file, buf, offset, offset_high, n)
+# define os_file_read(file, buf, offset, n)	\
+	os_file_read_func(file, buf, offset, n)
 
-# define os_file_read_no_error_handling(file, buf, offset,		\
-				       offset_high, n)			\
-	os_file_read_no_error_handling_func(file, buf, offset, offset_high, n)
+# define os_file_read_no_error_handling(file, buf, offset, n)		\
+	os_file_read_no_error_handling_func(file, buf, offset, n)
 
-# define os_file_write(name, file, buf, offset, offset_high, n)		\
-	os_file_write_func(name, file, buf, offset, offset_high, n)
+# define os_file_write(name, file, buf, offset, n)			\
+	os_file_write_func(name, file, buf, offset, n)
 
 # define os_file_flush(file)	os_file_flush_func(file)
 
@@ -461,13 +464,7 @@ os_file_create_simple_func(
 /*=======================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file is
-				opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error), or
-				OS_FILE_CREATE_PATH if new file
-				(if exists, error) and subdirectories along
-				its path are created (if needed)*/
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
 				OS_FILE_READ_WRITE */
 	ibool*		success);/*!< out: TRUE if succeed, FALSE if error */
@@ -483,15 +480,13 @@ os_file_create_simple_no_error_handling_func(
 /*=========================================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error) */
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
 				OS_FILE_READ_WRITE, or
 				OS_FILE_READ_ALLOW_DELETE; the last option is
 				used by a backup program reading the file */
-	ibool*		success);/*!< out: TRUE if succeed, FALSE if error */
+	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
+	__attribute__((nonnull, warn_unused_result));
 /****************************************************************//**
 Tries to disable OS caching on an opened file descriptor. */
 UNIV_INTERN
@@ -515,14 +510,7 @@ os_file_create_func(
 /*================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error),
-				OS_FILE_OVERWRITE if a new file is created
-				or an old overwritten;
-				OS_FILE_OPEN_RAW, if a raw device or disk
-				partition should be opened */
+	ulint		create_mode,/*!< in: create mode */
 	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
 				non-buffered i/o is desired,
 				OS_FILE_NORMAL, if any normal file;
@@ -531,7 +519,8 @@ os_file_create_func(
 				async i/o or unbuffered i/o: look in the
 				function source code for the exact rules */
 	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
-	ibool*		success);/*!< out: TRUE if succeed, FALSE if error */
+	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
+	__attribute__((nonnull, warn_unused_result));
 /***********************************************************************//**
 Deletes a file. The file has to be closed before calling this.
 @return	TRUE if success */
@@ -539,7 +528,8 @@ UNIV_INTERN
 ibool
 os_file_delete(
 /*===========*/
-	const char*	name);	/*!< in: file path as a null-terminated string */
+	const char*	name);	/*!< in: file path as a null-terminated
+				string */
 
 /***********************************************************************//**
 Deletes a file if it exists. The file has to be closed before calling this.
@@ -548,7 +538,8 @@ UNIV_INTERN
 ibool
 os_file_delete_if_exists(
 /*=====================*/
-	const char*	name);	/*!< in: file path as a null-terminated string */
+	const char*	name);	/*!< in: file path as a null-terminated
+				string */
 /***********************************************************************//**
 NOTE! Use the corresponding macro os_file_rename(), not directly
 this function!
@@ -589,18 +580,13 @@ pfs_os_file_create_simple_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file is
-				opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error), or
-				OS_FILE_CREATE_PATH if new file
-				(if exists, error) and subdirectories along
-				its path are created (if needed)*/
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
 				OS_FILE_READ_WRITE */
 	ibool*		success,/*!< out: TRUE if succeed, FALSE if error */
 	const char*	src_file,/*!< in: file name where func invoked */
-	ulint		src_line);/*!< in: line where the func invoked */
+	ulint		src_line)/*!< in: line where the func invoked */
+	__attribute__((nonnull, warn_unused_result));
 
 /****************************************************************//**
 NOTE! Please use the corresponding macro
@@ -617,17 +603,15 @@ pfs_os_file_create_simple_no_error_handling_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error) */
+	ulint		create_mode, /*!< in: file create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
 				OS_FILE_READ_WRITE, or
 				OS_FILE_READ_ALLOW_DELETE; the last option is
 				used by a backup program reading the file */
 	ibool*		success,/*!< out: TRUE if succeed, FALSE if error */
 	const char*	src_file,/*!< in: file name where func invoked */
-	ulint		src_line);/*!< in: line where the func invoked */
+	ulint		src_line)/*!< in: line where the func invoked */
+	__attribute__((nonnull, warn_unused_result));
 
 /****************************************************************//**
 NOTE! Please use the corresponding macro os_file_create(), not directly
@@ -643,14 +627,7 @@ pfs_os_file_create_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error),
-				OS_FILE_OVERWRITE if a new file is created
-				or an old overwritten;
-				OS_FILE_OPEN_RAW, if a raw device or disk
-				partition should be opened */
+	ulint		create_mode,/*!< in: file create mode */
 	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
 				non-buffered i/o is desired,
 				OS_FILE_NORMAL, if any normal file;
@@ -661,7 +638,8 @@ pfs_os_file_create_func(
 	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
 	ibool*		success,/*!< out: TRUE if succeed, FALSE if error */
 	const char*	src_file,/*!< in: file name where func invoked */
-	ulint		src_line);/*!< in: line where the func invoked */
+	ulint		src_line)/*!< in: line where the func invoked */
+	__attribute__((nonnull, warn_unused_result));
 
 /***********************************************************************//**
 NOTE! Please use the corresponding macro os_file_close(), not directly
@@ -687,10 +665,7 @@ pfs_os_file_read_func(
 /*==================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n,	/*!< in: number of bytes to read */
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line);/*!< in: line where the func invoked */
@@ -708,10 +683,7 @@ pfs_os_file_read_no_error_handling_func(
 /*====================================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n,	/*!< in: number of bytes to read */
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line);/*!< in: line where the func invoked */
@@ -733,10 +705,7 @@ pfs_os_aio_func(
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read or from which
 				to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read or write */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read or write */
 	ulint		n,	/*!< in: number of bytes to read or write */
 	fil_node_t*	message1,/*!< in: message for the aio handler
 				(can be used to identify a completed
@@ -762,10 +731,7 @@ pfs_os_file_write_func(
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
 	const void*	buf,	/*!< in: buffer from which to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to write */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to write */
 	ulint		n,	/*!< in: number of bytes to write */
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line);/*!< in: line where the func invoked */
@@ -814,23 +780,13 @@ os_file_close_no_error_handling(
 #endif /* UNIV_HOTBACKUP */
 /***********************************************************************//**
 Gets a file size.
-@return	TRUE if success */
+@return	file size, or (os_offset_t) -1 on failure */
 UNIV_INTERN
-ibool
+os_offset_t
 os_file_get_size(
 /*=============*/
-	os_file_t	file,	/*!< in: handle to a file */
-	ulint*		size,	/*!< out: least significant 32 bits of file
-				size */
-	ulint*		size_high);/*!< out: most significant 32 bits of size */
-/***********************************************************************//**
-Gets file size as a 64-bit integer ib_int64_t.
-@return	size in bytes, -1 if error */
-UNIV_INTERN
-ib_int64_t
-os_file_get_size_as_iblonglong(
-/*===========================*/
-	os_file_t	file);	/*!< in: handle to a file */
+	os_file_t	file)	/*!< in: handle to a file */
+	__attribute__((warn_unused_result));
 /***********************************************************************//**
 Write the specified number of zeros to a newly created file.
 @return	TRUE if success */
@@ -841,9 +797,8 @@ os_file_set_size(
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
-	ulint		size,	/*!< in: least significant 32 bits of file
-				size */
-	ulint		size_high);/*!< in: most significant 32 bits of size */
+	os_offset_t	size)	/*!< in: file size */
+	__attribute__((nonnull, warn_unused_result));
 /***********************************************************************//**
 Truncates a file at its current position.
 @return	TRUE if success */
@@ -883,10 +838,7 @@ os_file_read_func(
 /*==============*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n);	/*!< in: number of bytes to read */
 /*******************************************************************//**
 Rewind file to its start, read at most size - 1 bytes from it to str, and
@@ -911,10 +863,7 @@ os_file_read_no_error_handling_func(
 /*================================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n);	/*!< in: number of bytes to read */
 
 /*******************************************************************//**
@@ -930,10 +879,7 @@ os_file_write_func(
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
 	const void*	buf,	/*!< in: buffer from which to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to write */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to write */
 	ulint		n);	/*!< in: number of bytes to write */
 /*******************************************************************//**
 Check the existence and type of the given file.
@@ -1037,10 +983,7 @@ os_aio_func(
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read or from which
 				to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read or write */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read or write */
 	ulint		n,	/*!< in: number of bytes to read or write */
 	fil_node_t*	message1,/*!< in: message for the aio handler
 				(can be used to identify a completed
diff --git a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic
index 648070c6909..bdd7eb5f8f4 100644
--- a/storage/innobase/include/os0file.ic
+++ b/storage/innobase/include/os0file.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2010, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -40,13 +40,7 @@ pfs_os_file_create_simple_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file is
-				opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error), or
-				OS_FILE_CREATE_PATH if new file
-				(if exists, error) and subdirectories along
-				its path are created (if needed)*/
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
 				OS_FILE_READ_WRITE */
 	ibool*		success,/*!< out: TRUE if succeed, FALSE if error */
@@ -88,10 +82,7 @@ pfs_os_file_create_simple_no_error_handling_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error) */
+	ulint		create_mode, /*!< in: file create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
 				OS_FILE_READ_WRITE, or
 				OS_FILE_READ_ALLOW_DELETE; the last option is
@@ -133,14 +124,7 @@ pfs_os_file_create_func(
 	mysql_pfs_key_t key,	/*!< in: Performance Schema Key */
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error),
-				OS_FILE_OVERWRITE if a new file is created
-				or an old overwritten;
-				OS_FILE_OPEN_RAW, if a raw device or disk
-				partition should be opened */
+	ulint		create_mode,/*!< in: file create mode */
 	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
 				non-buffered i/o is desired,
 				OS_FILE_NORMAL, if any normal file;
@@ -216,10 +200,7 @@ pfs_os_aio_func(
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read or from which
 				to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read or write */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read or write */
 	ulint		n,	/*!< in: number of bytes to read or write */
 	fil_node_t*	message1,/*!< in: message for the aio handler
 				(can be used to identify a completed
@@ -243,7 +224,7 @@ pfs_os_aio_func(
 					: PSI_FILE_READ,
 				   src_file, src_line);
 
-	result = os_aio_func(type, mode, name, file, buf, offset, offset_high,
+	result = os_aio_func(type, mode, name, file, buf, offset,
 			     n, message1, message2);
 
 	register_pfs_file_io_end(locker, n);
@@ -263,10 +244,7 @@ pfs_os_file_read_func(
 /*==================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n,	/*!< in: number of bytes to read */
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line)/*!< in: line where the func invoked */
@@ -278,7 +256,7 @@ pfs_os_file_read_func(
 	register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ,
 				   src_file, src_line);
 
-	result = os_file_read_func(file, buf, offset, offset_high, n);
+	result = os_file_read_func(file, buf, offset, n);
 
 	register_pfs_file_io_end(locker, n);
 
@@ -299,10 +277,7 @@ pfs_os_file_read_no_error_handling_func(
 /*====================================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n,	/*!< in: number of bytes to read */
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line)/*!< in: line where the func invoked */
@@ -314,8 +289,7 @@ pfs_os_file_read_no_error_handling_func(
 	register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ,
 				   src_file, src_line);
 
-	result = os_file_read_no_error_handling_func(file, buf, offset,
-						     offset_high, n);
+	result = os_file_read_no_error_handling_func(file, buf, offset, n);
 
 	register_pfs_file_io_end(locker, n);
 
@@ -336,10 +310,7 @@ pfs_os_file_write_func(
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
 	const void*	buf,	/*!< in: buffer from which to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to write */
-	ulint		offset_high,/*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to write */
 	ulint		n,	/*!< in: number of bytes to write */
 	const char*	src_file,/*!< in: file name where func invoked */
 	ulint		src_line)/*!< in: line where the func invoked */
@@ -351,7 +322,7 @@ pfs_os_file_write_func(
 	register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_WRITE,
 				   src_file, src_line);
 
-	result = os_file_write_func(name, file, buf, offset, offset_high, n);
+	result = os_file_write_func(name, file, buf, offset, n);
 
 	register_pfs_file_io_end(locker, n);
 
diff --git a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h
index fd46bd7db87..613e3bd6947 100644
--- a/storage/innobase/include/os0proc.h
+++ b/storage/innobase/include/os0proc.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/os0proc.ic b/storage/innobase/include/os0proc.ic
index c9641644525..506f4f8ce0c 100644
--- a/storage/innobase/include/os0proc.ic
+++ b/storage/innobase/include/os0proc.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/os0sync.h b/storage/innobase/include/os0sync.h
index 1b98f94f641..d68823b72ca 100644
--- a/storage/innobase/include/os0sync.h
+++ b/storage/innobase/include/os0sync.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -18,8 +18,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -36,21 +36,33 @@ Created 9/6/1995 Heikki Tuuri
 
 #include "univ.i"
 #include "ut0lst.h"
+#include "sync0types.h"
 
 #ifdef __WIN__
 /** Native event (slow)*/
 typedef HANDLE			os_native_event_t;
 /** Native mutex */
-typedef CRITICAL_SECTION	os_fast_mutex_t;
+typedef CRITICAL_SECTION	fast_mutex_t;
 /** Native condition variable. */
 typedef CONDITION_VARIABLE	os_cond_t;
 #else
 /** Native mutex */
-typedef pthread_mutex_t		os_fast_mutex_t;
+typedef pthread_mutex_t		fast_mutex_t;
 /** Native condition variable */
 typedef pthread_cond_t		os_cond_t;
 #endif
 
+/** Structure that includes Performance Schema Probe pfs_psi
+in the os_fast_mutex structure if UNIV_PFS_MUTEX is defined */
+typedef struct os_fast_mutex_struct {
+	fast_mutex_t		mutex;	/*!< os_fast_mutex */
+#ifdef UNIV_PFS_MUTEX
+	struct PSI_mutex*	pfs_psi;/*!< The performance schema
+					instrumentation hook */
+#endif
+} os_fast_mutex_t;
+
+
 /** Operating system event */
 typedef struct os_event_struct	os_event_struct_t;
 /** Operating system event handle */
@@ -87,6 +99,9 @@ typedef struct os_mutex_struct	os_mutex_str_t;
 /** Operating system mutex handle */
 typedef os_mutex_str_t*		os_mutex_t;
 
+/** Return value of os_event_wait_time() when the time is exceeded */
+#define OS_SYNC_TIME_EXCEEDED	1
+
 /** Mutex protecting counts and the event and OS 'slow' mutex lists */
 extern os_mutex_t	os_sync_mutex;
 
@@ -181,7 +196,7 @@ os_event_wait_low(
 /**********************************************************//**
 Waits for an event object until it is in the signaled state or
 a timeout is exceeded. In Unix the timeout is always infinite.
-@return	0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
 UNIV_INTERN
 ulint
 os_event_wait_time_low(
@@ -231,34 +246,119 @@ ulint
 os_fast_mutex_trylock(
 /*==================*/
 	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to acquire */
+
+/**********************************************************************
+Following os_fast_ mutex APIs would be performance schema instrumented:
+
+os_fast_mutex_init
+os_fast_mutex_lock
+os_fast_mutex_unlock
+os_fast_mutex_free
+
+These mutex APIs will point to corresponding wrapper functions that contain
+the performance schema instrumentation.
+
+NOTE! The following macro should be used in mutex operation, not the
+corresponding function. */
+
+#ifdef UNIV_PFS_MUTEX
+# define os_fast_mutex_init(K, M)			\
+	pfs_os_fast_mutex_init(K, M)
+
+# define os_fast_mutex_lock(M)				\
+	pfs_os_fast_mutex_lock(M, __FILE__, __LINE__)
+
+# define os_fast_mutex_unlock(M)	pfs_os_fast_mutex_unlock(M)
+
+# define os_fast_mutex_free(M)		pfs_os_fast_mutex_free(M)
+
+/*********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_init(), not directly
+this function!
+A wrapper function for os_fast_mutex_init_func(). Initializes an operating
+system fast mutex semaphore. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_init(
+/*===================*/
+	PSI_mutex_key		key,		/*!< in: Performance Schema
+						key */
+	os_fast_mutex_t*	fast_mutex);	/*!< out: fast mutex */
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_free(), not directly
+this function!
+Wrapper function for pfs_os_fast_mutex_free(). Also destroys the performance
+schema probes when freeing the mutex */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_free(
+/*===================*/
+	os_fast_mutex_t*	fast_mutex);	/*!< in/out: mutex to free */
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_lock, not directly
+this function!
+Wrapper function of os_fast_mutex_lock. Acquires ownership of a fast mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_lock(
+/*===================*/
+	os_fast_mutex_t*	fast_mutex,	/*!< in/out: mutex to acquire */
+	const char*		file_name,	/*!< in: file name where
+						 locked */
+	ulint			line);		/*!< in: line where locked */
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_unlock, not directly
+this function!
+Wrapper function of os_fast_mutex_unlock. Releases ownership of a fast mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_unlock(
+/*=====================*/
+	os_fast_mutex_t*	fast_mutex);	/*!< in/out: mutex to release */
+
+#else /* UNIV_PFS_MUTEX */
+
+# define os_fast_mutex_init(K, M)			\
+	os_fast_mutex_init_func(&((os_fast_mutex_t*)(M))->mutex)
+
+# define os_fast_mutex_lock(M)				\
+	os_fast_mutex_lock_func(&((os_fast_mutex_t*)(M))->mutex)
+
+# define os_fast_mutex_unlock(M)			\
+	os_fast_mutex_unlock_func(&((os_fast_mutex_t*)(M))->mutex)
+
+# define os_fast_mutex_free(M)				\
+	os_fast_mutex_free_func(&((os_fast_mutex_t*)(M))->mutex)
+#endif /* UNIV_PFS_MUTEX */
+
 /**********************************************************//**
 Releases ownership of a fast mutex. */
 UNIV_INTERN
 void
-os_fast_mutex_unlock(
-/*=================*/
-	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to release */
+os_fast_mutex_unlock_func(
+/*======================*/
+	fast_mutex_t*		fast_mutex);	/*!< in: mutex to release */
 /*********************************************************//**
 Initializes an operating system fast mutex semaphore. */
 UNIV_INTERN
 void
-os_fast_mutex_init(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex);	/*!< in: fast mutex */
+os_fast_mutex_init_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex);	/*!< in: fast mutex */
 /**********************************************************//**
 Acquires ownership of a fast mutex. */
 UNIV_INTERN
 void
-os_fast_mutex_lock(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to acquire */
+os_fast_mutex_lock_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex);	/*!< in: mutex to acquire */
 /**********************************************************//**
 Frees an mutex object. */
 UNIV_INTERN
 void
-os_fast_mutex_free(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to free */
+os_fast_mutex_free_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex);	/*!< in: mutex to free */
 
 /**********************************************************//**
 Atomic compare-and-swap and increment for InnoDB. */
@@ -304,12 +404,30 @@ amount of increment. */
 # define os_atomic_increment_ulint(ptr, amount) \
 	os_atomic_increment(ptr, amount)
 
+# define os_atomic_increment_uint64(ptr, amount) \
+	os_atomic_increment(ptr, amount)
+
+/* Returns the resulting value, ptr is pointer to target, amount is the
+amount to decrement. */
+
+# define os_atomic_decrement(ptr, amount) \
+	__sync_sub_and_fetch(ptr, amount)
+
+# define os_atomic_decrement_lint(ptr, amount) \
+	os_atomic_decrement(ptr, amount)
+
+# define os_atomic_decrement_ulint(ptr, amount) \
+	os_atomic_decrement(ptr, amount)
+
 /**********************************************************//**
 Returns the old value of *ptr, atomically sets *ptr to new_val */
 
 # define os_atomic_test_and_set_byte(ptr, new_val) \
 	__sync_lock_test_and_set(ptr, (byte) new_val)
 
+# define os_atomic_test_and_set_ulint(ptr, new_val) \
+	__sync_lock_test_and_set(ptr, new_val)
+
 #elif defined(HAVE_IB_SOLARIS_ATOMICS)
 
 #define HAVE_ATOMIC_BUILTINS
@@ -327,15 +445,15 @@ compare to, new_val is the value to swap in. */
 	(atomic_cas_ulong(ptr, old_val, new_val) == old_val)
 
 # define os_compare_and_swap_lint(ptr, old_val, new_val) \
-	((lint)atomic_cas_ulong((ulong_t*) ptr, old_val, new_val) == old_val)
+	((lint) atomic_cas_ulong((ulong_t*) ptr, old_val, new_val) == old_val)
 
 # ifdef HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS
 #  if SIZEOF_PTHREAD_T == 4
 #   define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
-	((pthread_t)atomic_cas_32(ptr, old_val, new_val) == old_val)
+	((pthread_t) atomic_cas_32(ptr, old_val, new_val) == old_val)
 #  elif SIZEOF_PTHREAD_T == 8
 #   define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
-	((pthread_t)atomic_cas_64(ptr, old_val, new_val) == old_val)
+	((pthread_t) atomic_cas_64(ptr, old_val, new_val) == old_val)
 #  else
 #   error "SIZEOF_PTHREAD_T != 4 or 8"
 #  endif /* SIZEOF_PTHREAD_T CHECK */
@@ -351,44 +469,97 @@ compare to, new_val is the value to swap in. */
 Returns the resulting value, ptr is pointer to target, amount is the
 amount of increment. */
 
-# define os_atomic_increment_lint(ptr, amount) \
-	atomic_add_long_nv((ulong_t*) ptr, amount)
-
 # define os_atomic_increment_ulint(ptr, amount) \
 	atomic_add_long_nv(ptr, amount)
 
+# define os_atomic_increment_lint(ptr, amount) \
+	os_atomic_increment_ulint((ulong_t*) ptr, amount)
+
+# define os_atomic_increment_uint64(ptr, amount) \
+	atomic_add_64_nv(ptr, amount)
+
+/* Returns the resulting value, ptr is pointer to target, amount is the
+amount to decrement. */
+
+# define os_atomic_decrement_lint(ptr, amount) \
+	os_atomic_increment_ulint((ulong_t*) ptr, -(amount))
+
+# define os_atomic_decrement_ulint(ptr, amount) \
+	os_atomic_increment_ulint(ptr, -(amount))
+
 /**********************************************************//**
 Returns the old value of *ptr, atomically sets *ptr to new_val */
 
 # define os_atomic_test_and_set_byte(ptr, new_val) \
 	atomic_swap_uchar(ptr, new_val)
 
+# define os_atomic_test_and_set_ulint(ptr, new_val) \
+	atomic_swap_ulong(ptr, new_val)
+
 #elif defined(HAVE_WINDOWS_ATOMICS)
 
 #define HAVE_ATOMIC_BUILTINS
 
-/* On Windows, use Windows atomics / interlocked */
-# ifdef _WIN64
-#  define win_cmp_and_xchg InterlockedCompareExchange64
-#  define win_xchg_and_add InterlockedExchangeAdd64
-# else /* _WIN64 */
-#  define win_cmp_and_xchg InterlockedCompareExchange
-#  define win_xchg_and_add InterlockedExchangeAdd
-# endif
+/**********************************************************//**
+Atomic compare and exchange of signed integers (both 32 and 64 bit).
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+lint
+win_cmp_and_xchg_lint(
+/*==================*/
+	volatile lint*	ptr,		/*!< in/out: source/destination */
+	lint		new_val,	/*!< in: exchange value */
+	lint		old_val);	/*!< in: value to compare to */
+
+/**********************************************************//**
+Atomic addition of signed integers.
+@return Initial value of the variable pointed to by ptr */
+UNIV_INLINE
+lint
+win_xchg_and_add(
+/*=============*/
+	volatile lint*	ptr,	/*!< in/out: address of destination */
+	lint		val);	/*!< in: number to be added */
+
+/**********************************************************//**
+Atomic compare and exchange of unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+ulint
+win_cmp_and_xchg_ulint(
+/*===================*/
+	volatile ulint*	ptr,		/*!< in/out: source/destination */
+	ulint		new_val,	/*!< in: exchange value */
+	ulint		old_val);	/*!< in: value to compare to */
+
+/**********************************************************//**
+Atomic compare and exchange of 32 bit unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+DWORD
+win_cmp_and_xchg_dword(
+/*===================*/
+	volatile DWORD*	ptr,		/*!< in/out: source/destination */
+	DWORD		new_val,	/*!< in: exchange value */
+	DWORD		old_val);	/*!< in: value to compare to */
 
 /**********************************************************//**
 Returns true if swapped, ptr is pointer to target, old_val is value to
 compare to, new_val is the value to swap in. */
 
 # define os_compare_and_swap_ulint(ptr, old_val, new_val) \
-	(win_cmp_and_xchg(ptr, new_val, old_val) == old_val)
+	(win_cmp_and_xchg_ulint(ptr, new_val, old_val) == old_val)
 
 # define os_compare_and_swap_lint(ptr, old_val, new_val) \
-	(win_cmp_and_xchg(ptr, new_val, old_val) == old_val)
+	(win_cmp_and_xchg_lint(ptr, new_val, old_val) == old_val)
 
 /* windows thread objects can always be passed to windows atomic functions */
 # define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
-	(InterlockedCompareExchange(ptr, new_val, old_val) == old_val)
+	(win_cmp_and_xchg_dword(ptr, new_val, old_val) == old_val)
+
 # define INNODB_RW_LOCKS_USE_ATOMICS
 # define IB_ATOMICS_STARTUP_MSG \
 	"Mutexes and rw_locks use Windows interlocked functions"
@@ -401,7 +572,20 @@ amount of increment. */
 	(win_xchg_and_add(ptr, amount) + amount)
 
 # define os_atomic_increment_ulint(ptr, amount) \
-	((ulint) (win_xchg_and_add(ptr, amount) + amount))
+	((ulint) (win_xchg_and_add((lint*) ptr, (lint) amount) + amount))
+
+# define os_atomic_increment_uint64(ptr, amount) \
+	((ulint) (win_xchg_and_add(ptr, (lint) amount) + amount))
+
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount to decrement. There is no atomic substract function on Windows */
+
+# define os_atomic_decrement_lint(ptr, amount) \
+	(win_xchg_and_add(ptr, -(lint) amount) - amount)
+
+# define os_atomic_decrement_ulint(ptr, amount) \
+	((ulint) (win_xchg_and_add((lint*) ptr, -(lint) amount) - amount))
 
 /**********************************************************//**
 Returns the old value of *ptr, atomically sets *ptr to new_val.
@@ -411,10 +595,55 @@ clobbered */
 # define os_atomic_test_and_set_byte(ptr, new_val) \
 	((byte) InterlockedExchange(ptr, new_val))
 
+# define os_atomic_test_and_set_ulong(ptr, new_val) \
+	InterlockedExchange(ptr, new_val)
+
 #else
 # define IB_ATOMICS_STARTUP_MSG \
 	"Mutexes and rw_locks use InnoDB's own implementation"
 #endif
+#ifdef HAVE_ATOMIC_BUILTINS
+#define os_atomic_inc_ulint(m,v,d)	os_atomic_increment_ulint(v, d)
+#define os_atomic_dec_ulint(m,v,d)	os_atomic_decrement_ulint(v, d)
+#else
+#define os_atomic_inc_ulint(m,v,d)	os_atomic_inc_ulint_func(m, v, d)
+#define os_atomic_dec_ulint(m,v,d)	os_atomic_dec_ulint_func(m, v, d)
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+/**********************************************************//**
+Following macros are used to update specified counter atomically
+if HAVE_ATOMIC_BUILTINS defined. Otherwise, use mutex passed in
+for synchronization */
+#ifdef HAVE_ATOMIC_BUILTINS
+#define os_increment_counter_by_amount(mutex, counter, amount)	\
+	(void) os_atomic_increment_ulint(&counter, amount)
+
+#define os_decrement_counter_by_amount(mutex, counter, amount)	\
+	(void) os_atomic_increment_ulint(&counter, (-((lint) amount)))
+#else
+#define os_increment_counter_by_amount(mutex, counter, amount)	\
+	do {							\
+		mutex_enter(&(mutex));				\
+		(counter) += (amount);				\
+		mutex_exit(&(mutex));				\
+	} while (0)
+
+#define os_decrement_counter_by_amount(mutex, counter, amount)	\
+	do {							\
+		ut_a(counter >= amount);			\
+		mutex_enter(&(mutex));				\
+		(counter) -= (amount);				\
+		mutex_exit(&(mutex));				\
+	} while (0)
+#endif  /* HAVE_ATOMIC_BUILTINS */
+
+#define os_inc_counter(mutex, counter)				\
+	os_increment_counter_by_amount(mutex, counter, 1)
+
+#define os_dec_counter(mutex, counter)				\
+	do {							\
+		os_decrement_counter_by_amount(mutex, counter, 1);\
+	} while (0);
 
 #ifndef UNIV_NONINL
 #include "os0sync.ic"
diff --git a/storage/innobase/include/os0sync.ic b/storage/innobase/include/os0sync.ic
index c33f13aaad6..0d907b31366 100644
--- a/storage/innobase/include/os0sync.ic
+++ b/storage/innobase/include/os0sync.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -36,14 +36,10 @@ os_fast_mutex_trylock(
 /*==================*/
 	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to acquire */
 {
-#ifdef __WIN__
-	if (TryEnterCriticalSection(fast_mutex)) {
-
-		return(0);
-	} else {
+	fast_mutex_t*	mutex = &fast_mutex->mutex;
 
-		return(1);
-	}
+#ifdef __WIN__
+	return(!TryEnterCriticalSection(mutex));
 #else
 	/* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock
 	so that it returns 0 on success. In the operating system
@@ -51,6 +47,186 @@ os_fast_mutex_trylock(
 	returns 1 on success (but MySQL remaps that to 0), while Linux,
 	FreeBSD, Solaris, AIX, Tru64 Unix, HP-UX-11.0 return 0 on success. */
 
-	return((ulint) pthread_mutex_trylock(fast_mutex));
+	return((ulint) pthread_mutex_trylock(mutex));
+#endif
+}
+
+#ifdef UNIV_PFS_MUTEX
+/*********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_init(), not directly
+this function!
+A wrapper function for os_fast_mutex_init_func(). Initializes an operating
+system fast mutex semaphore. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_init(
+/*===================*/
+	PSI_mutex_key		key,		/*!< in: Performance Schema
+						key */
+	os_fast_mutex_t*	fast_mutex)	/*!< out: fast mutex */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+	fast_mutex->pfs_psi = PSI_CALL(init_mutex)(key, &fast_mutex->mutex);
+#else
+	fast_mutex->pfs_psi = NULL;
+#endif
+
+	os_fast_mutex_init_func(&fast_mutex->mutex);
+}
+/******************************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_free(), not directly
+this function!
+Wrapper function for pfs_os_fast_mutex_free(). Also destroys the performance
+schema probes when freeing the mutex */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_free(
+/*===================*/
+	os_fast_mutex_t*	fast_mutex)  /*!< in/out: mutex */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+	if (fast_mutex->pfs_psi != NULL)
+		PSI_CALL(destroy_mutex)(fast_mutex->pfs_psi);
 #endif
+	fast_mutex->pfs_psi = NULL;
+
+	os_fast_mutex_free_func(&fast_mutex->mutex);
 }
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_lock, not directly
+this function!
+Wrapper function of os_fast_mutex_lock_func. Acquires ownership of a fast
+mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_lock(
+/*===================*/
+	os_fast_mutex_t*	fast_mutex,	/*!< in/out: mutex to acquire */
+	const char*		file_name,	/*!< in: file name where
+						 locked */
+	ulint			line)		/*!< in: line where locked */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+	if (fast_mutex->pfs_psi != NULL)
+	{
+		PSI_mutex_locker* 	locker;
+		PSI_mutex_locker_state	state;
+
+		locker = PSI_CALL(start_mutex_wait)(&state, fast_mutex->pfs_psi,
+			PSI_MUTEX_LOCK, file_name, line);
+
+		os_fast_mutex_lock_func(&fast_mutex->mutex);
+
+		if (locker != NULL)
+			PSI_CALL(end_mutex_wait)(locker, 0);
+	}
+	else
+#endif
+	{
+		os_fast_mutex_lock_func(&fast_mutex->mutex);
+	}
+
+	return;
+}
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_unlock, not directly
+this function!
+Wrapper function of os_fast_mutex_unlock_func. Releases ownership of a
+fast mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_unlock(
+/*=====================*/
+	os_fast_mutex_t*	fast_mutex)	/*!< in/out: mutex to release */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+	if (fast_mutex->pfs_psi != NULL)
+		PSI_CALL(unlock_mutex)(fast_mutex->pfs_psi);
+#endif
+
+	os_fast_mutex_unlock_func(&fast_mutex->mutex);
+}
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef HAVE_WINDOWS_ATOMICS
+
+/* Use inline functions to make 64 and 32 bit versions of windows atomic
+functions so that typecasts are evaluated at compile time. Take advantage
+that lint is either __int64 or long int and windows atomic functions work
+on __int64 and LONG */
+
+/**********************************************************//**
+Atomic compare and exchange of unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+lint
+win_cmp_and_xchg_lint(
+/*==================*/
+	volatile lint*	ptr,		/*!< in/out: source/destination */
+	lint		new_val,	/*!< in: exchange value */
+	lint		old_val)	/*!< in: value to compare to */
+{
+# ifdef _WIN64
+	return(InterlockedCompareExchange64(ptr, new_val, old_val));
+# else
+	return(InterlockedCompareExchange(ptr, new_val, old_val));
+# endif
+}
+
+/**********************************************************//**
+Atomic addition of signed integers.
+@return Initial value of the variable pointed to by ptr */
+UNIV_INLINE
+lint
+win_xchg_and_add(
+/*=============*/
+	volatile lint*	ptr,	/*!< in/out: address of destination */
+	lint		val)	/*!< in: number to be added */
+{
+#ifdef _WIN64
+	return(InterlockedExchangeAdd64(ptr, val));
+#else
+	return(InterlockedExchangeAdd(ptr, val));
+#endif
+}
+
+/**********************************************************//**
+Atomic compare and exchange of unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+ulint
+win_cmp_and_xchg_ulint(
+/*===================*/
+	volatile ulint*	ptr,		/*!< in/out: source/destination */
+	ulint		new_val,	/*!< in: exchange value */
+	ulint		old_val)	/*!< in: value to compare to */
+{
+	return((ulint) win_cmp_and_xchg_lint(
+		(volatile lint*) ptr,
+		(lint) new_val,
+		(lint) old_val));
+}
+
+/**********************************************************//**
+Atomic compare and exchange of 32-bit unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+DWORD
+win_cmp_and_xchg_dword(
+/*===================*/
+	volatile DWORD*	ptr,		/*!< in/out: source/destination */
+	DWORD		new_val,	/*!< in: exchange value */
+	DWORD		old_val)	/*!< in: value to compare to */
+{
+	ut_ad(sizeof(DWORD) == sizeof(LONG));	/* We assume this. */
+	return(InterlockedCompareExchange(
+		(volatile LONG*) ptr,
+		(LONG) new_val,
+		(LONG) old_val));
+}
+
+#endif /* HAVE_WINDOWS_ATOMICS */
+
diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h
index df3cdb7728e..37c54afae80 100644
--- a/storage/innobase/include/os0thread.h
+++ b/storage/innobase/include/os0thread.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -35,7 +35,6 @@ can wait inside InnoDB */
 
 #define	OS_THREAD_MAX_N		srv_max_n_threads
 
-
 /* Possible fixed priorities for threads */
 #define OS_THREAD_PRIORITY_NONE		100
 #define OS_THREAD_PRIORITY_BACKGROUND	1
@@ -44,14 +43,35 @@ can wait inside InnoDB */
 
 #ifdef __WIN__
 typedef void*			os_thread_t;
-typedef unsigned long		os_thread_id_t;	/*!< In Windows the thread id
+typedef DWORD			os_thread_id_t;	/*!< In Windows the thread id
 						is an unsigned long int */
+extern "C"  {
+typedef LPTHREAD_START_ROUTINE	os_thread_func_t;
+}
+
+/** Macro for specifying a Windows thread start function. */
+#define DECLARE_THREAD(func)	WINAPI func
+
+/** Required to get around a build error on Windows. Even though our functions
+are defined/declared as WINAPI f(LPVOID a); the compiler complains that they
+are defined as: os_thread_ret_t (__cdecl*)(void*). Because our functions
+don't access the arguments and don't return any value, we should be safe. */
+#define os_thread_create(f,a,i)	\
+	os_thread_create_func(reinterpret_cast<os_thread_func_t>(f), a, i)
+
 #else
+
 typedef pthread_t		os_thread_t;
 typedef os_thread_t		os_thread_id_t;	/*!< In Unix we use the thread
 						handle itself as the id of
 						the thread */
-#endif
+extern "C"  { typedef void*	(*os_thread_func_t)(void*); }
+
+/** Macro for specifying a POSIX thread start function. */
+#define DECLARE_THREAD(func)	func
+#define os_thread_create(f,a,i)	os_thread_create_func(f, a, i)
+
+#endif /* __WIN__ */
 
 /* Define a function pointer type to use in a typecast */
 typedef void* (*os_posix_f_t) (void*);
@@ -88,14 +108,10 @@ thread should always use that to exit and not use return() to exit.
 @return	handle to the thread */
 UNIV_INTERN
 os_thread_t
-os_thread_create(
-/*=============*/
-#ifndef __WIN__
-	os_posix_f_t		start_f,
-#else
-	ulint (*start_f)(void*),		/*!< in: pointer to function
+os_thread_create_func(
+/*==================*/
+	os_thread_func_t	func,		/*!< in: pointer to function
 						from which to start */
-#endif
 	void*			arg,		/*!< in: argument to start
 						function */
 	os_thread_id_t*		thread_id);	/*!< out: id of the created
diff --git a/storage/innobase/include/os0thread.ic b/storage/innobase/include/os0thread.ic
index f89bc40b4fa..0622d22f2dc 100644
--- a/storage/innobase/include/os0thread.ic
+++ b/storage/innobase/include/os0thread.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h
index 1544b0abe1c..52f5c5de58a 100644
--- a/storage/innobase/include/page0cur.h
+++ b/storage/innobase/include/page0cur.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/page0cur.ic b/storage/innobase/include/page0cur.ic
index 3520677dfb3..a065f9ff30d 100644
--- a/storage/innobase/include/page0cur.ic
+++ b/storage/innobase/include/page0cur.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,6 +27,8 @@ Created 10/4/1994 Heikki Tuuri
 #include "buf0types.h"
 
 #ifdef UNIV_DEBUG
+# include "rem0cmp.h"
+
 /*********************************************************//**
 Gets pointer to the page frame where the cursor is positioned.
 @return	page */
@@ -268,6 +270,7 @@ page_cur_tuple_insert(
 					      index, rec, offsets, mtr);
 	}
 
+	ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, offsets));
 	mem_heap_free(heap);
 	return(rec);
 }
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
index 74e9ceca959..e4571b69376 100644
--- a/storage/innobase/include/page0page.h
+++ b/storage/innobase/include/page0page.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic
index 781ad029e87..e73e547e92b 100644
--- a/storage/innobase/include/page0page.ic
+++ b/storage/innobase/include/page0page.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -136,7 +136,7 @@ page_header_set_field(
 	ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE);
 
 	mach_write_to_2(page + PAGE_HEADER + field, val);
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_write_header(page_zip,
 				      page + PAGE_HEADER + field, 2, NULL);
 	}
@@ -211,7 +211,7 @@ page_header_reset_last_insert(
 {
 	ut_ad(page && mtr);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_2(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0);
 		page_zip_write_header(page_zip,
 				      page + (PAGE_HEADER + PAGE_LAST_INSERT),
@@ -233,8 +233,7 @@ page_is_comp(
 /*=========*/
 	const page_t*	page)	/*!< in: index page */
 {
-	return(UNIV_EXPECT(page_header_get_field(page, PAGE_N_HEAP) & 0x8000,
-			   0x8000));
+	return(page_header_get_field(page, PAGE_N_HEAP) & 0x8000);
 }
 
 /************************************************************//**
@@ -345,10 +344,10 @@ page_rec_is_user_rec_low(
 #endif
 	ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
 
-	return(UNIV_LIKELY(offset != PAGE_NEW_SUPREMUM)
-	       && UNIV_LIKELY(offset != PAGE_NEW_INFIMUM)
-	       && UNIV_LIKELY(offset != PAGE_OLD_INFIMUM)
-	       && UNIV_LIKELY(offset != PAGE_OLD_SUPREMUM));
+	return(offset != PAGE_NEW_SUPREMUM
+	       && offset != PAGE_NEW_INFIMUM
+	       && offset != PAGE_OLD_INFIMUM
+	       && offset != PAGE_OLD_SUPREMUM);
 }
 
 /************************************************************//**
@@ -363,8 +362,8 @@ page_rec_is_supremum_low(
 	ut_ad(offset >= PAGE_NEW_INFIMUM);
 	ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
 
-	return(UNIV_UNLIKELY(offset == PAGE_NEW_SUPREMUM)
-	       || UNIV_UNLIKELY(offset == PAGE_OLD_SUPREMUM));
+	return(offset == PAGE_NEW_SUPREMUM
+	       || offset == PAGE_OLD_SUPREMUM);
 }
 
 /************************************************************//**
@@ -379,8 +378,7 @@ page_rec_is_infimum_low(
 	ut_ad(offset >= PAGE_NEW_INFIMUM);
 	ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
 
-	return(UNIV_UNLIKELY(offset == PAGE_NEW_INFIMUM)
-	       || UNIV_UNLIKELY(offset == PAGE_OLD_INFIMUM));
+	return(offset == PAGE_NEW_INFIMUM || offset == PAGE_OLD_INFIMUM);
 }
 
 /************************************************************//**
@@ -484,12 +482,14 @@ page_cmp_dtuple_rec_with_match(
 
 	rec_offset = page_offset(rec);
 
-	if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_INFIMUM)
-	    || UNIV_UNLIKELY(rec_offset == PAGE_OLD_INFIMUM)) {
+	if (rec_offset == PAGE_NEW_INFIMUM
+	    || rec_offset == PAGE_OLD_INFIMUM) {
+
 		return(1);
-	}
-	if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_SUPREMUM)
-	    || UNIV_UNLIKELY(rec_offset == PAGE_OLD_SUPREMUM)) {
+
+	} else if (rec_offset == PAGE_NEW_SUPREMUM
+		   || rec_offset == PAGE_OLD_SUPREMUM) {
+
 		return(-1);
 	}
 
@@ -731,21 +731,19 @@ page_rec_get_next_low(
 
 	offs = rec_get_next_offs(rec, comp);
 
-	if (UNIV_UNLIKELY(offs >= UNIV_PAGE_SIZE)) {
+	if (offs >= UNIV_PAGE_SIZE) {
 		fprintf(stderr,
 			"InnoDB: Next record offset is nonsensical %lu"
 			" in record at offset %lu\n"
 			"InnoDB: rec address %p, space id %lu, page %lu\n",
-			(ulong)offs, (ulong) page_offset(rec),
+			(ulong) offs, (ulong) page_offset(rec),
 			(void*) rec,
 			(ulong) page_get_space_id(page),
 			(ulong) page_get_page_no(page));
 		buf_page_print(page, 0, 0);
 
 		ut_error;
-	}
-
-	if (UNIV_UNLIKELY(offs == 0)) {
+	} else if (offs == 0) {
 
 		return(NULL);
 	}
@@ -797,11 +795,7 @@ page_rec_set_next(
 	ut_ad(!next || !page_rec_is_infimum(next));
 	ut_ad(!next || page_align(rec) == page_align(next));
 
-	if (UNIV_LIKELY(next != NULL)) {
-		offs = page_offset(next);
-	} else {
-		offs = 0;
-	}
+	offs = next != NULL ? page_offset(next) : 0;
 
 	if (page_rec_is_comp(rec)) {
 		rec_set_next_offs_new(rec, offs);
@@ -976,7 +970,7 @@ page_get_free_space_of_empty(
 /*=========================*/
 	ulint	comp)		/*!< in: nonzero=compact page layout */
 {
-	if (UNIV_LIKELY(comp)) {
+	if (comp) {
 		return((ulint)(UNIV_PAGE_SIZE
 			       - PAGE_NEW_SUPREMUM_END
 			       - PAGE_DIR
@@ -1111,7 +1105,7 @@ page_mem_free(
 	page_header_set_field(page, page_zip, PAGE_GARBAGE,
 			      garbage + rec_offs_size(offsets));
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_dir_delete(page_zip, rec, index, offsets, free);
 	} else {
 		page_header_set_field(page, page_zip, PAGE_N_RECS,
diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h
index d9a277bf208..da2ac1c7de2 100644
--- a/storage/innobase/include/page0types.h
+++ b/storage/innobase/include/page0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -49,15 +49,14 @@ page0*.h includes rem0rec.h and may include rem0rec.ic. */
 /** Number of bits needed for representing different compressed page sizes */
 #define PAGE_ZIP_SSIZE_BITS 3
 
-/** log2 of smallest compressed page size */
-#define PAGE_ZIP_MIN_SIZE_SHIFT	10
-/** Smallest compressed page size */
-#define PAGE_ZIP_MIN_SIZE	(1 << PAGE_ZIP_MIN_SIZE_SHIFT)
+/** Maximum compressed page shift size */
+#define PAGE_ZIP_SSIZE_MAX	\
+	(UNIV_ZIP_SIZE_SHIFT_MAX - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
 
-/** Number of supported compressed page sizes */
-#define PAGE_ZIP_NUM_SSIZE (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 2)
-#if PAGE_ZIP_NUM_SSIZE > (1 << PAGE_ZIP_SSIZE_BITS)
-# error "PAGE_ZIP_NUM_SSIZE > (1 << PAGE_ZIP_SSIZE_BITS)"
+/* Make sure there are enough bits available to store the maximum zip
+ssize, which is the number of shifts from 512. */
+#if PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)
+# error "PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)"
 #endif
 
 /** Compressed page descriptor */
@@ -75,9 +74,9 @@ struct page_zip_des_struct
 					columns on the page; the maximum
 					is 744 on a 16 KiB page */
 	unsigned	ssize:PAGE_ZIP_SSIZE_BITS;
-					/*!< 0 or compressed page size;
+					/*!< 0 or compressed page shift size;
 					the size in bytes is
-					PAGE_ZIP_MIN_SIZE << (ssize - 1). */
+					(UNIV_ZIP_SIZE_MIN >> 1) << ssize. */
 };
 
 /** Compression statistics for a given page size */
@@ -98,7 +97,7 @@ struct page_zip_stat_struct {
 typedef struct page_zip_stat_struct page_zip_stat_t;
 
 /** Statistics on compression, indexed by page_zip_des_struct::ssize - 1 */
-extern page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE - 1];
+extern page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX];
 
 /**********************************************************************//**
 Write the "deleted" flag of a record on a compressed page.  The flag must
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
index 00c1d0516e6..f127fad2260 100644
--- a/storage/innobase/include/page0zip.h
+++ b/storage/innobase/include/page0zip.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -35,6 +35,7 @@ Created June 2005 by Marko Makela
 #include "page0types.h"
 #include "buf0types.h"
 #include "dict0types.h"
+#include "srv0srv.h"
 #include "trx0types.h"
 #include "mem0mem.h"
 
@@ -444,9 +445,21 @@ ulint
 page_zip_calc_checksum(
 /*===================*/
         const void*     data,   /*!< in: compressed page */
-        ulint           size)   /*!< in: size of compressed page */
+        ulint           size,   /*!< in: size of compressed page */
+	srv_checksum_algorithm_t algo) /*!< in: algorithm to use */
 	__attribute__((nonnull));
 
+/**********************************************************************//**
+Verify a compressed page's checksum.
+@return	TRUE if the stored checksum is valid according to the value of
+innodb_checksum_algorithm */
+UNIV_INTERN
+ibool
+page_zip_verify_checksum(
+/*=====================*/
+	const void*	data,	/*!< in: compressed page */
+	ulint		size);	/*!< in: size of compressed page */
+
 #ifndef UNIV_HOTBACKUP
 /** Check if a pointer to an uncompressed page matches a compressed page.
 @param ptr	pointer to an uncompressed page frame
diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic
index 75cc7a9fcc4..c9300aa4e9f 100644
--- a/storage/innobase/include/page0zip.ic
+++ b/storage/innobase/include/page0zip.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -120,13 +120,13 @@ page_zip_get_size(
 {
 	ulint	size;
 
-	if (UNIV_UNLIKELY(!page_zip->ssize)) {
+	if (!page_zip->ssize) {
 		return(0);
 	}
 
-	size = (PAGE_ZIP_MIN_SIZE >> 1) << page_zip->ssize;
+	size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize;
 
-	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
 	ut_ad(size <= UNIV_PAGE_SIZE);
 
 	return(size);
@@ -175,12 +175,12 @@ page_zip_rec_needs_ext(
 	ut_ad(comp || !zip_size);
 
 #if UNIV_PAGE_SIZE > REC_MAX_DATA_SIZE
-	if (UNIV_UNLIKELY(rec_size >= REC_MAX_DATA_SIZE)) {
+	if (rec_size >= REC_MAX_DATA_SIZE) {
 		return(TRUE);
 	}
 #endif
 
-	if (UNIV_UNLIKELY(zip_size)) {
+	if (zip_size) {
 		ut_ad(comp);
 		/* On a compressed page, there is a two-byte entry in
 		the dense page directory for every record.  But there
@@ -209,7 +209,7 @@ page_zip_simple_validate(
 {
 	ut_ad(page_zip);
 	ut_ad(page_zip->data);
-	ut_ad(page_zip->ssize < PAGE_ZIP_NUM_SSIZE);
+	ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX);
 	ut_ad(page_zip_get_size(page_zip)
 	      > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
 	ut_ad(page_zip->m_start <= page_zip->m_end);
@@ -238,11 +238,11 @@ page_zip_get_trailer_len(
 	ut_ad(page_zip_simple_validate(page_zip));
 	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
 
-	if (UNIV_UNLIKELY(!page_is_leaf(page_zip->data))) {
+	if (!page_is_leaf(page_zip->data)) {
 		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
 			+ REC_NODE_PTR_SIZE;
 		ut_ad(!page_zip->n_blobs);
-	} else if (UNIV_UNLIKELY(is_clust)) {
+	} else if (is_clust) {
 		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
 			+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
 	} else {
@@ -315,7 +315,7 @@ page_zip_available(
 	space needed for identifying the record (encoded heap_no). */
 	length -= REC_N_NEW_EXTRA_BYTES - 2;
 
-	if (UNIV_UNLIKELY(create)) {
+	if (create > 0) {
 		/* When a record is created, a pointer may be added to
 		the dense directory.
 		Likewise, space for the columns that will not be
@@ -326,10 +326,8 @@ page_zip_available(
 		trailer_len += uncompressed_size;
 	}
 
-	return(UNIV_LIKELY(length
-			   + trailer_len
-			   + page_zip->m_end
-			   < page_zip_get_size(page_zip)));
+	return(length + trailer_len + page_zip->m_end
+	       < page_zip_get_size(page_zip));
 }
 
 /**********************************************************************//**
@@ -384,7 +382,7 @@ page_zip_write_header(
 	/* The following would fail in page_cur_insert_rec_zip(). */
 	/* ut_ad(page_zip_validate(page_zip, str - pos)); */
 
-	if (UNIV_LIKELY_NULL(mtr)) {
+	if (mtr) {
 #ifndef UNIV_HOTBACKUP
 		page_zip_write_header_log(str, length, mtr);
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h
index 3de233eed3a..8e725fe9545 100644
--- a/storage/innobase/include/pars0grm.h
+++ b/storage/innobase/include/pars0grm.h
@@ -1,29 +1,37 @@
-/*****************************************************************************
+/* A Bison parser, made by GNU Bison 2.3.  */
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004 Free Software
-Foundation, Inc.
+/* Skeleton interface for Bison's Yacc-like parsers in C
 
-As a special exception, when this file is copied by Bison into a
-Bison output file, you may use that output file without restriction.
-This special exception was added by the Free Software Foundation
-in version 1.24 of Bison.
+   Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+   Free Software Foundation, Inc.
 
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
 
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
 
-*****************************************************************************/
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
 
-/* A Bison parser, made by GNU Bison 1.875d.  */
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
 
 /* Tokens.  */
 #ifndef YYTOKENTYPE
@@ -123,9 +131,19 @@ Place, Suite 330, Boston, MA 02111-1307 USA
      PARS_LOCK_TOKEN = 347,
      PARS_SHARE_TOKEN = 348,
      PARS_MODE_TOKEN = 349,
-     NEG = 350
+     PARS_LIKE_TOKEN = 350,
+     PARS_LIKE_TOKEN_EXACT = 351,
+     PARS_LIKE_TOKEN_PREFIX = 352,
+     PARS_LIKE_TOKEN_SUFFIX = 353,
+     PARS_LIKE_TOKEN_SUBSTR = 354,
+     PARS_TABLE_NAME_TOKEN = 355,
+     PARS_COMPACT_TOKEN = 356,
+     PARS_BLOCK_SIZE_TOKEN = 357,
+     PARS_BIGINT_TOKEN = 358,
+     NEG = 359
    };
 #endif
+/* Tokens.  */
 #define PARS_INT_LIT 258
 #define PARS_FLOAT_LIT 259
 #define PARS_STR_LIT 260
@@ -218,12 +236,21 @@ Place, Suite 330, Boston, MA 02111-1307 USA
 #define PARS_LOCK_TOKEN 347
 #define PARS_SHARE_TOKEN 348
 #define PARS_MODE_TOKEN 349
-#define NEG 350
+#define PARS_LIKE_TOKEN 350
+#define PARS_LIKE_TOKEN_EXACT 351
+#define PARS_LIKE_TOKEN_PREFIX 352
+#define PARS_LIKE_TOKEN_SUFFIX 353
+#define PARS_LIKE_TOKEN_SUBSTR 354
+#define PARS_TABLE_NAME_TOKEN 355
+#define PARS_COMPACT_TOKEN 356
+#define PARS_BLOCK_SIZE_TOKEN 357
+#define PARS_BIGINT_TOKEN 358
+#define NEG 359
 
 
 
 
-#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
 typedef int YYSTYPE;
 # define yystype YYSTYPE /* obsolescent; will be withdrawn */
 # define YYSTYPE_IS_DECLARED 1
@@ -232,5 +259,3 @@ typedef int YYSTYPE;
 
 extern YYSTYPE yylval;
 
-
-
diff --git a/storage/innobase/include/pars0opt.h b/storage/innobase/include/pars0opt.h
index 42d956068f8..1084d644c90 100644
--- a/storage/innobase/include/pars0opt.h
+++ b/storage/innobase/include/pars0opt.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/pars0opt.ic b/storage/innobase/include/pars0opt.ic
index e0bb6bf1af2..786d911ca3d 100644
--- a/storage/innobase/include/pars0opt.ic
+++ b/storage/innobase/include/pars0opt.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
index 141b2706d7d..9eb8aeb747f 100644
--- a/storage/innobase/include/pars0pars.h
+++ b/storage/innobase/include/pars0pars.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -38,7 +38,7 @@ Created 11/19/1996 Heikki Tuuri
 and varies in type, while 'user_arg' is a user-supplied argument. The
 meaning of the return type also varies. See the individual use cases, e.g.
 the FETCH statement, for details on them. */
-typedef void* (*pars_user_func_cb_t)(void* arg, void* user_arg);
+typedef ibool	(*pars_user_func_cb_t)(void* arg, void* user_arg);
 
 /** If the following is set TRUE, the parser will emit debugging
 information */
@@ -74,6 +74,7 @@ extern pars_res_word_t	pars_distinct_token;
 extern pars_res_word_t	pars_binary_token;
 extern pars_res_word_t	pars_blob_token;
 extern pars_res_word_t	pars_int_token;
+extern pars_res_word_t	pars_bigint_token;
 extern pars_res_word_t	pars_char_token;
 extern pars_res_word_t	pars_float_token;
 extern pars_res_word_t	pars_update_token;
@@ -105,13 +106,13 @@ pars_sql(
 	pars_info_t*	info,	/*!< in: extra information, or NULL */
 	const char*	str);	/*!< in: SQL string */
 /*************************************************************//**
-Retrieves characters to the lexical analyzer. */
+Retrieves characters to the lexical analyzer.
+@return number of characters copied or 0 on EOF */
 UNIV_INTERN
-void
+int
 pars_get_lex_chars(
 /*===============*/
 	char*	buf,		/*!< in/out: buffer where to copy */
-	int*	result,		/*!< out: number of characters copied or EOF */
 	int	max_size);	/*!< in: maximum number of characters which fit
 				in the buffer */
 /*************************************************************//**
@@ -140,6 +141,17 @@ pars_func(
 /*======*/
 	que_node_t*	res_word,/*!< in: function name reserved word */
 	que_node_t*	arg);	/*!< in: first argument in the argument list */
+/*************************************************************************
+Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string.
+@return	own: function node in a query tree */
+UNIV_INTERN
+int
+pars_like_rebind(
+/*=============*/
+        sym_node_t*     node,   /* in: The search string node.*/
+        const byte*     ptr,    /* in: literal to (re) bind */
+        ulint           len);   /* in: length of literal to (re) bind*/
 /*********************************************************************//**
 Parses an operator expression.
 @return	own: function node in a query tree */
@@ -397,7 +409,10 @@ pars_create_table(
 	sym_node_t*	table_sym,	/*!< in: table name node in the symbol
 					table */
 	sym_node_t*	column_defs,	/*!< in: list of column names */
-	void*		not_fit_in_memory);/*!< in: a non-NULL pointer means that
+	sym_node_t*	compact,	/* in: non-NULL if COMPACT table. */
+	sym_node_t*	block_size,	/* in: block size (can be NULL) */
+	void*		not_fit_in_memory);
+					/*!< in: a non-NULL pointer means that
 					this is a table which in simulations
 					should be simulated as not fitting
 					in memory; thread is put to sleep
@@ -498,7 +513,76 @@ pars_info_add_str_literal(
 	pars_info_t*	info,		/*!< in: info struct */
 	const char*	name,		/*!< in: name */
 	const char*	str);		/*!< in: string */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry.*/
+UNIV_INTERN
+void
+pars_info_bind_literal(
+/*===================*/
+	pars_info_t*	info,		/* in: info struct */
+	const char*	name,		/* in: name */
+	const void*	address,	/* in: address */
+	ulint		length,		/* in: length of data */
+	ulint		type,		/* in: type, e.g. DATA_FIXBINARY */
+	ulint		prtype);	/* in: precise type, e.g. */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry.*/
+UNIV_INTERN
+void
+pars_info_bind_varchar_literal(
+/*===========================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const byte*	str,		/*!< in: string */
+	ulint		str_len);	/*!< in: string length */
+/****************************************************************//**
+Equivalent to:
 
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_bind_int4_literal(
+/*=======================*/
+	pars_info_t*		info,		/*!< in: info struct */
+	const char*		name,		/*!< in: name */
+	const ib_uint32_t*	val);		/*!< in: value */
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_int8_literal(
+/*=======================*/
+	pars_info_t*		info,		/*!< in: info struct */
+	const char*		name,		/*!< in: name */
+	const ib_uint64_t*	val);		/*!< in: value */
+/****************************************************************//**
+Add user function. */
+UNIV_INTERN
+void
+pars_info_bind_function(
+/*===================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name,	/*!< in: function name */
+	pars_user_func_cb_t	func,	/*!< in: function address */
+	void*			arg);	/*!< in: user-supplied argument */
+/****************************************************************//**
+Add bound id. */
+UNIV_INTERN
+void
+pars_info_bind_id(
+/*=============*/
+	pars_info_t*		info,	/*!< in: info struct */
+	ibool			copy_name,/* in: make a copy of name if TRUE */
+	const char*		name,	/*!< in: name */
+	const char*		id);	/*!< in: id */
 /****************************************************************//**
 Equivalent to:
 
@@ -532,16 +616,6 @@ pars_info_add_ull_literal(
 	pars_info_t*	info,		/*!< in: info struct */
 	const char*	name,		/*!< in: name */
 	ib_uint64_t	val);		/*!< in: value */
-/****************************************************************//**
-Add user function. */
-UNIV_INTERN
-void
-pars_info_add_function(
-/*===================*/
-	pars_info_t*		info,	/*!< in: info struct */
-	const char*		name,	/*!< in: function name */
-	pars_user_func_cb_t	func,	/*!< in: function address */
-	void*			arg);	/*!< in: user-supplied argument */
 
 /****************************************************************//**
 Add bound id. */
@@ -619,6 +693,7 @@ struct pars_bound_lit_struct {
 	ulint		length;		/*!< length of data */
 	ulint		type;		/*!< type, e.g. DATA_FIXBINARY */
 	ulint		prtype;		/*!< precise type, e.g. DATA_UNSIGNED */
+	sym_node_t*	node;		/*!< symbol node */
 };
 
 /** Bound identifier. */
@@ -638,7 +713,7 @@ is also used for some non-functions like the assignment ':=' */
 struct func_node_struct{
 	que_common_t	common;	/*!< type: QUE_NODE_FUNC */
 	int		func;	/*!< token code of the function name */
-	ulint		class;	/*!< class of the function */
+	ulint		fclass;	/*!< class of the function */
 	que_node_t*	args;	/*!< argument(s) of the function */
 	UT_LIST_NODE_T(func_node_t) cond_list;
 				/*!< list of comparison conditions; defined
diff --git a/storage/innobase/include/pars0pars.ic b/storage/innobase/include/pars0pars.ic
index ae6c13cd671..4c88337a265 100644
--- a/storage/innobase/include/pars0pars.ic
+++ b/storage/innobase/include/pars0pars.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h
index 6d1a4b82414..4b3b342a533 100644
--- a/storage/innobase/include/pars0sym.h
+++ b/storage/innobase/include/pars0sym.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -67,7 +67,7 @@ sym_node_t*
 sym_tab_add_str_lit(
 /*================*/
 	sym_tab_t*	sym_tab,	/*!< in: symbol table */
-	byte*		str,		/*!< in: string with no quotes around
+	const byte*	str,		/*!< in: string with no quotes around
 					it */
 	ulint		len);		/*!< in: string length */
 /******************************************************************//**
@@ -80,6 +80,16 @@ sym_tab_add_bound_lit(
 	sym_tab_t*	sym_tab,	/*!< in: symbol table */
 	const char*	name,		/*!< in: name of bound literal */
 	ulint*		lit_type);	/*!< out: type of literal (PARS_*_LIT) */
+/**********************************************************************
+Rebind literal to a node in the symbol table. */
+
+sym_node_t*
+sym_tab_rebind_lit(
+/*===============*/
+                                        /* out: symbol table node */
+        sym_node_t*     node,           /* in: node that is bound to literal*/
+        const void*     address,        /* in: pointer to data */
+        ulint           length);        /* in: length of data */
 /******************************************************************//**
 Adds an SQL null literal to a symbol table.
 @return	symbol table node */
@@ -116,11 +126,14 @@ sym_tab_add_bound_id(
 
 /** Types of a symbol table node */
 enum sym_tab_entry {
+	SYM_UNSET,		/*!< Unset entry. */
 	SYM_VAR = 91,		/*!< declared parameter or local
 				variable of a procedure */
 	SYM_IMPLICIT_VAR,	/*!< storage for a intermediate result
 				of a calculation */
 	SYM_LIT,		/*!< literal */
+	SYM_TABLE_REF_COUNTED,	/*!< database table name, ref counted. Must
+				be closed explicitly. */
 	SYM_TABLE,		/*!< database table name */
 	SYM_COLUMN,		/*!< database table name */
 	SYM_CURSOR,		/*!< named cursor */
@@ -210,6 +223,7 @@ struct sym_node_struct{
 							the symbol table */
 	UT_LIST_NODE_T(sym_node_t)	sym_list;	/*!< list of symbol
 							nodes */
+	sym_node_t*			like_node;	/* LIKE operator node*/
 };
 
 /** Symbol table */
diff --git a/storage/innobase/include/pars0sym.ic b/storage/innobase/include/pars0sym.ic
index 9eb09db3a47..266c1a6310d 100644
--- a/storage/innobase/include/pars0sym.ic
+++ b/storage/innobase/include/pars0sym.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/pars0types.h b/storage/innobase/include/pars0types.h
index e0a8a86bf07..13ae53f3fd6 100644
--- a/storage/innobase/include/pars0types.h
+++ b/storage/innobase/include/pars0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h
index 720da6dcb46..531794ce688 100644
--- a/storage/innobase/include/que0que.h
+++ b/storage/innobase/include/que0que.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -41,14 +41,9 @@ Created 5/27/1996 Heikki Tuuri
 of SQL execution in the UNIV_SQL_DEBUG version */
 extern ibool	que_trace_on;
 
-/***********************************************************************//**
-Adds a query graph to the session's list of graphs. */
-UNIV_INTERN
-void
-que_graph_publish(
-/*==============*/
-	que_t*	graph,	/*!< in: graph */
-	sess_t*	sess);	/*!< in: session */
+/** Mutex protecting the query threads. */
+extern mutex_t	que_thr_mutex;
+
 /***********************************************************************//**
 Creates a query graph fork node.
 @return	own: fork node */
@@ -114,8 +109,8 @@ que_graph_free(
 			afterwards! */
 /**********************************************************************//**
 Stops a query thread if graph or trx is in a state requiring it. The
-conditions are tested in the order (1) graph, (2) trx. The kernel mutex has
-to be reserved.
+conditions are tested in the order (1) graph, (2) trx. The lock_sys_t::mutex
+has to be reserved.
 @return	TRUE if stopped */
 UNIV_INTERN
 ibool
@@ -143,7 +138,7 @@ que_thr_stop_for_mysql_no_error(
 /**********************************************************************//**
 A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
 query thread is stopped and made inactive, except in the case where
-it was put to the lock wait state in lock0lock.c, but the lock has already
+it was put to the lock wait state in lock0lock.cc, but the lock has already
 been granted or the transaction chosen as a victim in deadlock resolution. */
 UNIV_INTERN
 void
@@ -158,44 +153,17 @@ que_run_threads(
 /*============*/
 	que_thr_t*	thr);	/*!< in: query thread */
 /**********************************************************************//**
-After signal handling is finished, returns control to a query graph error
-handling routine. (Currently, just returns the control to the root of the
-graph so that the graph can communicate an error message to the client.) */
-UNIV_INTERN
-void
-que_fork_error_handle(
-/*==================*/
-	trx_t*	trx,	/*!< in: trx */
-	que_t*	fork);	/*!< in: query graph which was run before signal
-			handling started, NULL not allowed */
-/**********************************************************************//**
-Moves a suspended query thread to the QUE_THR_RUNNING state and releases
-a single worker thread to execute it. This function should be used to end
+Moves a suspended query thread to the QUE_THR_RUNNING state and release
+a worker thread to execute it. This function should be used to end
 the wait state of a query thread waiting for a lock or a stored procedure
-completion. */
+completion.
+@return query thread instance of thread to wakeup or NULL  */
 UNIV_INTERN
-void
-que_thr_end_wait(
-/*=============*/
-	que_thr_t*	thr,		/*!< in: query thread in the
-					QUE_THR_LOCK_WAIT,
-					or QUE_THR_PROCEDURE_WAIT, or
-					QUE_THR_SIG_REPLY_WAIT state */
-	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread */
-/**********************************************************************//**
-Same as que_thr_end_wait, but no parameter next_thr available. */
-UNIV_INTERN
-void
-que_thr_end_wait_no_next_thr(
-/*=========================*/
-	que_thr_t*	thr);		/*!< in: query thread in the
-					QUE_THR_LOCK_WAIT,
-					or QUE_THR_PROCEDURE_WAIT, or
-					QUE_THR_SIG_REPLY_WAIT state */
+que_thr_t*
+que_thr_end_lock_wait(
+/*==================*/
+	trx_t*		trx);		/*!< in: transaction in the
+					QUE_THR_LOCK_WAIT state */
 /**********************************************************************//**
 Starts execution of a command in a query fork. Picks a query thread which
 is not in the QUE_THR_RUNNING state and moves it to that state. If none
@@ -296,6 +264,14 @@ que_node_list_add_last(
 /*===================*/
 	que_node_t*	node_list,	/*!< in: node list, or NULL */
 	que_node_t*	node);		/*!< in: node */
+/*************************************************************************
+Get the last node from the list.*/
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+					/* out: node last node from list.*/
+	que_node_t*	node_list);	/* in: node list, or NULL */
 /*********************************************************************//**
 Gets a query graph node list length.
 @return	length, for NULL list 0 */
@@ -308,7 +284,7 @@ que_node_list_get_len(
 Checks if graph, trx, or session is in a state where the query thread should
 be stopped.
 @return TRUE if should be stopped; NOTE that if the peek is made
-without reserving the kernel mutex, then another peek with the mutex
+without reserving the trx_t::mutex, then another peek with the mutex
 reserved is necessary before deciding the actual stopping */
 UNIV_INLINE
 ibool
@@ -334,7 +310,7 @@ que_node_print_info(
 Evaluate the given SQL
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+enum db_err
 que_eval_sql(
 /*=========*/
 	pars_info_t*	info,	/*!< in: info struct, or NULL */
@@ -344,8 +320,34 @@ que_eval_sql(
 				dict_sys->mutex around call to pars_sql. */
 	trx_t*		trx);	/*!< in: trx */
 
-/* Query graph query thread node: the fields are protected by the kernel
-mutex with the exceptions named below */
+/**********************************************************************//**
+Round robin scheduler.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_scheduler_round_robin(
+/*===========================*/
+	que_fork_t*	fork,		/*!< in: a query fork */
+	que_thr_t*	thr);		/*!< in: current pos */
+
+/*********************************************************************//**
+Initialise the query sub-system. */
+UNIV_INTERN
+void
+que_init(void);
+/*==========*/
+
+/*********************************************************************//**
+Close the query sub-system. */
+UNIV_INTERN
+void
+que_close(void);
+/*===========*/
+
+/* Query graph query thread node: the fields are protected by the
+trx_t::mutex with the exceptions named below */
 
 struct que_thr_struct{
 	que_common_t	common;		/*!< type: QUE_NODE_THR */
@@ -353,24 +355,15 @@ struct que_thr_struct{
 					corruption */
 	que_node_t*	child;		/*!< graph child node */
 	que_t*		graph;		/*!< graph where this node belongs */
+	ulint		state;		/*!< state of the query thread */
 	ibool		is_active;	/*!< TRUE if the thread has been set
 					to the run state in
 					que_thr_move_to_run_state, but not
 					deactivated in
 					que_thr_dec_reference_count */
-	ulint		state;		/*!< state of the query thread */
-	UT_LIST_NODE_T(que_thr_t)
-			thrs;		/*!< list of thread nodes of the fork
-					node */
-	UT_LIST_NODE_T(que_thr_t)
-			trx_thrs;	/*!< lists of threads in wait list of
-					the trx */
-	UT_LIST_NODE_T(que_thr_t)
-			queue;		/*!< list of runnable thread nodes in
-					the server task queue */
 	/*------------------------------*/
 	/* The following fields are private to the OS thread executing the
-	query thread, and are not protected by the kernel mutex: */
+	query thread, and are not protected by any mutex: */
 
 	que_node_t*	run_node;	/*!< pointer to the node where the
 					subgraph down from this node is
@@ -381,6 +374,21 @@ struct que_thr_struct{
 					thus far */
 	ulint		lock_state;	/*!< lock state of thread (table or
 					row) */
+	struct srv_slot_struct*
+			slot;		/* The thread slot in the wait
+					array in srv_sys_t */
+	/*------------------------------*/
+	/* The following fields are links for the various lists that
+	this type can be on. */
+	UT_LIST_NODE_T(que_thr_t)
+			thrs;		/*!< list of thread nodes of the fork
+					node */
+	UT_LIST_NODE_T(que_thr_t)
+			trx_thrs;	/*!< lists of threads in wait list of
+					the trx */
+	UT_LIST_NODE_T(que_thr_t)
+			queue;		/*!< list of runnable thread nodes in
+					the server task queue */
 	ulint		fk_cascade_depth; /*!< maximum cascading call depth
 					supported for foreign key constraint
 					related delete/updates */
@@ -389,7 +397,7 @@ struct que_thr_struct{
 #define QUE_THR_MAGIC_N		8476583
 #define QUE_THR_MAGIC_FREED	123461526
 
-/* Query graph fork node: its fields are protected by the kernel mutex */
+/* Query graph fork node: its fields are protected by the query thread mutex */
 struct que_fork_struct{
 	que_common_t	common;		/*!< type: QUE_NODE_FORK */
 	que_t*		graph;		/*!< query graph of this node */
@@ -502,7 +510,6 @@ struct que_fork_struct{
 					thread has done its task */
 #define QUE_THR_COMMAND_WAIT	4
 #define QUE_THR_LOCK_WAIT	5
-#define QUE_THR_SIG_REPLY_WAIT	6
 #define QUE_THR_SUSPENDED	7
 #define QUE_THR_ERROR		8
 
@@ -516,7 +523,6 @@ struct que_fork_struct{
 #define QUE_CUR_START		2
 #define	QUE_CUR_END		3
 
-
 #ifndef UNIV_NONINL
 #include "que0que.ic"
 #endif
diff --git a/storage/innobase/include/que0que.ic b/storage/innobase/include/que0que.ic
index bd936670e1e..eff5a86d958 100644
--- a/storage/innobase/include/que0que.ic
+++ b/storage/innobase/include/que0que.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -88,7 +88,7 @@ que_node_get_type(
 {
 	ut_ad(node);
 
-	return(((que_common_t*)node)->type);
+	return(((que_common_t*) node)->type);
 }
 
 /***********************************************************************//**
@@ -101,7 +101,7 @@ que_node_get_val(
 {
 	ut_ad(node);
 
-	return(&(((que_common_t*)node)->val));
+	return(&(((que_common_t*) node)->val));
 }
 
 /***********************************************************************//**
@@ -115,7 +115,7 @@ que_node_get_val_buf_size(
 {
 	ut_ad(node);
 
-	return(((que_common_t*)node)->val_buf_size);
+	return(((que_common_t*) node)->val_buf_size);
 }
 
 /***********************************************************************//**
@@ -129,7 +129,7 @@ que_node_set_val_buf_size(
 {
 	ut_ad(node);
 
-	((que_common_t*)node)->val_buf_size = size;
+	((que_common_t*) node)->val_buf_size = size;
 }
 
 /***********************************************************************//**
@@ -143,7 +143,7 @@ que_node_set_parent(
 {
 	ut_ad(node);
 
-	((que_common_t*)node)->parent = parent;
+	((que_common_t*) node)->parent = parent;
 }
 
 /***********************************************************************//**
@@ -192,6 +192,28 @@ que_node_list_add_last(
 	return(node_list);
 }
 
+/*************************************************************************
+Removes a query graph node from the list.*/
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+					/* out: last node in list.*/
+	que_node_t*	node_list)	/* in: node list */
+{
+	que_common_t*	node;
+
+	ut_a(node_list != NULL);
+
+	node = (que_common_t*) node_list;
+
+	/* We need the last element */
+	while (node->brother != NULL) {
+		node = (que_common_t*) node->brother;
+	}
+
+	return(node);
+}
 /*********************************************************************//**
 Gets the next list node in a list of query graph nodes.
 @return	next node in a list of nodes */
@@ -201,7 +223,7 @@ que_node_get_next(
 /*==============*/
 	que_node_t*	node)	/*!< in: node in a list */
 {
-	return(((que_common_t*)node)->brother);
+	return(((que_common_t*) node)->brother);
 }
 
 /*********************************************************************//**
@@ -236,14 +258,14 @@ que_node_get_parent(
 /*================*/
 	que_node_t*	node)	/*!< in: node */
 {
-	return(((que_common_t*)node)->parent);
+	return(((que_common_t*) node)->parent);
 }
 
 /**********************************************************************//**
 Checks if graph, trx, or session is in a state where the query thread should
 be stopped.
 @return TRUE if should be stopped; NOTE that if the peek is made
-without reserving the kernel mutex, then another peek with the mutex
+without reserving the trx mutex, then another peek with the mutex
 reserved is necessary before deciding the actual stopping */
 UNIV_INLINE
 ibool
@@ -258,9 +280,9 @@ que_thr_peek_stop(
 	trx = graph->trx;
 
 	if (graph->state != QUE_FORK_ACTIVE
-	    || trx->que_state == TRX_QUE_LOCK_WAIT
-	    || (UT_LIST_GET_LEN(trx->signals) > 0
-		&& trx->que_state == TRX_QUE_RUNNING)) {
+	    || trx->lock.que_state == TRX_QUE_LOCK_WAIT
+	    || (trx->lock.que_state != TRX_QUE_ROLLING_BACK
+		&& trx->lock.que_state != TRX_QUE_RUNNING)) {
 
 		return(TRUE);
 	}
diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h
index ea976074768..b165b817d87 100644
--- a/storage/innobase/include/que0types.h
+++ b/storage/innobase/include/que0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/read0read.h b/storage/innobase/include/read0read.h
index 73ea66f4da2..6ea57fffcd2 100644
--- a/storage/innobase/include/read0read.h
+++ b/storage/innobase/include/read0read.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -52,18 +52,16 @@ must be closed with ..._close.
 @return	own: read view struct */
 UNIV_INTERN
 read_view_t*
-read_view_oldest_copy_or_open_new(
-/*==============================*/
-	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
-					transaction, or 0 used in purge */
+read_view_purge_open(
+/*=================*/
 	mem_heap_t*	heap);		/*!< in: memory heap from which
 					allocated */
 /*********************************************************************//**
-Closes a read view. */
+Remove read view from the trx_sys->view_list. */
 UNIV_INTERN
 void
-read_view_close(
-/*============*/
+read_view_remove(
+/*=============*/
 	read_view_t*	view);	/*!< in: read view */
 /*********************************************************************//**
 Closes a consistent read view for MySQL. This function is called at an SQL
@@ -145,9 +143,9 @@ struct read_view_struct{
 	ulint		n_trx_ids;
 				/*!< Number of cells in the trx_ids array */
 	trx_id_t*	trx_ids;/*!< Additional trx ids which the read should
-				not see: typically, these are the active
-				transactions at the time when the read is
-				serialized, except the reading transaction
+				not see: typically, these are the read-write
+				active transactions at the time when the read
+			       	is serialized, except the reading transaction
 				itself; the trx ids in this array are in a
 				descending order. These trx_ids should be
 				between the "low" and "high" water marks,
diff --git a/storage/innobase/include/read0read.ic b/storage/innobase/include/read0read.ic
index 5bb5249b591..436800e1585 100644
--- a/storage/innobase/include/read0read.ic
+++ b/storage/innobase/include/read0read.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -24,36 +24,6 @@ Created 2/16/1997 Heikki Tuuri
 *******************************************************/
 
 /*********************************************************************//**
-Gets the nth trx id in a read view.
-@return	trx id */
-UNIV_INLINE
-trx_id_t
-read_view_get_nth_trx_id(
-/*=====================*/
-	const read_view_t*	view,	/*!< in: read view */
-	ulint			n)	/*!< in: position */
-{
-	ut_ad(n < view->n_trx_ids);
-
-	return(*(view->trx_ids + n));
-}
-
-/*********************************************************************//**
-Sets the nth trx id in a read view. */
-UNIV_INLINE
-void
-read_view_set_nth_trx_id(
-/*=====================*/
-	read_view_t*	view,	/*!< in: read view */
-	ulint		n,	/*!< in: position */
-	trx_id_t	trx_id)	/*!< in: trx id to set */
-{
-	ut_ad(n < view->n_trx_ids);
-
-	*(view->trx_ids + n) = trx_id;
-}
-
-/*********************************************************************//**
 Checks if a read view sees the specified transaction.
 @return	TRUE if sees */
 UNIV_INLINE
@@ -63,33 +33,34 @@ read_view_sees_trx_id(
 	const read_view_t*	view,	/*!< in: read view */
 	trx_id_t		trx_id)	/*!< in: trx id */
 {
-	ulint	n_ids;
-	ulint	i;
-
 	if (trx_id < view->up_limit_id) {
 
 		return(TRUE);
-	}
-
-	if (trx_id >= view->low_limit_id) {
+	} else if (trx_id >= view->low_limit_id) {
 
 		return(FALSE);
-	}
-
-	/* We go through the trx ids in the array smallest first: this order
-	may save CPU time, because if there was a very long running
-	transaction in the trx id array, its trx id is looked at first, and
-	the first two comparisons may well decide the visibility of trx_id. */
-
-	n_ids = view->n_trx_ids;
-
-	for (i = 0; i < n_ids; i++) {
-		trx_id_t	view_trx_id
-			= read_view_get_nth_trx_id(view, n_ids - i - 1);
-
-		if (trx_id <= view_trx_id) {
-			return(trx_id != view_trx_id);
-		}
+	} else {
+		ulint	lower = 0;
+		ulint	upper = view->n_trx_ids - 1;
+
+		ut_a(view->n_trx_ids > 0);
+
+		do {
+			ulint		mid	= (lower + upper) >> 1;
+			trx_id_t	mid_id	= view->trx_ids[mid];
+
+			if (mid_id == trx_id) {
+				return(FALSE);
+			} else if (mid_id < trx_id) {
+				if (mid > 0) {
+					upper = mid - 1;
+				} else {
+					break;
+				}
+			} else {
+				lower = mid + 1;
+			}
+		} while (lower <= upper);
 	}
 
 	return(TRUE);
diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h
index caf69e3fb51..0b6aa132b88 100644
--- a/storage/innobase/include/read0types.h
+++ b/storage/innobase/include/read0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h
index a908521c9f7..ed6486aa603 100644
--- a/storage/innobase/include/rem0cmp.h
+++ b/storage/innobase/include/rem0cmp.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -75,6 +75,63 @@ cmp_data_data_slow(
 	const byte*	data2,	/*!< in: data field (== a pointer to a memory
 				buffer) */
 	ulint		len2);	/*!< in: data field length or UNIV_SQL_NULL */
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type to be VARCHAR.
+@return	1, 0, -1, if lhs is greater, equal, less than rhs, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_varchar(
+/*=======================*/
+	const byte*	lhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		lhs_len,/* in: data field length or UNIV_SQL_NULL */
+	const byte*	rhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		rhs_len);/* in: data field length or UNIV_SQL_NULL */
+/*****************************************************************
+This function is used to compare two varchar/char fields. The comparison
+is for the LIKE operator.
+@return	1, 0, -1, if lhs is greater, equal, less than rhs, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_like_prefix(
+/*===========================*/
+	const byte*	data1,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/* in: data field length or UNIV_SQL_NULL */
+	const byte*	data2,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2);	/* in: data field length or UNIV_SQL_NULL */
+/*****************************************************************
+This function is used to compare two varchar/char fields. The comparison
+is for the LIKE operator.
+@return	1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_like_suffix(
+/*===========================*/
+	const byte*	data1,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/* in: data field length or UNIV_SQL_NULL */
+	const byte*	data2,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2);	/* in: data field length or UNIV_SQL_NULL */
+/*****************************************************************
+This function is used to compare two varchar/char fields. The comparison
+is for the LIKE operator.
+@return	1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_like_substr(
+/*===========================*/
+	const byte*	data1,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/* in: data field length or UNIV_SQL_NULL */
+	const byte*	data2,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2);	/* in: data field length or UNIV_SQL_NULL */
 /*************************************************************//**
 This function is used to compare two dfields where at least the first
 has its data type field set.
@@ -192,6 +249,39 @@ cmp_rec_rec(
 	const ulint*	offsets2,/*!< in: rec_get_offsets(rec2, index) */
 	dict_index_t*	index);	/*!< in: data dictionary index */
 
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INTERN
+int
+cmp_dfield_dfield_like_prefix(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*	dfield1,/* in: data field; must have type field set */
+	dfield_t*	dfield2);/* in: data field */
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_substr(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*	dfield1,/* in: data field; must have type field set */
+	dfield_t*	dfield2);/* in: data field */
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_suffix(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*	dfield1,/* in: data field; must have type field set */
+	dfield_t*	dfield2);/* in: data field */
 
 #ifndef UNIV_NONINL
 #include "rem0cmp.ic"
diff --git a/storage/innobase/include/rem0cmp.ic b/storage/innobase/include/rem0cmp.ic
index 63415fe7837..67a2dcacba1 100644
--- a/storage/innobase/include/rem0cmp.ic
+++ b/storage/innobase/include/rem0cmp.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -43,6 +43,60 @@ cmp_data_data(
 	return(cmp_data_data_slow(mtype, prtype, data1, len1, data2, len2));
 }
 
+/*****************************************************************
+This function is used to compare two (CHAR) data fields for the LIKE
+operator. */
+UNIV_INLINE
+int
+cmp_data_data_like_prefix(
+/*======================*/
+				/* out: 1, 0, -1, if data1 is greater, equal,
+				less than data2, respectively */
+	byte*           data1,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len1,   /* in: data field length or UNIV_SQL_NULL */
+	byte*           data2,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len2)   /* in: data field length or UNIV_SQL_NULL */
+{
+	return(cmp_data_data_slow_like_prefix(data1, len1, data2, len2));
+}
+/*****************************************************************
+This function is used to compare two (CHAR) data fields for the LIKE
+operator. */
+UNIV_INLINE
+int
+cmp_data_data_like_suffix(
+/*======================*/
+				/* out: 1, 0, -1, if data1 is greater, equal,
+				less than data2, respectively */
+	byte*           data1,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len1,   /* in: data field length or UNIV_SQL_NULL */
+	byte*           data2,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len2)   /* in: data field length or UNIV_SQL_NULL */
+{
+	return(cmp_data_data_slow_like_suffix(data1, len1, data2, len2));
+}
+/*****************************************************************
+This function is used to compare two (CHAR) data fields for the LIKE
+operator. */
+UNIV_INLINE
+int
+cmp_data_data_like_substr(
+/*======================*/
+				/* out: 1, 0, -1, if data1 is greater, equal,
+				less than data2, respectively */
+	byte*           data1,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len1,   /* in: data field length or UNIV_SQL_NULL */
+	byte*           data2,  /* in: data field (== a pointer to a memory
+				buffer) */
+	ulint           len2)   /* in: data field length or UNIV_SQL_NULL */
+{
+	return(cmp_data_data_slow_like_substr(data1, len1, data2, len2));
+}
 /*************************************************************//**
 This function is used to compare two dfields where at least the first
 has its data type field set.
@@ -68,6 +122,47 @@ cmp_dfield_dfield(
 			     dfield_get_len(dfield2)));
 }
 
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_suffix(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*       dfield1,/* in: data field; must have type field set */
+	dfield_t*       dfield2)/* in: data field */
+{
+	ut_ad(dfield_check_typed(dfield1));
+
+	return(cmp_data_data_like_suffix(
+		(byte*) dfield_get_data(dfield1),
+		dfield_get_len(dfield1),
+		(byte*) dfield_get_data(dfield2),
+		dfield_get_len(dfield2)));
+}
+
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_substr(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*       dfield1,/* in: data field; must have type field set */
+	dfield_t*       dfield2)/* in: data field */
+{
+	ut_ad(dfield_check_typed(dfield1));
+
+	return(cmp_data_data_like_substr(
+		(byte*) dfield_get_data(dfield1),
+		dfield_get_len(dfield1),
+		(byte*) dfield_get_data(dfield2),
+		dfield_get_len(dfield2)));
+}
 /*************************************************************//**
 This function is used to compare two physical records. Only the common
 first fields are compared.
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
index 10b74d18c13..c6c70bb5f09 100644
--- a/storage/innobase/include/rem0rec.h
+++ b/storage/innobase/include/rem0rec.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -54,7 +54,7 @@ in addition to the data and the offsets */
 #define REC_STATUS_INFIMUM	2
 #define REC_STATUS_SUPREMUM	3
 
-/* The following four constants are needed in page0zip.c in order to
+/* The following four constants are needed in page0zip.cc in order to
 efficiently compress and decompress pages. */
 
 /* The offset of heap_no in a compact record */
@@ -480,7 +480,7 @@ ulint
 rec_offs_any_extern(
 /*================*/
 	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 /******************************************************//**
 Determine if the offsets are for a record containing null BLOB pointers.
 @return	first field containing a null BLOB pointer, or NULL if none found */
@@ -491,7 +491,7 @@ rec_offs_any_null_extern(
 	const rec_t*	rec,		/*!< in: record */
 	const ulint*	offsets)	/*!< in: rec_get_offsets(rec) */
 	__attribute__((nonnull, warn_unused_result));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 /******************************************************//**
 Returns nonzero if the extern bit is set in nth field of rec.
 @return	nonzero if externally stored */
@@ -542,7 +542,11 @@ rec_set_nth_field(
 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
 	ulint		n,	/*!< in: index number of the field */
 	const void*	data,	/*!< in: pointer to the data if not SQL null */
-	ulint		len);	/*!< in: length of the data or UNIV_SQL_NULL */
+	ulint		len);	/*!< in: length of the data or UNIV_SQL_NULL.
+				If not SQL null, must have the same
+				length as the previous value.
+				If SQL null, previous value must be
+				SQL null. */
 /**********************************************************//**
 The following function returns the data size of an old-style physical
 record, that is the sum of field lengths. SQL null fields
diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic
index dc8ed515c30..6950263fe81 100644
--- a/storage/innobase/include/rem0rec.ic
+++ b/storage/innobase/include/rem0rec.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -103,7 +103,7 @@ and the shift needed to obtain each bit-field of the record. */
 
 #define REC_OLD_HEAP_NO		5
 #define REC_HEAP_NO_MASK	0xFFF8UL
-#if 0 /* defined in rem0rec.h for use of page0zip.c */
+#if 0 /* defined in rem0rec.h for use of page0zip.cc */
 #define REC_NEW_HEAP_NO		4
 #define	REC_HEAP_NO_SHIFT	3
 #endif
@@ -264,12 +264,12 @@ rec_get_next_ptr_const(
 
 	field_value = mach_read_from_2(rec - REC_NEXT);
 
-	if (UNIV_UNLIKELY(field_value == 0)) {
+	if (field_value == 0) {
 
 		return(NULL);
 	}
 
-	if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) {
+	if (comp) {
 #if UNIV_PAGE_SIZE <= 32768
 		/* Note that for 64 KiB pages, field_value can 'wrap around'
 		and the debug assertion is not valid */
@@ -337,7 +337,7 @@ rec_get_next_offs(
 
 	field_value = mach_read_from_2(rec - REC_NEXT);
 
-	if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) {
+	if (comp) {
 #if UNIV_PAGE_SIZE <= 32768
 		/* Note that for 64 KiB pages, field_value can 'wrap around'
 		and the debug assertion is not valid */
@@ -354,7 +354,7 @@ rec_get_next_offs(
 		      + ut_align_offset(rec, UNIV_PAGE_SIZE)
 		      < UNIV_PAGE_SIZE);
 #endif
-		if (UNIV_UNLIKELY(field_value == 0)) {
+		if (field_value == 0) {
 
 			return(0);
 		}
@@ -410,7 +410,7 @@ rec_set_next_offs_new(
 	ut_ad(rec);
 	ut_ad(UNIV_PAGE_SIZE > next);
 
-	if (UNIV_UNLIKELY(!next)) {
+	if (!next) {
 		field_value = 0;
 	} else {
 		/* The following two statements calculate
@@ -418,7 +418,7 @@ rec_set_next_offs_new(
 		as a non-negative number */
 
 		field_value = (ulint)
-			((lint) next 
+			((lint) next
 			 - (lint) ut_align_offset(rec, UNIV_PAGE_SIZE));
 		field_value &= REC_NEXT_MASK;
 	}
@@ -572,9 +572,7 @@ rec_set_n_owned_new(
 {
 	rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
 			    REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
-	if (UNIV_LIKELY_NULL(page_zip)
-	    && UNIV_LIKELY(rec_get_status(rec)
-			   != REC_STATUS_SUPREMUM)) {
+	if (page_zip && rec_get_status(rec) != REC_STATUS_SUPREMUM) {
 		page_zip_rec_set_owned(page_zip, rec, n_owned);
 	}
 }
@@ -648,7 +646,7 @@ rec_get_info_and_status_bits(
 & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)
 # error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap"
 #endif
-	if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) {
+	if (comp) {
 		bits = rec_get_info_bits(rec, TRUE) | rec_get_status(rec);
 	} else {
 		bits = rec_get_info_bits(rec, FALSE);
@@ -684,16 +682,14 @@ rec_get_deleted_flag(
 	const rec_t*	rec,	/*!< in: physical record */
 	ulint		comp)	/*!< in: nonzero=compact page format */
 {
-	if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) {
-		return(UNIV_UNLIKELY(
-			       rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
-						   REC_INFO_DELETED_FLAG,
-						   REC_INFO_BITS_SHIFT)));
+	if (comp) {
+		return(rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
+					   REC_INFO_DELETED_FLAG,
+					   REC_INFO_BITS_SHIFT));
 	} else {
-		return(UNIV_UNLIKELY(
-			       rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
-						   REC_INFO_DELETED_FLAG,
-						   REC_INFO_BITS_SHIFT)));
+		return(rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
+					   REC_INFO_DELETED_FLAG,
+					   REC_INFO_BITS_SHIFT));
 	}
 }
 
@@ -741,7 +737,7 @@ rec_set_deleted_flag_new(
 
 	rec_set_info_bits_new(rec, val);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_rec_set_deleted(page_zip, rec, flag);
 	}
 }
@@ -1041,7 +1037,7 @@ rec_get_nth_field_offs(
 	ut_ad(n < rec_offs_n_fields(offsets));
 	ut_ad(len);
 
-	if (UNIV_UNLIKELY(n == 0)) {
+	if (n == 0) {
 		offs = 0;
 	} else {
 		offs = rec_offs_base(offsets)[n] & REC_OFFS_MASK;
@@ -1085,10 +1081,10 @@ rec_offs_any_extern(
 	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
 {
 	ut_ad(rec_offs_validate(NULL, NULL, offsets));
-	return(UNIV_UNLIKELY(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL));
+	return(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL);
 }
 
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 /******************************************************//**
 Determine if the offsets are for a record containing null BLOB pointers.
 @return	first field containing a null BLOB pointer, or NULL if none found */
@@ -1124,7 +1120,7 @@ rec_offs_any_null_extern(
 
 	return(NULL);
 }
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 /******************************************************//**
 Returns nonzero if the extern bit is set in nth field of rec.
@@ -1138,8 +1134,7 @@ rec_offs_nth_extern(
 {
 	ut_ad(rec_offs_validate(NULL, NULL, offsets));
 	ut_ad(n < rec_offs_n_fields(offsets));
-	return(UNIV_UNLIKELY(rec_offs_base(offsets)[1 + n]
-			     & REC_OFFS_EXTERNAL));
+	return(rec_offs_base(offsets)[1 + n] & REC_OFFS_EXTERNAL);
 }
 
 /******************************************************//**
@@ -1154,8 +1149,7 @@ rec_offs_nth_sql_null(
 {
 	ut_ad(rec_offs_validate(NULL, NULL, offsets));
 	ut_ad(n < rec_offs_n_fields(offsets));
-	return(UNIV_UNLIKELY(rec_offs_base(offsets)[1 + n]
-			     & REC_OFFS_SQL_NULL));
+	return(rec_offs_base(offsets)[1 + n] & REC_OFFS_SQL_NULL);
 }
 
 /******************************************************//**
@@ -1394,7 +1388,7 @@ rec_set_nth_field(
 	ut_ad(rec);
 	ut_ad(rec_offs_validate(rec, NULL, offsets));
 
-	if (UNIV_UNLIKELY(len == UNIV_SQL_NULL)) {
+	if (len == UNIV_SQL_NULL) {
 		if (!rec_offs_nth_sql_null(offsets, n)) {
 			ut_a(!rec_offs_comp(offsets));
 			rec_set_nth_field_sql_null(rec, n);
@@ -1554,7 +1548,7 @@ rec_copy(
 
 	ut_memcpy(buf, rec - extra_len, extra_len + data_len);
 
-	return((byte*)buf + extra_len);
+	return((byte*) buf + extra_len);
 }
 
 /**********************************************************//**
@@ -1596,7 +1590,7 @@ rec_get_converted_size(
 	ut_ad(dtuple);
 	ut_ad(dtuple_check_typed(dtuple));
 
-	ut_ad(index->type & DICT_UNIVERSAL
+	ut_ad(dict_index_is_univ(index)
 	      || dtuple_get_n_fields(dtuple)
 	      == (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
 		   == REC_STATUS_NODE_PTR)
@@ -1616,6 +1610,41 @@ rec_get_converted_size(
 	extra_size = rec_get_converted_extra_size(
 		data_size, dtuple_get_n_fields(dtuple), n_ext);
 
+#if 0
+	/* This code is inactive since it may be the wrong place to add
+	in the size of node pointers used in parent pages AND it is not
+	currently needed since ha_innobase::max_supported_key_length()
+	ensures that the key size limit for each page size is well below
+	the actual limit ((free space on page / 4) - record overhead).
+	But those limits will need to be raised when InnoDB can
+	support multiple page sizes.  At that time, we will need
+	to consider the node pointer on these universal btrees. */
+
+	if (dict_index_is_univ(index)) {
+		/* This is for the insert buffer B-tree.
+		All fields in the leaf tuple ascend to the
+		parent node plus the child page pointer. */
+
+		/* ibuf cannot contain externally stored fields */
+		ut_ad(n_ext == 0);
+
+		/* Add the data pointer and recompute extra_size
+		based on one more field. */
+		data_size += REC_NODE_PTR_SIZE;
+		extra_size = rec_get_converted_extra_size(
+			data_size,
+			dtuple_get_n_fields(dtuple) + 1,
+			0);
+
+		/* Be sure dtuple->n_fields has this node ptr
+		accounted for.  This function should correspond to
+		what rec_convert_dtuple_to_rec() needs in storage.
+		In optimistic insert or update-not-in-place, we will
+		have to ensure that if the record is converted to a
+		node pointer, it will not become too large.*/
+	}
+#endif
+
 	return(data_size + extra_size);
 }
 
diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h
index 7afd595be90..2f1ead43c07 100644
--- a/storage/innobase/include/rem0types.h
+++ b/storage/innobase/include/rem0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -45,10 +45,21 @@ This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
 files would be at risk! */
 #define REC_ANTELOPE_MAX_INDEX_COL_LEN		768
 
-/** Maximum indexed field length for table format DICT_TF_FORMAT_ZIP and
+/** Maximum indexed field length for table format UNIV_FORMAT_B and
 beyond.
 This (3072) is the maximum index row length allowed, so we cannot create index
 prefix column longer than that. */
 #define REC_VERSION_56_MAX_INDEX_COL_LEN	3072
 
+/** Innodb row types are a subset of the MySQL global enum row_type.
+They are made into their own enum so that switch statements can account
+for each of them. */
+enum rec_format_enum {
+	REC_FORMAT_REDUNDANT	= 0,	/*!< REDUNDANT row format */
+	REC_FORMAT_COMPACT	= 1,	/*!< COMPACT row format */
+	REC_FORMAT_COMPRESSED	= 2,	/*!< COMPRESSED row format */
+	REC_FORMAT_DYNAMIC	= 3	/*!< DYNAMIC row format */
+};
+typedef enum rec_format_enum rec_format_t;
+
 #endif
diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h
index 557da2c4a82..60aaf16c09a 100644
--- a/storage/innobase/include/row0ext.h
+++ b/storage/innobase/include/row0ext.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0ext.ic b/storage/innobase/include/row0ext.ic
index 466046b2821..39e150d91d5 100644
--- a/storage/innobase/include/row0ext.ic
+++ b/storage/innobase/include/row0ext.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -48,7 +48,7 @@ row_ext_lookup_ith(
 	ut_ad(*len <= ext->max_len);
 	ut_ad(ext->max_len > 0);
 
-	if (UNIV_UNLIKELY(*len == 0)) {
+	if (*len == 0) {
 		/* The BLOB could not be fetched to the cache. */
 		return(field_ref_zero);
 	} else {
diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h
new file mode 100644
index 00000000000..cc5efea026f
--- /dev/null
+++ b/storage/innobase/include/row0ftsort.h
@@ -0,0 +1,287 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ftsort.h
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#ifndef row0ftsort_h
+#define row0ftsort_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "row0mysql.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "fts0priv.h"
+#include "row0merge.h"
+
+/** This structure defineds information the scan thread will fetch
+and put to the linked list for parallel tokenization/sort threads
+to process */
+typedef struct fts_doc_item     fts_doc_item_t;
+
+/** Information about temporary files used in merge sort */
+struct fts_doc_item {
+	dfield_t*	field;		/*!< field contains document string */
+	doc_id_t	doc_id;		/*!< document ID */
+	UT_LIST_NODE_T(fts_doc_item_t)	doc_list;
+					/*!< list of doc items */
+};
+
+/** This defines the list type that scan thread would feed the parallel
+tokenization threads and sort threads. */
+typedef UT_LIST_BASE_NODE_T(fts_doc_item_t)     fts_doc_list_t;
+
+#define FTS_NUM_AUX_INDEX	6
+#define FTS_PLL_MERGE		1
+
+/** Sort information passed to each individual parallel sort thread */
+typedef struct fts_psort_struct		fts_psort_t;
+
+/** Common info passed to each parallel sort thread */
+struct fts_psort_common_struct {
+	struct TABLE*		table;		/*!< MySQL table */
+	dict_table_t*		new_table;	/*!< source table */
+	trx_t*			trx;		/*!< transaction */
+	dict_index_t*		sort_index;	/*!< FTS index */
+	fts_psort_t*		all_info;	/*!< all parallel sort info */
+	os_event_t		sort_event;	/*!< sort event */
+	ibool			opt_doc_id_size;/*!< whether to use 4 bytes
+						instead of 8 bytes integer to
+						store Doc ID during sort, if
+						Doc ID will not be big enough
+						to use 8 bytes value */
+};
+
+typedef struct fts_psort_common_struct	fts_psort_common_t;
+
+struct fts_psort_struct {
+	ulint			psort_id;	/*!< Parallel sort ID */
+	row_merge_buf_t*	merge_buf[FTS_NUM_AUX_INDEX];
+						/*!< sort buffer */
+	merge_file_t*		merge_file[FTS_NUM_AUX_INDEX];
+						/*!< sort file */
+	row_merge_block_t*	merge_block[FTS_NUM_AUX_INDEX];
+						/*!< buffer to write to file */
+	row_merge_block_t*	block_alloc[FTS_NUM_AUX_INDEX];
+						/*!< buffer to allocated */
+	ulint			child_status;	/*!< child thread status */
+	ulint			state;		/*!< child thread state */
+	fts_doc_list_t		fts_doc_list;	/*!< doc list to process */
+	fts_psort_common_t*	psort_common;	/*!< ptr to all psort info */
+};
+
+/** Structure stores information from string tokenization operation */
+struct fts_tokenize_ctx {
+	ulint			processed_len;  /*!< processed string length */
+	ulint			init_pos;       /*!< doc start position */
+	ulint			buf_used;       /*!< the sort buffer (ID) when
+						tokenization stops, which
+						could due to sort buffer full */
+	ulint			rows_added[FTS_NUM_AUX_INDEX];
+						/*!< number of rows added for
+						each FTS index partition */
+	ib_rbt_t*		cached_stopword;/*!< in: stopword list */
+	dfield_t		sort_field[FTS_NUM_FIELDS_SORT];
+						/*!< in: sort field */
+};
+
+typedef struct fts_tokenize_ctx fts_tokenize_ctx_t;
+
+/** Structure stores information needed for the insertion phase of FTS
+parallel sort. */
+struct fts_psort_insert {
+	trx_t*		trx;		/*!< Transaction used for insertion */
+	que_t**		ins_graph;	/*!< insert graph */
+	fts_table_t	fts_table;	/*!< auxiliary table */
+	CHARSET_INFO*	charset;	/*!< charset info */
+	mem_heap_t*	heap;		/*!< heap */
+	ibool		opt_doc_id_size;/*!< Whether to use smaller (4 bytes)
+					integer for Doc ID */
+};
+
+typedef struct fts_psort_insert	fts_psort_insert_t;
+
+
+/** status bit used for communication between parent and child thread */
+#define FTS_PARENT_COMPLETE	1
+#define FTS_CHILD_COMPLETE	1
+
+/** Print some debug information */
+#define	FTSORT_PRINT
+
+#ifdef	FTSORT_PRINT
+#define	DEBUG_FTS_SORT_PRINT(str)		\
+	do {					\
+		ut_print_timestamp(stderr);	\
+		fprintf(stderr, str);		\
+	} while (0)
+#else
+#define DEBUG_FTS_SORT_PRINT(str)
+#endif	/* FTSORT_PRINT */
+
+/*************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID
+3) Word's position in original 'doc'.
+
+@return dict_index_t structure for the fts sort index */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+	dict_index_t*		index,	/*!< in: Original FTS index
+					based on which this sort index
+					is created */
+	const dict_table_t*	table,	/*!< in: table that FTS index
+					is being created on */
+	ibool*			opt_doc_id_size);
+					/*!< out: whether to use 4 bytes
+					instead of 8 bytes integer to
+					store Doc ID during sort */
+
+/********************************************************************//**
+Initialize FTS parallel sort structures.
+@return TRUE if all successful */
+UNIV_INTERN
+ibool
+row_fts_psort_info_init(
+/*====================*/
+	trx_t*			trx,	/*!< in: transaction */
+	struct TABLE*		table,	/*!< in: MySQL table object */
+	const dict_table_t*	new_table,/*!< in: table where indexes are
+					created */
+	dict_index_t*		index,	/*!< in: FTS index to be created */
+	ibool			opt_doc_id_size,
+					/*!< in: whether to use 4 bytes
+					instead of 8 bytes integer to
+					store Doc ID during sort */
+	fts_psort_t**		psort,	/*!< out: parallel sort info to be
+					instantiated */
+	fts_psort_t**		merge);	/*!< out: parallel merge info
+					to be instantiated */
+/********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close
+temparary merge sort files */
+UNIV_INTERN
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+	fts_psort_t*	psort_info,	/*!< parallel sort info */
+	fts_psort_t*	merge_info);	/*!< parallel merge info */
+/********************************************************************//**
+Free up merge buffers when merge sort is done */
+UNIV_INTERN
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+	fts_psort_t*	psort_info);	/*!< in: parallel sort info */
+
+/*********************************************************************//**
+Function performs parallel tokenization of the incoming doc strings.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_tokenization(
+/*======================*/
+	void*		arg);		/*!< in: psort_info for the thread */
+/*********************************************************************//**
+Start the parallel tokenization and parallel merge sort */
+UNIV_INTERN
+void
+row_fts_start_psort(
+/*================*/
+	fts_psort_t*	psort_info);	/*!< in: parallel sort info */
+/*********************************************************************//**
+Function performs the merge and insertion of the sorted records.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_merge(
+/*===============*/
+	void*		arg);		/*!< in: parallel merge info */
+/*********************************************************************//**
+Kick off the parallel merge and insert thread */
+UNIV_INTERN
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+	fts_psort_t*	merge_info);	/*!< in: parallel sort info */
+/********************************************************************//**
+Insert processed FTS data to the auxillary tables.
+@return DB_SUCCESS if insertion runs fine */
+UNIV_INTERN
+ulint
+row_merge_write_fts_word(
+/*=====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		ins_graph,	/*!< in: Insert query graphs */
+	fts_tokenizer_word_t*word,	/*!< in: sorted and tokenized
+					word */
+	fts_table_t*	fts_table,	/*!< in: fts aux table instance */
+	CHARSET_INFO*	charset);	/*!< in: charset */
+/********************************************************************//**
+Read sorted FTS data files and insert data tuples to auxillary tables.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+void
+row_fts_insert_tuple(
+/*=================*/
+	fts_psort_insert_t*
+			ins_ctx,        /*!< in: insert context */
+	fts_tokenizer_word_t* word,	/*!< in: last processed
+					tokenized word */
+	ib_vector_t*	positions,	/*!< in: word position */
+	doc_id_t*	in_doc_id,	/*!< in: last item doc id */
+	dtuple_t*	dtuple);	/*!< in: entry to insert */
+/********************************************************************//**
+Propagate a newly added record up one level in the selection tree
+@return parent where this value propagated to */
+UNIV_INTERN
+int
+row_merge_fts_sel_propagate(
+/*========================*/
+	int		propogated,	/*<! in: tree node propagated */
+	int*		sel_tree,	/*<! in: selection tree */
+	ulint		level,		/*<! in: selection tree level */
+	const mrec_t**	 mrec,		/*<! in: sort record */
+	ulint**		offsets,	/*<! in: record offsets */
+	dict_index_t*	index);		/*<! in: FTS index */
+/********************************************************************//**
+Read sorted file containing index data tuples and insert these data
+tuples to the index
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+row_fts_merge_insert(
+/*=================*/
+	dict_index_t*	index,		/*!< in: index */
+	dict_table_t*	table,		/*!< in: new table */
+	fts_psort_t*	psort_info,	/*!< parallel sort info */
+	ulint		id);		/* !< in: which auxiliary table's data
+					to insert to */
+
+#endif /* row0ftsort_h */
diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h
index 810973e61a7..54ad7241a4f 100644
--- a/storage/innobase/include/row0ins.h
+++ b/storage/innobase/include/row0ins.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0ins.ic b/storage/innobase/include/row0ins.ic
index 84f6da255bf..9c191d869a2 100644
--- a/storage/innobase/include/row0ins.ic
+++ b/storage/innobase/include/row0ins.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
index be7c77e7724..c4e2f5ddf41 100644
--- a/storage/innobase/include/row0merge.h
+++ b/storage/innobase/include/row0merge.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -38,6 +38,58 @@ Created 13/06/2005 Jan Lindstrom
 #include "btr0types.h"
 #include "row0mysql.h"
 #include "lock0types.h"
+#include "srv0srv.h"
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as UNIV_PAGE_SIZE / 2.  See the comment above
+ut_ad(data_size < sizeof(row_merge_block_t)). */
+typedef byte   row_merge_block_t;
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t.  Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte	mrec_buf_t[UNIV_PAGE_SIZE_MAX];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte	mrec_t;
+
+/** Buffer for sorting in main memory. */
+struct row_merge_buf_struct {
+	mem_heap_t*	heap;		/*!< memory heap where allocated */
+	dict_index_t*	index;		/*!< the index the tuples belong to */
+	ulint		total_size;	/*!< total amount of data bytes */
+	ulint		n_tuples;	/*!< number of data tuples */
+	ulint		max_tuples;	/*!< maximum number of data tuples */
+	const dfield_t**tuples;		/*!< array of pointers to
+					arrays of fields that form
+					the data tuples */
+	const dfield_t**tmp_tuples;	/*!< temporary copy of tuples,
+					for sorting */
+};
+
+/** Buffer for sorting in main memory. */
+typedef struct row_merge_buf_struct	row_merge_buf_t;
+
+/** Information about temporary files used in merge sort */
+struct merge_file_struct {
+	int		fd;		/*!< file descriptor */
+	ulint		offset;		/*!< file offset (end of file) */
+	ib_uint64_t	n_rec;		/*!< number of records in the file */
+};
+
+/** Information about temporary files used in merge sort */
+typedef struct merge_file_struct	merge_file_t;
 
 /** Index field definition */
 struct merge_index_field_struct {
@@ -47,7 +99,7 @@ struct merge_index_field_struct {
 };
 
 /** Index field definition */
-typedef struct merge_index_field_struct merge_index_field_t;
+typedef struct merge_index_field_struct	merge_index_field_t;
 
 /** Definition of an index being created */
 struct merge_index_def_struct {
@@ -60,7 +112,17 @@ struct merge_index_def_struct {
 };
 
 /** Definition of an index being created */
-typedef struct merge_index_def_struct merge_index_def_t;
+typedef struct merge_index_def_struct	merge_index_def_t;
+
+/** Structure for reporting duplicate records. */
+struct row_merge_dup_struct {
+	const dict_index_t*	index;		/*!< index being sorted */
+	struct TABLE*		table;		/*!< MySQL table object */
+	ulint			n_dup;		/*!< number of duplicates */
+};
+
+/** Structure for reporting duplicate records. */
+typedef struct row_merge_dup_struct row_merge_dup_t;
 
 /*********************************************************************//**
 Sets an exclusive lock on a table, for the duration of creating indexes.
@@ -95,7 +157,8 @@ row_merge_drop_indexes(
 	trx_t*		trx,		/*!< in: transaction */
 	dict_table_t*	table,		/*!< in: table containing the indexes */
 	dict_index_t**	index,		/*!< in: indexes to drop */
-	ulint		num_created);	/*!< in: number of elements in index[] */
+	ulint		num_created);	/*!< in: number of elements in
+					index[] */
 /*********************************************************************//**
 Drop all partially created indexes during crash recovery. */
 UNIV_INTERN
@@ -117,7 +180,6 @@ row_merge_rename_tables(
 					old_table->name */
 	const char*	tmp_name,	/*!< in: new name for old_table */
 	trx_t*		trx);		/*!< in: transaction handle */
-
 /*********************************************************************//**
 Create a temporary table for creating a primary key, using the definition
 of an existing table.
@@ -173,7 +235,6 @@ row_merge_drop_table(
 /*=================*/
 	trx_t*		trx,		/*!< in: transaction */
 	dict_table_t*	table);		/*!< in: table instance to drop */
-
 /*********************************************************************//**
 Build indexes on a table by reading a clustered index,
 creating a temporary file containing index entries, merge sorting
@@ -194,4 +255,130 @@ row_merge_build_indexes(
 	struct TABLE*	table);		/*!< in/out: MySQL table, for
 					reporting erroneous key value
 					if applicable */
+/********************************************************************//**
+Write a buffer to a block. */
+UNIV_INTERN
+void
+row_merge_buf_write(
+/*================*/
+	const row_merge_buf_t*	buf,	/*!< in: sorted buffer */
+	const merge_file_t*	of,	/*!< in: output file */
+	row_merge_block_t*	block);	/*!< out: buffer for writing to file */
+/********************************************************************//**
+Sort a buffer. */
+UNIV_INTERN
+void
+row_merge_buf_sort(
+/*===============*/
+        row_merge_buf_t*        buf,    /*!< in/out: sort buffer */
+        row_merge_dup_t*        dup);	/*!< in/out: for reporting duplicates */
+/********************************************************************//**
+Write a merge block to the file system.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+row_merge_write(
+/*============*/
+	int		fd,	/*!< in: file descriptor */
+	ulint		offset,	/*!< in: offset where to write,
+				in number of row_merge_block_t elements */
+	const void*	buf);	/*!< in: data */
+/********************************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+        row_merge_buf_t*        buf);    /*!< in,own: sort buffer */
+/*********************************************************************//**
+Create a merge file. */
+UNIV_INTERN
+void
+row_merge_file_create(
+/*==================*/
+        merge_file_t*   merge_file);     /*!< out: merge file structure */
+/*********************************************************************//**
+Merge disk files.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_sort(
+/*===========*/
+	trx_t*			trx,	/*!< in: transaction */
+	const dict_index_t*	index,	/*!< in: index being created */
+	merge_file_t*		file,	/*!< in/out: file containing
+					index entries */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	int*			tmpfd,	/*!< in/out: temporary file handle */
+	struct TABLE*		table);	/*!< in/out: MySQL table, for
+					reporting erroneous key value
+					if applicable */
+/*********************************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+        dict_index_t*   index);  /*!< in: secondary index */
+/*********************************************************************//**
+Deallocate a sort buffer. */
+UNIV_INTERN
+void
+row_merge_buf_free(
+/*===============*/
+	row_merge_buf_t*	buf);    /*!< in,own: sort buffer, to be freed */
+/*********************************************************************//**
+Destroy a merge file. */
+UNIV_INTERN
+void
+row_merge_file_destroy(
+/*===================*/
+	merge_file_t*	merge_file);	/*!< out: merge file structure */
+/*********************************************************************//**
+Compare two merge records.
+@return 1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */
+UNIV_INTERN
+int
+row_merge_cmp(
+/*==========*/
+	const mrec_t*		mrec1,		/*!< in: first merge
+						record to be compared */
+	const mrec_t*		mrec2,		/*!< in: second merge
+						record to be compared */
+	const ulint*		offsets1,	/*!< in: first record offsets */
+	const ulint*		offsets2,	/*!< in: second record offsets */
+	const dict_index_t*	index,		/*!< in: index */
+	ibool*			null_eq);	/*!< out: set to TRUE if
+						found matching null values */
+/********************************************************************//**
+Read a merge block from the file system.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+row_merge_read(
+/*===========*/
+	int			fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to read
+					in number of row_merge_block_t
+					elements */
+	row_merge_block_t*	buf);	/*!< out: data */
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+UNIV_INTERN __attribute__((nonnull))
+const byte*
+row_merge_read_rec(
+/*===============*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
+	const byte*		b,	/*!< in: pointer to record */
+	const dict_index_t*	index,	/*!< in: index of the record */
+	int			fd,	/*!< in: file descriptor */
+	ulint*			foffs,	/*!< in/out: file offset */
+	const mrec_t**		mrec,	/*!< out: pointer to merge record,
+					or NULL on end of list
+					(non-NULL on I/O error) */
+	ulint*			offsets);/*!< out: offsets of mrec */
 #endif /* row0merge.h */
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
index e17fd584110..17a29e38ec7 100644
--- a/storage/innobase/include/row0mysql.h
+++ b/storage/innobase/include/row0mysql.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2000, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2000, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -116,7 +116,7 @@ row_mysql_pad_col(
 /**************************************************************//**
 Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
 The counterpart of this function is row_sel_field_store_in_mysql_format() in
-row0sel.c.
+row0sel.cc.
 @return	up to which byte we used buf in the conversion */
 UNIV_INTERN
 byte*
@@ -127,7 +127,10 @@ row_mysql_store_col_in_innobase_format(
 					this function is called! */
 	byte*		buf,		/*!< in/out: buffer for a converted
 					integer value; this must be at least
-					col_len long then! */
+					col_len long then! NOTE that dfield
+					may also get a pointer to 'buf',
+					therefore do not discard this as long
+					as dfield is used! */
 	ibool		row_format_col,	/*!< TRUE if the mysql_data is from
 					a MySQL row, FALSE if from a MySQL
 					key value;
@@ -190,15 +193,6 @@ row_update_prebuilt_trx(
 					in MySQL handle */
 	trx_t*		trx);		/*!< in: transaction handle */
 /*********************************************************************//**
-Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
-function should be called at the the end of an SQL statement, by the
-connection thread that owns the transaction (trx->mysql_thd). */
-UNIV_INTERN
-void
-row_unlock_table_autoinc_for_mysql(
-/*===============================*/
-	trx_t*	trx);			/*!< in/out: transaction */
-/*********************************************************************//**
 Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
 AUTO_INC lock gives exclusive access to the auto-inc counter of the
 table. The lock is reserved only for the duration of an SQL statement.
@@ -415,7 +409,7 @@ row_table_add_foreign_constraints(
 					any foreign keys are found. */
 
 /*********************************************************************//**
-The master thread in srv0srv.c calls this regularly to drop tables which
+The master thread in srv0srv.cc calls this regularly to drop tables which
 we must drop in background after queries to them have ended. Such lazy
 dropping of tables is needed in ALTER TABLE on Unix.
 @return	how many tables dropped + remaining tables in list */
@@ -528,6 +522,20 @@ row_is_magic_monitor_table(
 	const char*	table_name);	/*!< in: name of the table, in the
 					form database/table_name */
 
+/*********************************************************************//**
+Initialize this module */
+UNIV_INTERN
+void
+row_mysql_init(void);
+/*================*/
+
+/*********************************************************************//**
+Close this module */
+UNIV_INTERN
+void
+row_mysql_close(void);
+/*=================*/
+
 /* A struct describing a place for an individual column in the MySQL
 row format which is presented to the table handler in ha_innobase.
 This template struct is used to speed up row transformations between
@@ -544,6 +552,10 @@ struct mysql_row_templ_struct {
 					Innobase record in the clustered index;
 					not defined if template_type is
 					ROW_MYSQL_WHOLE_ROW */
+	ulint	icp_rec_field_no;	/*!< field number of the column in an
+					Innobase record in the current index;
+					not defined unless
+					index condition pushdown is used */
 	ulint	mysql_col_offset;	/*!< offset of the column in the MySQL
 					row format */
 	ulint	mysql_col_len;		/*!< length of the column in the MySQL
@@ -686,6 +698,12 @@ struct row_prebuilt_struct {
 					generated, the row id of the
 					last row fetched is stored
 					here */
+	doc_id_t	fts_doc_id;	/* if the table has an FTS index on
+					it then we fetch the doc_id.
+					FTS-FIXME: Currently we fetch it always
+					but in the future we must only fetch
+					it when FTS columns are being
+					updated */
 	dtuple_t*	clust_ref;	/*!< prebuilt dtuple used in
 					sel/upd/del */
 	ulint		select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
@@ -762,6 +780,7 @@ struct row_prebuilt_struct {
 					to this heap */
 	mem_heap_t*	old_vers_heap;	/*!< memory heap where a previous
 					version is built in consistent read */
+	fts_result_t*	result;		/* The result of an FTS query */
 	/*----------------------*/
 	ulonglong	autoinc_last_value;
 					/*!< last value of AUTO-INC interval */
@@ -778,6 +797,13 @@ struct row_prebuilt_struct {
 					store it here so that we can return
 					it to MySQL */
 	/*----------------------*/
+	void*		idx_cond;	/*!< In ICP, pointer to a ha_innobase,
+					passed to innobase_index_cond().
+					NULL if index condition pushdown is
+					not used. */
+	ulint		idx_cond_n_cols;/*!< Number of fields in idx_cond_cols.
+					0 if and only if idx_cond == NULL. */
+	/*----------------------*/
 	ulint		magic_n2;	/*!< this should be the same as
 					magic_n */
 };
diff --git a/storage/innobase/include/row0mysql.ic b/storage/innobase/include/row0mysql.ic
index 35033aa2ad1..2eb60898c46 100644
--- a/storage/innobase/include/row0mysql.ic
+++ b/storage/innobase/include/row0mysql.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2001, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2001, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h
index 485d51dbc83..740771fa3eb 100644
--- a/storage/innobase/include/row0purge.h
+++ b/storage/innobase/include/row0purge.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -34,6 +34,8 @@ Created 3/14/1997 Heikki Tuuri
 #include "trx0types.h"
 #include "que0types.h"
 #include "row0types.h"
+#include "row0purge.h"
+#include "ut0vec.h"
 
 /********************************************************************//**
 Creates a purge node to a query graph.
@@ -42,8 +44,9 @@ UNIV_INTERN
 purge_node_t*
 row_purge_node_create(
 /*==================*/
-	que_thr_t*	parent,	/*!< in: parent node, i.e., a thr node */
-	mem_heap_t*	heap);	/*!< in: memory heap where created */
+	que_thr_t*	parent,		/*!< in: parent node, i.e., a
+					thr node */
+	mem_heap_t*	heap);		/*!< in: memory heap where created */
 /***********************************************************//**
 Determines if it is possible to remove a secondary index entry.
 Removal is possible if the secondary index entry does not refer to any
@@ -83,20 +86,16 @@ struct purge_node_struct{
 	/*----------------------*/
 	/* Local storage for this graph node */
 	roll_ptr_t	roll_ptr;/* roll pointer to undo log record */
-	trx_undo_rec_t*	undo_rec;/* undo log record */
-	trx_undo_inf_t*	reservation;/* reservation for the undo log record in
-				the purge array */
+	ib_vector_t*    undo_recs;/*!< Undo recs to purge */
+
 	undo_no_t	undo_no;/* undo number of the record */
+
 	ulint		rec_type;/* undo log record type: TRX_UNDO_INSERT_REC,
 				... */
-	btr_pcur_t	pcur;	/*!< persistent cursor used in searching the
-				clustered index record */
-	ibool		found_clust;/* TRUE if the clustered index record
-				determined by ref was found in the clustered
-				index, and we were able to position pcur on
-				it */
 	dict_table_t*	table;	/*!< table where purge is done */
+
 	ulint		cmpl_info;/* compiler analysis info of an update */
+
 	upd_t*		update;	/*!< update vector for a clustered index
 				record */
 	dtuple_t*	ref;	/*!< NULL, or row reference to the next row to
@@ -109,6 +108,14 @@ struct purge_node_struct{
 	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage for
 				row; this must be emptied after a successful
 				purge of a row */
+	ibool		found_clust;/* TRUE if the clustered index record
+				determined by ref was found in the clustered
+				index, and we were able to position pcur on
+				it */
+	btr_pcur_t	pcur;	/*!< persistent cursor used in searching the
+				clustered index record */
+	ibool		done;	/* Debug flag */
+
 };
 
 #ifndef UNIV_NONINL
diff --git a/storage/innobase/include/row0purge.ic b/storage/innobase/include/row0purge.ic
index 23d7d3845a4..700106d1048 100644
--- a/storage/innobase/include/row0purge.ic
+++ b/storage/innobase/include/row0purge.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h
index c2849be7c3e..cf253ab2347 100644
--- a/storage/innobase/include/row0row.h
+++ b/storage/innobase/include/row0row.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -294,10 +294,7 @@ row_search_index_entry(
 /* The allowed latching order of index records is the following:
 (1) a secondary index record ->
 (2) the clustered index record ->
-(3) rollback segment data for the clustered index record.
-
-No new latches may be obtained while the kernel mutex is reserved.
-However, the kernel mutex can be reserved while latches are owned. */
+(3) rollback segment data for the clustered index record. */
 
 /*******************************************************************//**
 Formats the raw data in "data" (in InnoDB on-disk format) using
diff --git a/storage/innobase/include/row0row.ic b/storage/innobase/include/row0row.ic
index 0b9ca982af8..8e9f3460519 100644
--- a/storage/innobase/include/row0row.ic
+++ b/storage/innobase/include/row0row.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
index 1c4ea6f7244..fa3c93b6b9a 100644
--- a/storage/innobase/include/row0sel.h
+++ b/storage/innobase/include/row0sel.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -179,6 +179,15 @@ row_search_check_if_query_cache_permitted(
 	trx_t*		trx,		/*!< in: transaction object */
 	const char*	norm_name);	/*!< in: concatenation of database name,
 					'/' char, table name */
+void
+row_create_key(
+/*===========*/
+	dtuple_t*	tuple,		/* in: tuple where to build;
+					NOTE: we assume that the type info
+					in the tuple is already according
+					to index! */
+	dict_index_t*	index,		/* in: index of the key value */
+	doc_id_t*	doc_id);	/* in: doc id to lookup.*/
 /*******************************************************************//**
 Read the max AUTOINC value from an index.
 @return	DB_SUCCESS if all OK else error code */
diff --git a/storage/innobase/include/row0sel.ic b/storage/innobase/include/row0sel.ic
index 5907f9913da..d83a3448832 100644
--- a/storage/innobase/include/row0sel.ic
+++ b/storage/innobase/include/row0sel.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -92,7 +92,7 @@ open_step(
 		}
 	}
 
-	if (UNIV_EXPECT(err, DB_SUCCESS) != DB_SUCCESS) {
+	if (err != DB_SUCCESS) {
 		/* SQL error detected */
 		fprintf(stderr, "SQL error %lu\n", (ulong) err);
 
diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h
index 7d6a7c8e2b1..463651b43b8 100644
--- a/storage/innobase/include/row0types.h
+++ b/storage/innobase/include/row0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0uins.h b/storage/innobase/include/row0uins.h
index 77b071c3a6b..5f3a7212ee1 100644
--- a/storage/innobase/include/row0uins.h
+++ b/storage/innobase/include/row0uins.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0uins.ic b/storage/innobase/include/row0uins.ic
index 27606150d8e..54da2e49874 100644
--- a/storage/innobase/include/row0uins.ic
+++ b/storage/innobase/include/row0uins.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0umod.h b/storage/innobase/include/row0umod.h
index ed44cc8d601..84831e59d90 100644
--- a/storage/innobase/include/row0umod.h
+++ b/storage/innobase/include/row0umod.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0umod.ic b/storage/innobase/include/row0umod.ic
index ea3fd3b43c7..00a8cd86e01 100644
--- a/storage/innobase/include/row0umod.ic
+++ b/storage/innobase/include/row0umod.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h
index 6eb4ca448b3..90a15172ae0 100644
--- a/storage/innobase/include/row0undo.h
+++ b/storage/innobase/include/row0undo.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0undo.ic b/storage/innobase/include/row0undo.ic
index dc788debc14..b97ffca590e 100644
--- a/storage/innobase/include/row0undo.ic
+++ b/storage/innobase/include/row0undo.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h
index c275c1da78e..a7687bb1ded 100644
--- a/storage/innobase/include/row0upd.h
+++ b/storage/innobase/include/row0upd.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -304,6 +304,37 @@ row_upd_changes_ord_field_binary_func(
 	row_upd_changes_ord_field_binary_func(index,update,row,ext)
 #endif /* UNIV_DEBUG */
 /***********************************************************//**
+Checks if an FTS indexed column is affected by an UPDATE.
+@return offset within fts_t::indexes if FTS indexed column updated else
+ULINT_UNDEFINED */
+UNIV_INTERN
+ulint
+row_upd_changes_fts_column(
+/*=======================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	upd_field);	/*!< in: field to check */
+/***********************************************************//**
+Checks if an FTS Doc ID column is affected by an UPDATE.
+@return TRUE if Doc ID column is affected */
+UNIV_INTERN
+ulint
+row_upd_changes_doc_id(
+/*===================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	upd_field);	/*!< in: field to check */
+/***********************************************************//**
+Checks if an update vector changes the table's FTS-indexed columns.
+NOTE: must not be called for tables which do not have an FTS-index.
+Also, the vector returned must be explicitly freed as it's allocated
+using the ut_malloc() allocator.
+@return vector of FTS indexes that were affected by the update else NULL */
+UNIV_INTERN
+ib_vector_t*
+row_upd_changes_fts_columns(
+/*========================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_t*		update);	/*!< in: update vector for the row */
+/***********************************************************//**
 Checks if an update vector changes an ordering field of an index record.
 This function is fast if the update vector is short or the number of ordering
 fields in the index is small. Otherwise, this can be quadratic.
@@ -369,7 +400,7 @@ row_upd_index_parse(
 struct upd_field_struct{
 	unsigned	field_no:16;	/*!< field number in an index, usually
 					the clustered index, but in updating
-					a secondary index record in btr0cur.c
+					a secondary index record in btr0cur.cc
 					this is the position in the secondary
 					index */
 #ifndef UNIV_HOTBACKUP
diff --git a/storage/innobase/include/row0upd.ic b/storage/innobase/include/row0upd.ic
index 10646241125..8a2543eaac9 100644
--- a/storage/innobase/include/row0upd.ic
+++ b/storage/innobase/include/row0upd.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -43,12 +43,12 @@ upd_create(
 {
 	upd_t*	update;
 
-	update = (upd_t*) mem_heap_alloc(heap, sizeof(upd_t));
+	update = (upd_t*) mem_heap_zalloc(heap, sizeof(upd_t));
 
 	update->info_bits = 0;
 	update->n_fields = n;
 	update->fields = (upd_field_t*)
-		mem_heap_alloc(heap, sizeof(upd_field_t) * n);
+		mem_heap_zalloc(heap, sizeof(upd_field_t) * n);
 
 	return(update);
 }
@@ -102,7 +102,7 @@ upd_field_set_field_no(
 	upd_field->field_no = field_no;
 	upd_field->orig_len = 0;
 
-	if (UNIV_UNLIKELY(field_no >= dict_index_get_n_fields(index))) {
+	if (field_no >= dict_index_get_n_fields(index)) {
 		fprintf(stderr,
 			"InnoDB: Error: trying to access field %lu in ",
 			(ulong) field_no);
@@ -157,7 +157,7 @@ row_upd_rec_sys_fields(
 	ut_ad(dict_index_is_clust(index));
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		ulint	pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
 		page_zip_write_trx_id_and_roll_ptr(page_zip, rec, offsets,
 						   pos, trx->id, roll_ptr);
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h
index 5a2e38230d5..d9e3471b3dc 100644
--- a/storage/innobase/include/row0vers.h
+++ b/storage/innobase/include/row0vers.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -37,13 +37,15 @@ Created 2/6/1997 Heikki Tuuri
 
 /*****************************************************************//**
 Finds out if an active transaction has inserted or modified a secondary
-index record. NOTE: the kernel mutex is temporarily released in this
-function!
-@return NULL if committed, else the active transaction */
+index record.
+@return 0 if committed, else the active transaction id;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active() while holding lock_sys->mutex. */
 UNIV_INTERN
-trx_t*
-row_vers_impl_x_locked_off_kernel(
-/*==============================*/
+trx_id_t
+row_vers_impl_x_locked(
+/*===================*/
 	const rec_t*	rec,	/*!< in: record in a secondary index */
 	dict_index_t*	index,	/*!< in: the secondary index */
 	const ulint*	offsets);/*!< in: rec_get_offsets(rec, index) */
diff --git a/storage/innobase/include/row0vers.ic b/storage/innobase/include/row0vers.ic
index 8bb3a5c0cb3..ef43a55bf70 100644
--- a/storage/innobase/include/row0vers.ic
+++ b/storage/innobase/include/row0vers.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/srv0conc.h b/storage/innobase/include/srv0conc.h
new file mode 100644
index 00000000000..9aee1b17bf0
--- /dev/null
+++ b/storage/innobase/include/srv0conc.h
@@ -0,0 +1,111 @@
+/*****************************************************************************
+
+Copyright (c) 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0conc.h
+
+InnoDB concurrency manager header file
+
+Created 2011/04/18 Sunny Bains
+*******************************************************/
+
+#ifndef srv_conc_h
+#define srv_conc_h
+
+/** We are prepared for a situation that we have this many threads waiting for
+a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
+value. */
+
+extern	ulint	srv_max_n_threads;
+
+/** The following controls how many threads we let inside InnoDB concurrently:
+threads waiting for locks are not counted into the number because otherwise
+we could get a deadlock. Value of 0 will disable the concurrency check. */
+
+extern ulong	srv_thread_concurrency;
+
+/*********************************************************************//**
+Initialise the concurrency management data structures */
+void
+srv_conc_init(void);
+/*===============*/
+
+/*********************************************************************//**
+Free the concurrency management data structures */
+void
+srv_conc_free(void);
+/*===============*/
+
+/*********************************************************************//**
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+	trx_t*	trx);		/*!< in: transaction object associated
+				with the thread */
+
+/*********************************************************************//**
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+	trx_t*	trx);		/*!< in: transaction object associated with
+				the thread */
+
+/*********************************************************************//**
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. */
+UNIV_INTERN
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+	trx_t*	trx);		/*!< in: transaction object associated with
+				the thread */
+
+/*********************************************************************//**
+Get the count of threads waiting inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_waiting_threads(void);
+/*==============================*/
+
+/*********************************************************************//**
+Get the count of threads active inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_active_threads(void);
+/*==============================*/
+
+#endif /* srv_conc_h */
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
new file mode 100644
index 00000000000..5e47f82f416
--- /dev/null
+++ b/storage/innobase/include/srv0mon.h
@@ -0,0 +1,817 @@
+/***********************************************************************
+
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/srv0mon.h
+Server monitor counter related defines
+
+Created 12/15/2009	Jimmy Yang
+*******************************************************/
+
+#ifndef srv0mon_h
+#define srv0mon_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+
+
+/** Possible status values for "mon_status" in "struct monitor_value" */
+enum monitor_running_status {
+	MONITOR_STARTED = 1,	/*!< Monitor has been turned on */
+	MONITOR_STOPPED = 2	/*!< Monitor has been turned off */
+};
+
+typedef enum monitor_running_status	monitor_running_t;
+
+/** Monitor counter value type */
+typedef	ib_int64_t			mon_type_t;
+
+/** Two monitor structures are defined in this file. One is
+"monitor_value_t" which contains dynamic counter values for each
+counter. The other is "monitor_info_t", which contains
+static information (counter name, desc etc.) for each counter.
+In addition, an enum datatype "monitor_id_t" is also defined,
+it identifies each monitor with an internally used symbol, whose
+integer value indexes into above two structure for its dynamic
+and static information.
+Developer who intend to add new counters would require to
+fill in counter information as described in "monitor_info_t" and
+create the internal counter ID in "monitor_id_t". */
+
+/** Structure containing the actual values of a monitor counter. */
+struct monitor_value_struct {
+	ib_time_t	mon_start_time;	/*!< Start time of monitoring  */
+	ib_time_t	mon_stop_time;	/*!< Stop time of monitoring */
+	ib_time_t	mon_reset_time;	/*!< Time counter resetted */
+	mon_type_t	mon_value;	/*!< Current counter Value */
+	mon_type_t	mon_max_value;	/*!< Current Max value */
+	mon_type_t	mon_min_value;	/*!< Current Min value */
+	mon_type_t	mon_value_reset;/*!< value at last reset */
+	mon_type_t	mon_max_value_start; /*!< Max value since start */
+	mon_type_t	mon_min_value_start; /*!< Min value since start */
+	mon_type_t	mon_start_value;/*!< Value at the start time */
+	mon_type_t	mon_last_value;	/*!< Last set of values */
+	monitor_running_t mon_status;	/* whether monitor still running */
+};
+
+typedef struct monitor_value_struct	monitor_value_t;
+
+/** Follwoing defines are possible values for "monitor_type" field in
+"struct monitor_info" */
+enum monitor_type_value {
+	MONITOR_NONE = 0,	/*!< No monitoring */
+	MONITOR_MODULE = 1,	/*!< This is a monitor module type,
+				not a counter */
+	MONITOR_EXISTING = 2,	/*!< The monitor carries information from
+				an existing system status variable */
+	MONITOR_NO_AVERAGE = 4,	/*!< Set this status if we don't want to
+				calculate the average value for the counter */
+	MONITOR_DISPLAY_CURRENT = 8, /*!< Display current value of the
+				counter, rather than incremental value
+				over the period. Mostly for counters
+				displaying current resource usage */
+	MONITOR_GROUP_MODULE = 16, /*!< Monitor can be turned on/off
+				only as a module, but not individually */
+	MONITOR_DEFAULT_ON = 32,/*!< Monitor will be turned on by default at
+				server start up */
+	MONITOR_SET_OWNER = 64,	/*!< Owner of "monitor set", a set of
+				monitor counters */
+	MONITOR_SET_MEMBER = 128,/*!< Being part of a "monitor set" */
+	MONITOR_HIDDEN = 256	/*!< Do not display this monitor in the
+				metrics table */
+};
+
+typedef enum monitor_type_value	monitor_type_t;
+
+/** Counter minimum value is initialized to be max value of
+ mon_type_t (ib_int64_t) */
+#define	MIN_RESERVED		((mon_type_t) (IB_ULONGLONG_MAX >> 1))
+#define	MAX_RESERVED		(~MIN_RESERVED)
+
+/** This enumeration defines internal monitor identifier used internally
+to identify each particular counter. Its value indexes into two arrays,
+one is the "innodb_counter_value" array which records actual monitor
+counter values, the other is "innodb_counter_info" array which describes
+each counter's basic information (name, desc etc.). A couple of
+naming rules here:
+1) If the monitor defines a module, it starts with MONITOR_MODULE
+2) If the monitor uses exisitng counters from "status variable", its ID
+name shall start with MONITOR_OVLD
+
+Please refer to "innodb_counter_info" in srv/srv0mon.cc for detail
+information for each monitor counter */
+
+enum monitor_id_value {
+	/* This is to identify the default value set by the metrics
+	control global variables */
+	MONITOR_DEFAULT_START = 0,
+
+	/* Start of Metadata counter */
+	MONITOR_MODULE_METADATA,
+	MONITOR_TABLE_OPEN,
+	MONITOR_TABLE_CLOSE,
+	MONITOR_TABLE_REFERENCE,
+	MONITOR_OVLD_META_MEM_POOL,
+
+	/* Lock manager related counters */
+	MONITOR_MODULE_LOCK,
+	MONITOR_DEADLOCK,
+	MONITOR_TIMEOUT,
+	MONITOR_LOCKREC_WAIT,
+	MONITOR_TABLELOCK_WAIT,
+	MONITOR_NUM_RECLOCK_REQ,
+	MONITOR_RECLOCK_CREATED,
+	MONITOR_RECLOCK_REMOVED,
+	MONITOR_NUM_RECLOCK,
+	MONITOR_TABLELOCK_CREATED,
+	MONITOR_TABLELOCK_REMOVED,
+	MONITOR_NUM_TABLELOCK,
+	MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT,
+	MONITOR_OVLD_LOCK_WAIT_TIME,
+	MONITOR_OVLD_LOCK_MAX_WAIT_TIME,
+	MONITOR_OVLD_ROW_LOCK_WAIT,
+	MONITOR_OVLD_LOCK_AVG_WAIT_TIME,
+
+	/* Buffer and I/O realted counters. */
+	MONITOR_MODULE_BUFFER,
+	MONITOR_OVLD_BUFFER_POOL_SIZE,
+	MONITOR_OVLD_BUF_POOL_READS,
+	MONITOR_OVLD_BUF_POOL_READ_REQUESTS,
+	MONITOR_OVLD_BUF_POOL_WRITE_REQUEST,
+	MONITOR_PAGE_INFLUSH,
+	MONITOR_OVLD_BUF_POOL_WAIT_FREE,
+	MONITOR_OVLD_BUF_POOL_READ_AHEAD,
+	MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED,
+	MONITOR_OVLD_BUF_POOL_PAGE_TOTAL,
+	MONITOR_OVLD_BUF_POOL_PAGE_MISC,
+	MONITOR_OVLD_BUF_POOL_PAGES_DATA,
+	MONITOR_OVLD_BUF_POOL_PAGES_DIRTY,
+	MONITOR_OVLD_BUF_POOL_PAGES_FREE,
+	MONITOR_OVLD_PAGE_CREATED,
+	MONITOR_OVLD_PAGES_WRITTEN,
+	MONITOR_OVLD_PAGES_READ,
+	MONITOR_OVLD_BYTE_READ,
+	MONITOR_OVLD_BYTE_WRITTEN,
+	MONITOR_FLUSH_BATCH_SCANNED,
+	MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+	MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+	MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+	MONITOR_FLUSH_BATCH_COUNT,
+	MONITOR_FLUSH_BATCH_PAGES,
+	MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+	MONITOR_FLUSH_NEIGHBOR_COUNT,
+	MONITOR_FLUSH_NEIGHBOR_PAGES,
+	MONITOR_FLUSH_MAX_DIRTY_TOTAL_PAGE,
+	MONITOR_FLUSH_MAX_DIRTY_COUNT,
+	MONITOR_FLUSH_MAX_DIRTY_PAGES,
+	MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+	MONITOR_FLUSH_ADAPTIVE_COUNT,
+	MONITOR_FLUSH_ADAPTIVE_PAGES,
+	MONITOR_FLUSH_ASYNC_TOTAL_PAGE,
+	MONITOR_FLUSH_ASYNC_COUNT,
+	MONITOR_FLUSH_ASYNC_PAGES,
+	MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+	MONITOR_FLUSH_SYNC_COUNT,
+	MONITOR_FLUSH_SYNC_PAGES,
+	MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+	MONITOR_FLUSH_BACKGROUND_COUNT,
+	MONITOR_FLUSH_BACKGROUND_PAGES,
+	MONITOR_LRU_BATCH_SCANNED,
+	MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+	MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+	MONITOR_LRU_BATCH_TOTAL_PAGE,
+	MONITOR_LRU_BATCH_COUNT,
+	MONITOR_LRU_BATCH_PAGES,
+	MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+	MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+	MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
+	MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT,
+	MONITOR_LRU_GET_FREE_SEARCH,
+	MONITOR_LRU_SEARCH_SCANNED,
+	MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+	MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+	MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+	MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+	MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+
+	/* Buffer Page I/O specific counters. */
+	MONITOR_MODULE_BUF_PAGE,
+	MONITOR_INDEX_LEAF_PAGE_READ,
+	MONITOR_INDEX_NON_LEAF_PAGE_READ,
+	MONITOR_INDEX_IBUF_LEAF_PAGE_READ,
+	MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ,
+	MONITOR_UNDO_LOG_PAGE_READ,
+	MONITOR_INODE_PAGE_READ,
+	MONITOR_IBUF_FREELIST_PAGE_READ,
+	MONITOR_IBUF_BITMAP_PAGE_READ,
+	MONITOR_SYSTEM_PAGE_READ,
+	MONITOR_TRX_SYSTEM_PAGE_READ,
+	MONITOR_FSP_HDR_PAGE_READ,
+	MONITOR_XDES_PAGE_READ,
+	MONITOR_BLOB_PAGE_READ,
+	MONITOR_ZBLOB_PAGE_READ,
+	MONITOR_ZBLOB2_PAGE_READ,
+	MONITOR_OTHER_PAGE_READ,
+	MONITOR_INDEX_LEAF_PAGE_WRITTEN,
+	MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN,
+	MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN,
+	MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN,
+	MONITOR_UNDO_LOG_PAGE_WRITTEN,
+	MONITOR_INODE_PAGE_WRITTEN,
+	MONITOR_IBUF_FREELIST_PAGE_WRITTEN,
+	MONITOR_IBUF_BITMAP_PAGE_WRITTEN,
+	MONITOR_SYSTEM_PAGE_WRITTEN,
+	MONITOR_TRX_SYSTEM_PAGE_WRITTEN,
+	MONITOR_FSP_HDR_PAGE_WRITTEN,
+	MONITOR_XDES_PAGE_WRITTEN,
+	MONITOR_BLOB_PAGE_WRITTEN,
+	MONITOR_ZBLOB_PAGE_WRITTEN,
+	MONITOR_ZBLOB2_PAGE_WRITTEN,
+	MONITOR_OTHER_PAGE_WRITTEN,
+
+	/* OS level counters (I/O) */
+	MONITOR_MODULE_OS,
+	MONITOR_OVLD_OS_FILE_READ,
+	MONITOR_OVLD_OS_FILE_WRITE,
+	MONITOR_OVLD_OS_FSYNC,
+	MONITOR_OS_PENDING_READS,
+	MONITOR_OS_PENDING_WRITES,
+	MONITOR_OVLD_OS_LOG_WRITTEN,
+	MONITOR_OVLD_OS_LOG_FSYNC,
+	MONITOR_OVLD_OS_LOG_PENDING_FSYNC,
+	MONITOR_OVLD_OS_LOG_PENDING_WRITES,
+
+	/* Transaction related counters */
+	MONITOR_MODULE_TRX,
+	MONITOR_TRX_RW_COMMIT,
+	MONITOR_TRX_RO_COMMIT,
+	MONITOR_TRX_NL_RO_COMMIT,
+	MONITOR_TRX_COMMIT_UNDO,
+	MONITOR_TRX_ROLLBACK,
+	MONITOR_TRX_ROLLBACK_SAVEPOINT,
+	MONITOR_TRX_ROLLBACK_ACTIVE,
+	MONITOR_TRX_ACTIVE,
+	MONITOR_RSEG_HISTORY_LEN,
+	MONITOR_NUM_UNDO_SLOT_USED,
+	MONITOR_NUM_UNDO_SLOT_CACHED,
+	MONITOR_RSEG_CUR_SIZE,
+
+	/* Purge related counters */
+	MONITOR_MODULE_PURGE,
+	MONITOR_N_DEL_ROW_PURGE,
+	MONITOR_N_UPD_EXIST_EXTERN,
+	MONITOR_PURGE_INVOKED,
+	MONITOR_PURGE_N_PAGE_HANDLED,
+	MONITOR_DML_PURGE_DELAY,
+	MONITOR_PURGE_STOP_COUNT,
+	MONITOR_PURGE_RESUME_COUNT,
+
+	/* Recovery related counters */
+	MONITOR_MODULE_RECOVERY,
+	MONITOR_NUM_CHECKPOINT,
+	MONITOR_OVLD_LSN_FLUSHDISK,
+	MONITOR_OVLD_LSN_CHECKPOINT,
+	MONITOR_OVLD_LSN_CURRENT,
+	MONITOR_LSN_CHECKPOINT_AGE,
+	MONITOR_OVLD_BUF_OLDEST_LSN,
+	MONITOR_OVLD_MAX_AGE_ASYNC,
+	MONITOR_OVLD_MAX_AGE_SYNC,
+	MONITOR_PENDING_LOG_WRITE,
+	MONITOR_PENDING_CHECKPOINT_WRITE,
+	MONITOR_LOG_IO,
+	MONITOR_OVLD_LOG_WAITS,
+	MONITOR_OVLD_LOG_WRITE_REQUEST,
+	MONITOR_OVLD_LOG_WRITES,
+
+	/* Page Manager related counters */
+	MONITOR_MODULE_PAGE,
+	MONITOR_PAGE_COMPRESS,
+	MONITOR_PAGE_DECOMPRESS,
+
+	/* Index related counters */
+	MONITOR_MODULE_INDEX,
+	MONITOR_INDEX_SPLIT,
+	MONITOR_INDEX_MERGE,
+
+	/* Adaptive Hash Index related counters */
+	MONITOR_MODULE_ADAPTIVE_HASH,
+	MONITOR_OVLD_ADAPTIVE_HASH_SEARCH,
+	MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE,
+	MONITOR_ADAPTIVE_HASH_PAGE_ADDED,
+	MONITOR_ADAPTIVE_HASH_PAGE_REMOVED,
+	MONITOR_ADAPTIVE_HASH_ROW_ADDED,
+	MONITOR_ADAPTIVE_HASH_ROW_REMOVED,
+	MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND,
+	MONITOR_ADAPTIVE_HASH_ROW_UPDATED,
+
+	/* Tablespace related counters */
+	MONITOR_MODULE_FIL_SYSTEM,
+	MONITOR_OVLD_N_FILE_OPENED,
+
+	/* InnoDB Change Buffer related counters */
+	MONITOR_MODULE_IBUF_SYSTEM,
+	MONITOR_OVLD_IBUF_MERGE_INSERT,
+	MONITOR_OVLD_IBUF_MERGE_DELETE,
+	MONITOR_OVLD_IBUF_MERGE_PURGE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE,
+	MONITOR_OVLD_IBUF_MERGES,
+	MONITOR_OVLD_IBUF_SIZE,
+
+	/* Counters for server operations */
+	MONITOR_MODULE_SERVER,
+	MONITOR_MASTER_THREAD_SLEEP,
+	MONITOR_OVLD_SERVER_ACTIVITY,
+	MONITOR_MASTER_ACTIVE_LOOPS,
+	MONITOR_MASTER_IDLE_LOOPS,
+	MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
+	MONITOR_SRV_IBUF_MERGE_MICROSECOND,
+	MONITOR_SRV_LOG_FLUSH_MICROSECOND,
+	MONITOR_SRV_MEM_VALIDATE_MICROSECOND,
+	MONITOR_SRV_PURGE_MICROSECOND,
+	MONITOR_SRV_DICT_LRU_MICROSECOND,
+	MONITOR_SRV_CHECKPOINT_MICROSECOND,
+	MONITOR_OVLD_SRV_DBLWR_WRITES,
+	MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
+	MONITOR_OVLD_SRV_PAGE_SIZE,
+	MONITOR_OVLD_RWLOCK_S_SPIN_WAITS,
+	MONITOR_OVLD_RWLOCK_X_SPIN_WAITS,
+	MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS,
+	MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS,
+	MONITOR_OVLD_RWLOCK_S_OS_WAITS,
+	MONITOR_OVLD_RWLOCK_X_OS_WAITS,
+
+	/* Data DML related counters */
+	MONITOR_MODULE_DML_STATS,
+	MONITOR_OLVD_ROW_READ,
+	MONITOR_OLVD_ROW_INSERTED,
+	MONITOR_OLVD_ROW_DELETED,
+	MONITOR_OLVD_ROW_UPDTATED,
+
+	/* Data DDL related counters */
+	MONITOR_MODULE_DDL_STATS,
+	MONITOR_BACKGROUND_DROP_TABLE,
+
+	MONITOR_MODULE_ICP,
+	MONITOR_ICP_ATTEMPTS,
+	MONITOR_ICP_NO_MATCH,
+	MONITOR_ICP_OUT_OF_RANGE,
+	MONITOR_ICP_MATCH,
+
+	/* This is used only for control system to turn
+	on/off and reset all monitor counters */
+	MONITOR_ALL_COUNTER,
+
+	/* This must be the last member */
+	NUM_MONITOR
+};
+
+typedef enum monitor_id_value		monitor_id_t;
+
+/** This informs the monitor control system to turn
+on/off and reset monitor counters through wild card match */
+#define	MONITOR_WILDCARD_MATCH		(NUM_MONITOR + 1)
+
+/** Cannot find monitor counter with a specified name */
+#define	MONITOR_NO_MATCH		(NUM_MONITOR + 2)
+
+/** struct monitor_info describes the basic/static information
+about each monitor counter. */
+struct monitor_info_struct {
+	const char*	monitor_name;	/*!< Monitor name */
+	const char*	monitor_module;	/*!< Sub Module the monitor
+					belongs to */
+	const char*	monitor_desc;	/*!< Brief desc of monitor counter */
+	monitor_type_t	monitor_type;	/*!< Type of Monitor Info */
+	monitor_id_t	monitor_related_id;/*!< Monitor ID of counter that
+					related to this monitor. This is
+					set when the monitor belongs to
+					a "monitor set" */
+	monitor_id_t	monitor_id;	/*!< Monitor ID as defined in enum
+					monitor_id_t */
+};
+
+typedef struct monitor_info_struct	monitor_info_t;
+
+/** Following are the "set_option" values allowed for
+srv_mon_process_existing_counter() and srv_mon_process_existing_counter()
+functions. To turn on/off/reset the monitor counters. */
+enum mon_set_option {
+	MONITOR_TURN_ON = 1,		/*!< Turn on the counter */
+	MONITOR_TURN_OFF,		/*!< Turn off the counter */
+	MONITOR_RESET_VALUE,		/*!< Reset current values */
+	MONITOR_RESET_ALL_VALUE,	/*!< Reset all values */
+	MONITOR_GET_VALUE		/*!< Option for
+					srv_mon_process_existing_counter()
+					function */
+};
+
+typedef enum mon_set_option		mon_option_t;
+
+/** Number of bit in a ulint datatype */
+#define	NUM_BITS_ULINT	(sizeof(ulint) * CHAR_BIT)
+
+/** This "monitor_set_tbl" is a bitmap records whether a particular monitor
+counter has been turned on or off */
+extern ulint		monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) /
+					NUM_BITS_ULINT];
+
+/** Macros to turn on/off the control bit in monitor_set_tbl for a monitor
+counter option. */
+#define MONITOR_ON(monitor)				\
+	(monitor_set_tbl[monitor / NUM_BITS_ULINT] |=	\
+			((ulint)1 << (monitor % NUM_BITS_ULINT)))
+
+#define MONITOR_OFF(monitor)				\
+	(monitor_set_tbl[monitor / NUM_BITS_ULINT] &=	\
+			~((ulint)1 << (monitor % NUM_BITS_ULINT)))
+
+/** Check whether the requested monitor is turned on/off */
+#define MONITOR_IS_ON(monitor)				\
+	(monitor_set_tbl[monitor / NUM_BITS_ULINT] &	\
+			((ulint)1 << (monitor % NUM_BITS_ULINT)))
+
+/** The actual monitor counter array that records each monintor counter
+value */
+extern monitor_value_t	 innodb_counter_value[NUM_MONITOR];
+
+/** Following are macro defines for basic montior counter manipulations.
+Please note we do not provide any synchronization for these monitor
+operations due to performance consideration. Most counters can
+be placed under existing mutex protections in respective code
+module. */
+
+/** Macros to access various fields of a monitor counters */
+#define MONITOR_FIELD(monitor, field)			\
+		(innodb_counter_value[monitor].field)
+
+#define MONITOR_VALUE(monitor)				\
+		MONITOR_FIELD(monitor, mon_value)
+
+#define MONITOR_MAX_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_max_value)
+
+#define MONITOR_MIN_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_min_value)
+
+#define MONITOR_VALUE_RESET(monitor)			\
+		MONITOR_FIELD(monitor, mon_value_reset)
+
+#define MONITOR_MAX_VALUE_START(monitor)		\
+		MONITOR_FIELD(monitor, mon_max_value_start)
+
+#define MONITOR_MIN_VALUE_START(monitor)		\
+		MONITOR_FIELD(monitor, mon_min_value_start)
+
+#define MONITOR_LAST_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_last_value)
+
+#define MONITOR_START_VALUE(monitor)			\
+		MONITOR_FIELD(monitor, mon_start_value)
+
+#define MONITOR_VALUE_SINCE_START(monitor)		\
+		(MONITOR_VALUE(monitor) + MONITOR_VALUE_RESET(monitor))
+
+#define MONITOR_STATUS(monitor)				\
+		MONITOR_FIELD(monitor, mon_status)
+
+#define MONITOR_SET_START(monitor)					\
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STARTED;		\
+		MONITOR_FIELD((monitor), mon_start_time) = time(NULL);	\
+	} while (0)
+
+#define MONITOR_SET_OFF(monitor)					\
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STOPPED;		\
+		MONITOR_FIELD((monitor), mon_stop_time) = time(NULL);	\
+	} while (0)
+
+#define	MONITOR_INIT_ZERO_VALUE		0
+
+/** Max and min values are initialized when we first turn on the monitor
+counter, and set the MONITOR_STATUS. */
+#define MONITOR_MAX_MIN_NOT_INIT(monitor)				\
+		(MONITOR_STATUS(monitor) == MONITOR_INIT_ZERO_VALUE	\
+		 && MONITOR_MIN_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE \
+		 && MONITOR_MAX_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE)
+
+#define MONITOR_INIT(monitor)						\
+	if (MONITOR_MAX_MIN_NOT_INIT(monitor)) {			\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+	}
+
+/** Macros to increment/decrement the counters. The normal
+monitor counter operation expects appropriate synchronization
+already exists. No additional mutex is necessary when operating
+on the counters */
+#define	MONITOR_INC(monitor)						\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor)++;				\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+#ifdef HAVE_ATOMIC_BUILTINS
+
+# define MONITOR_ATOMIC_INC(monitor)					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		ib_uint64_t	value;					\
+		value  = os_atomic_increment_uint64(			\
+			(ib_uint64_t*) &MONITOR_VALUE(monitor),	 1);	\
+		/* Note: This is not 100% accurate because of the	\
+		inherent race, we ignore it due to performance. */	\
+		if (value > (ib_uint64_t) MONITOR_MAX_VALUE(monitor)) {	\
+			MONITOR_MAX_VALUE(monitor) = value;		\
+		}							\
+	}
+
+# define MONITOR_ATOMIC_DEC(monitor)					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		ib_uint64_t	value;					\
+		value = os_atomic_decrement_ulint(			\
+			(ib_uint64_t*) &MONITOR_VALUE(monitor), 1);	\
+		/* Note: This is not 100% accurate because of the	\
+		inherent race, we ignore it due to performance. */	\
+		if (value < (ib_uint64_t) MONITOR_MIN_VALUE(monitor)) {	\
+			MONITOR_MIN_VALUE(monitor) = value;		\
+		}							\
+	}
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+#define	MONITOR_DEC(monitor)						\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor)--;				\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+#define	MONITOR_INC_VALUE(monitor, value)				\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+#define	MONITOR_DEC_VALUE(monitor, value)				\
+	if (MONITOR_IS_ON(monitor)) {					\
+		ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value);	\
+		MONITOR_VALUE(monitor) -= (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/* Increment/decrement counter without check the monitor on/off bit, which
+could already be checked as a module group */
+#define	MONITOR_INC_NOCHECK(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor)++;				\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	} while (0)							\
+
+#define	MONITOR_DEC_NOCHECK(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor)--;				\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	} while (0)
+
+/** Directly set a monitor counter's value */
+#define	MONITOR_SET(monitor, value)					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Add time difference between now and input "value" (in seconds) to the
+monitor counter
+@monitor	monitor to update for the time difference
+@value		the start time value */
+#define	MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value)			\
+	if (MONITOR_IS_ON(monitor)) {					\
+		ullint	old_time = (value);				\
+		value = ut_time_us(NULL);				\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value - old_time);\
+	}
+
+/** This macro updates 3 counters in one call. However, it only checks the
+main/first monitor counter 'monitor', to see it is on or off to decide
+whether to do the update.
+@monitor		the main monitor counter to update. It accounts for
+			the accumulative value for the counter.
+@monitor_n_calls	counter that counts number of times this macro is
+			called
+@monitor_per_call	counter that records the current and max value of
+			each incremental value
+@value			incremental value to record this time */
+#define MONITOR_INC_VALUE_CUMULATIVE(					\
+		monitor, monitor_n_calls, monitor_per_call, value)	\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor_n_calls)++;			\
+		MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor_per_call)			\
+		    > MONITOR_MAX_VALUE(monitor_per_call)) {		\
+			MONITOR_MAX_VALUE(monitor_per_call) =		\
+				 (mon_type_t) (value);			\
+		}							\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Directly set a monitor counter's value, and if the value
+is monotonically increasing, only max value needs to be updated */
+#define	MONITOR_SET_UPD_MAX_ONLY(monitor, value)			\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Some values such as log sequence number are montomically increasing
+number, do not need to record max/min values */
+#define MONITOR_SET_SIMPLE(monitor, value)				\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+	}
+
+/** Reset the monitor value and max/min value to zero. The reset
+operation would only be conducted when the counter is turned off */
+#define MONITOR_RESET_ALL(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_VALUE_RESET(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_start_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_stop_time) =			\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_reset_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+	} while (0)
+
+/** Following four macros defines necessary operations to fetch and
+consolidate information from existing system status variables. */
+
+/** Save the passed-in value to mon_start_value field of monitor
+counters */
+#define MONITOR_SAVE_START(monitor, value)				\
+	(MONITOR_START_VALUE(monitor) =					\
+		 (mon_type_t) (value) - MONITOR_VALUE_RESET(monitor))
+
+/** Save the passed-in value to mon_last_value field of monitor
+counters */
+#define MONITOR_SAVE_LAST(monitor)					\
+	do {								\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_VALUE(monitor);	\
+		MONITOR_START_VALUE(monitor) += MONITOR_VALUE(monitor);	\
+	} while (0)
+
+/** Set monitor value to the difference of value and mon_start_value
+compensated by mon_last_value if accumulated value is required. */
+#define MONITOR_SET_DIFF(monitor, value)				\
+	MONITOR_SET_UPD_MAX_ONLY(monitor, ((value)			\
+	- MONITOR_VALUE_RESET(monitor)					\
+	- MONITOR_FIELD(monitor, mon_start_value)			\
+	+ MONITOR_FIELD(monitor, mon_last_value)))
+
+/****************************************************************//**
+Get monitor's monitor_info_t by its monitor id (index into the
+innodb_counter_info array
+@return	Point to corresponding monitor_info_t, or NULL if no such
+monitor */
+UNIV_INTERN
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+/****************************************************************//**
+Get monitor's name by its monitor id (index into the
+innodb_counter_info array
+@return	corresponding monitor name, or NULL if no such
+monitor */
+UNIV_INTERN
+const char*
+srv_mon_get_name(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+
+/****************************************************************//**
+Turn on/off/reset monitor counters in a module. If module_value
+is NUM_MONITOR then turn on all monitor counters.
+@return	0 if successful, or the first monitor that cannot be
+turned on because it is already turned on. */
+UNIV_INTERN
+void
+srv_mon_set_module_control(
+/*=======================*/
+	monitor_id_t	module_id,	/*!< in: Module ID as in
+					monitor_counter_id. If it is
+					set to NUM_MONITOR, this means
+					we shall turn on all the counters */
+	mon_option_t	set_option);	/*!< in: Turn on/off reset the
+					counter */
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and do appropriate
+mathematics to deduct the actual value. */
+UNIV_INTERN
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+	monitor_id_t	monitor_id,	/*!< in: the monitor's ID as in
+					monitor_counter_id */
+	mon_option_t	set_option);	/*!< in: Turn on/off reset the
+					counter */
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of monitor counter
+@return	max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of monitor counter
+@return	min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id*/
+/*************************************************************//**
+Reset a monitor, create a new base line with the current monitor
+value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */
+UNIV_INTERN
+void
+srv_mon_reset(
+/*==========*/
+	monitor_id_t	monitor);	/*!< in: monitor id*/
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+	monitor_id_t	monitor);	/*!< in: monitor id*/
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+UNIV_INTERN
+void
+srv_mon_default_on(void);
+/*====================*/
+
+#ifndef UNIV_NONINL
+#include "srv0mon.ic"
+#endif
+#else /* !UNIV_HOTBACKUP */
+# define MONITOR_INC(x)		((void) 0)
+# define MONITOR_DEC(x)		((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/srv0mon.ic b/storage/innobase/include/srv0mon.ic
new file mode 100644
index 00000000000..17411d77a8b
--- /dev/null
+++ b/storage/innobase/include/srv0mon.ic
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/srv0mon.ic
+Server monitoring system
+
+Created 1/20/2010	Jimmy Yang
+************************************************************************/
+
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of monitor counter
+@return	max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	if (MONITOR_MAX_VALUE_START(monitor) == MAX_RESERVED) {
+
+		/* MONITOR_MAX_VALUE_START has not yet been
+		initialized, the max value since start is the
+		max count in MONITOR_MAX_VALUE */
+		MONITOR_MAX_VALUE_START(monitor) =
+				MONITOR_MAX_VALUE(monitor);
+
+	} else if (MONITOR_MAX_VALUE(monitor) != MAX_RESERVED
+		   && (MONITOR_MAX_VALUE(monitor)
+		       + MONITOR_VALUE_RESET(monitor)
+		      > MONITOR_MAX_VALUE_START(monitor))) {
+
+		/* If the max value since reset (as specified
+		in MONITOR_MAX_VALUE) plus the reset value is
+		larger than MONITOR_MAX_VALUE_START, reset
+		MONITOR_MAX_VALUE_START to this new max value */
+		MONITOR_MAX_VALUE_START(monitor) =
+				MONITOR_MAX_VALUE(monitor)
+				+ MONITOR_VALUE_RESET(monitor);
+	}
+
+	return(MONITOR_MAX_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of monitor counter
+@return	min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	if (MONITOR_MIN_VALUE_START(monitor) == MIN_RESERVED) {
+
+		/* MONITOR_MIN_VALUE_START has not yet been
+		initialized, the min value since start is the
+		min count in MONITOR_MIN_VALUE */
+		MONITOR_MIN_VALUE_START(monitor) =
+				MONITOR_MIN_VALUE(monitor);
+
+	} else if (MONITOR_MIN_VALUE(monitor) != MIN_RESERVED
+		   && (MONITOR_MIN_VALUE(monitor)
+		       + MONITOR_VALUE_RESET(monitor)
+		       < MONITOR_MIN_VALUE_START(monitor))) {
+
+		/* If the min value since reset (as specified
+		in MONITOR_MIN_VALUE) plus the reset value is
+		less than MONITOR_MIN_VALUE_START, reset
+		MONITOR_MIN_VALUE_START to this new min value */
+		MONITOR_MIN_VALUE_START(monitor) =
+			MONITOR_MIN_VALUE(monitor)
+                        + MONITOR_VALUE_RESET(monitor);
+        }
+
+	return(MONITOR_MIN_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	/* Do not reset all counter values if monitor is still on. */
+	if (MONITOR_IS_ON(monitor)) {
+		fprintf(stderr, "InnoDB: Cannot reset all values for "
+			"monitor counter %s while it is on. Please "
+			"turn it off and retry. \n",
+			srv_mon_get_name(monitor));
+	} else {
+		MONITOR_RESET_ALL(monitor);
+	}
+}
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index ed2f4672a99..99cff251e3c 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2011, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All rights reserved.
 Copyright (c) 2008, 2009, Google Inc.
 Copyright (c) 2009, Percona Inc.
 
@@ -26,8 +26,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -43,15 +43,18 @@ Created 10/10/1995 Heikki Tuuri
 
 #include "univ.i"
 #ifndef UNIV_HOTBACKUP
+#include "log0log.h"
 #include "sync0sync.h"
 #include "os0sync.h"
 #include "que0types.h"
 #include "trx0types.h"
+#include "srv0conc.h"
+#include "buf0checksum.h"
 
 extern const char*	srv_main_thread_op_info;
 
 /** Prefix used by MySQL to indicate pre-5.1 table name encoding */
-extern const char	srv_mysql50_table_name_prefix[9];
+extern const char	srv_mysql50_table_name_prefix[10];
 
 /* When this event is set the lock timeout and InnoDB monitor
 thread starts running */
@@ -66,6 +69,21 @@ extern os_event_t	srv_timeout_event;
 /* The error monitor thread waits on this event. */
 extern os_event_t	srv_error_event;
 
+/** The buffer pool dump/load thread waits on this event. */
+extern os_event_t	srv_buf_dump_event;
+
+/** The buffer pool dump/load file name */
+#define SRV_BUF_DUMP_FILENAME_DEFAULT	"ib_buffer_pool"
+extern char*		srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+extern char		srv_buffer_pool_dump_at_shutdown;
+extern char		srv_buffer_pool_load_at_startup;
+
+/* Whether to disable file system cache if it is defined */
+extern char		srv_disable_sort_file_cache;
+
 /* If the last data file is auto-extended, we add this many pages to it
 at a time */
 #define SRV_AUTO_EXTEND_INCREMENT	\
@@ -91,27 +109,33 @@ extern FILE*	srv_misc_tmpfile;
 /* Server parameters which are read from the initfile */
 
 extern char*	srv_data_home;
+
 #ifdef UNIV_LOG_ARCHIVE
 extern char*	srv_arch_dir;
 #endif /* UNIV_LOG_ARCHIVE */
 
 /** store to its own file each table created by an user; data
 dictionary tables are in the system tablespace 0 */
-#ifndef UNIV_HOTBACKUP
 extern my_bool	srv_file_per_table;
-#else
-extern ibool	srv_file_per_table;
-#endif /* UNIV_HOTBACKUP */
+/** Sleep delay for threads waiting to enter InnoDB. In micro-seconds. */
+extern	ulong	srv_thread_sleep_delay;
+#if defined(HAVE_ATOMIC_BUILTINS)
+/** Maximum sleep delay (in micro-seconds), value of 0 disables it.*/
+extern	ulong	srv_adaptive_max_sleep_delay;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
 /** The file format to use on new *.ibd files. */
 extern ulint	srv_file_format;
 /** Whether to check file format during startup.  A value of
-DICT_TF_FORMAT_MAX + 1 means no checking ie. FALSE.  The default is to
+UNIV_FORMAT_MAX + 1 means no checking ie. FALSE.  The default is to
 set it to the highest format we support. */
 extern ulint	srv_max_file_format_at_startup;
 /** Place locks to records only i.e. do not use next-key locking except
 on duplicate key checking and foreign key checking */
 extern ibool	srv_locks_unsafe_for_binlog;
-#endif /* !UNIV_HOTBACKUP */
+
+/* Variable specifying the FTS parallel sort buffer size */
+extern ulong	srv_sort_buf_size;
 
 /* If this flag is TRUE, then we will use the native aio of the
 OS (provided we compiled Innobase with it in), otherwise we will
@@ -120,7 +144,18 @@ Currently we support native aio on windows and linux */
 extern my_bool	srv_use_native_aio;
 #ifdef __WIN__
 extern ibool	srv_use_native_conditions;
-#endif
+#endif /* __WIN__ */
+#endif /* !UNIV_HOTBACKUP */
+
+/** Server undo tablespaces directory, can be absolute path. */
+extern char*	srv_undo_dir;
+
+/** Number of undo tablespaces to use. */
+extern ulong	srv_undo_tablespaces;
+
+/* The number of undo segments to use */
+extern ulong	srv_undo_logs;
+
 extern ulint	srv_n_data_files;
 extern char**	srv_data_file_names;
 extern ulint*	srv_data_file_sizes;
@@ -136,7 +171,7 @@ extern ibool	srv_created_new_raw;
 
 extern ulint	srv_n_log_groups;
 extern ulint	srv_n_log_files;
-extern ulint	srv_log_file_size;
+extern ib_uint64_t	srv_log_file_size;
 extern ulint	srv_log_buffer_size;
 extern ulong	srv_flush_log_at_trx_commit;
 extern char	srv_adaptive_flushing;
@@ -156,6 +191,12 @@ extern ibool	srv_use_sys_malloc;
 #endif /* UNIV_HOTBACKUP */
 extern ulint	srv_buf_pool_size;	/*!< requested size in bytes */
 extern ulint    srv_buf_pool_instances; /*!< requested number of buffer pool instances */
+extern ulong	srv_n_page_hash_locks;	/*!< number of locks to
+					protect buf_pool->page_hash */
+extern ulong	srv_LRU_scan_depth;	/*!< Scan depth for LRU
+					flush batch */
+extern my_bool	srv_flush_neighbors;	/*!< whether or not to flush
+					neighbors of a block */
 extern ulint	srv_buf_pool_old_size;	/*!< previously requested size */
 extern ulint	srv_buf_pool_curr_size;	/*!< current size in bytes */
 extern ulint	srv_mem_pool_size;
@@ -172,7 +213,7 @@ extern ulong    srv_io_capacity;
 /* Returns the number of IO operations that is X percent of the
 capacity. PCT_IO(5) -> returns the number of IO operations that
 is 5% of the max where max is srv_io_capacity.  */
-#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) p / 100.0)))
+#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) (p) / 100.0)))
 
 /* The "innodb_stats_method" setting, decides how InnoDB is going
 to treat NULL value when collecting statistics. It is not defined
@@ -194,28 +235,26 @@ extern ulint	srv_max_n_open_files;
 extern ulint	srv_max_dirty_pages_pct;
 
 extern ulint	srv_force_recovery;
-extern ulong	srv_thread_concurrency;
-
-extern ulint	srv_max_n_threads;
 
-extern lint	srv_conc_n_threads;
-
-extern ulint	srv_fast_shutdown;	 /* If this is 1, do not do a
-					 purge and index buffer merge.
-					 If this 2, do not even flush the
-					 buffer pool to data files at the
-					 shutdown: we effectively 'crash'
-					 InnoDB (but lose no committed
-					 transactions). */
+extern ulint	srv_fast_shutdown;	/*!< If this is 1, do not do a
+					purge and index buffer merge.
+					If this 2, do not even flush the
+					buffer pool to data files at the
+					shutdown: we effectively 'crash'
+					InnoDB (but lose no committed
+					transactions). */
 extern ibool	srv_innodb_status;
 
-extern unsigned long long	srv_stats_sample_pages;
+extern unsigned long long	srv_stats_transient_sample_pages;
+extern unsigned long long	srv_stats_persistent_sample_pages;
 
 extern ibool	srv_use_doublewrite_buf;
-extern ibool	srv_use_checksums;
+extern ulong	srv_doublewrite_batch_size;
+extern ulong	srv_checksum_algorithm;
 
 extern ulong	srv_max_buf_pool_modified_pct;
 extern ulong	srv_max_purge_lag;
+extern ulong	srv_max_purge_lag_delay;
 
 extern ulong	srv_replication_delay;
 /*-------------------------------------------*/
@@ -229,19 +268,29 @@ extern ibool	srv_print_innodb_monitor;
 extern ibool	srv_print_innodb_lock_monitor;
 extern ibool	srv_print_innodb_tablespace_monitor;
 extern ibool	srv_print_verbose_log;
+#define DEPRECATED_MSG_INNODB_TABLE_MONITOR \
+	"Using innodb_table_monitor is deprecated and it may be removed " \
+	"in future releases. Please use the InnoDB INFORMATION_SCHEMA " \
+	"tables instead, see " REFMAN "innodb-i_s-tables.html"
 extern ibool	srv_print_innodb_table_monitor;
 
 extern ibool	srv_lock_timeout_active;
 extern ibool	srv_monitor_active;
 extern ibool	srv_error_monitor_active;
 
+/* TRUE during the lifetime of the buffer pool dump/load thread */
+extern ibool	srv_buf_dump_thread_active;
+
 extern ulong	srv_n_spin_wait_rounds;
 extern ulong	srv_n_free_tickets_to_enter;
 extern ulong	srv_thread_sleep_delay;
 extern ulong	srv_spin_wait_delay;
 extern ibool	srv_priority_boost;
 
+extern ulint	srv_n_lock_wait_count;
+
 extern ulint	srv_truncated_status_writes;
+extern ulint	srv_available_undo_logs;
 
 extern	ulint	srv_mem_pool_size;
 extern	ulint	srv_lock_table_size;
@@ -260,16 +309,13 @@ extern	ibool	srv_print_latch_waits;
 # define srv_print_latch_waits		FALSE
 #endif /* UNIV_DEBUG */
 
-extern ulint	srv_activity_count;
 extern ulint	srv_fatal_semaphore_wait_threshold;
-#define SRV_SEMAPHORE_WAIT_EXTENSION	7200
 extern ulint	srv_dml_needed_delay;
 
-extern mutex_t*	kernel_mutex_temp;/* mutex protecting the server, trx structs,
-				query threads, and lock table: we allocate
-				it from dynamic memory to get it to the
-				same DRAM page as other hotspot semaphores */
-#define kernel_mutex (*kernel_mutex_temp)
+#ifndef HAVE_ATOMIC_BUILTINS
+/** Mutex protecting some server global variables. */
+extern mutex_t	server_mutex;
+#endif /* !HAVE_ATOMIC_BUILTINS */
 
 #define SRV_MAX_N_IO_THREADS	130
 
@@ -285,7 +331,7 @@ extern ulint srv_log_write_requests;
 extern ulint srv_log_writes;
 
 /* amount of data written to the log files in bytes */
-extern ulint srv_os_log_written;
+extern lsn_t srv_os_log_written;
 
 /* amount of writes being done to the log files */
 extern ulint srv_os_log_pending_writes;
@@ -300,8 +346,8 @@ extern ulong srv_n_purge_threads;
 /* the number of pages to purge in one batch */
 extern ulong srv_purge_batch_size;
 
-/* the number of rollback segments to use */
-extern ulong srv_rollback_segments;
+/* the number of sync wait arrays */
+extern ulong srv_sync_array_size;
 
 /* variable that counts amount of data read in total (in bytes) */
 extern ulint srv_data_read;
@@ -333,20 +379,24 @@ extern ulint srv_buf_pool_flushed;
 reading of a disk page */
 extern ulint srv_buf_pool_reads;
 
+/* print all user-level transactions deadlocks to mysqld stderr */
+extern my_bool srv_print_all_deadlocks;
+
 /** Status variables to be passed to MySQL */
 typedef struct export_var_struct export_struc;
 
-/** Status variables to be passed to MySQL */
-extern export_struc export_vars;
+/** Thread slot in the thread table */
+typedef struct srv_slot_struct	srv_slot_t;
 
-/** The server system */
-typedef struct srv_sys_struct	srv_sys_t;
+/** Thread table is an array of slots */
+typedef srv_slot_t	srv_table_t;
 
-/** The server system */
-extern srv_sys_t*	srv_sys;
+/** Status variables to be passed to MySQL */
+extern export_struc export_vars;
 
 # ifdef UNIV_PFS_THREAD
 /* Keys to register InnoDB threads with performance schema */
+extern mysql_pfs_key_t	buf_page_cleaner_thread_key;
 extern mysql_pfs_key_t	trx_rollback_clean_thread_key;
 extern mysql_pfs_key_t	io_handler_thread_key;
 extern mysql_pfs_key_t	srv_lock_timeout_thread_key;
@@ -359,20 +409,14 @@ extern mysql_pfs_key_t	srv_purge_thread_key;
 schema */
 #  define pfs_register_thread(key)			\
 do {								\
-	if (PSI_server) {					\
-		struct PSI_thread* psi = PSI_server->new_thread(key, NULL, 0);\
-		if (psi) {					\
-			PSI_server->set_thread(psi);		\
-		}						\
-	}							\
+	struct PSI_thread* psi = PSI_CALL(new_thread)(key, NULL, 0);\
+	PSI_CALL(set_thread)(psi);				\
 } while (0)
 
 /* This macro delist the current thread from performance schema */
 #  define pfs_delete_thread()				\
 do {								\
-	if (PSI_server) {					\
-		PSI_server->delete_current_thread();		\
-	}							\
+	PSI_CALL(delete_current_thread)();			\
 } while (0)
 # endif /* UNIV_PFS_THREAD */
 
@@ -445,10 +489,13 @@ typedef enum srv_stats_method_name_enum		srv_stats_method_name_t;
 #ifndef UNIV_HOTBACKUP
 /** Types of threads existing in the system. */
 enum srv_thread_type {
-	SRV_WORKER = 0,	/**< threads serving parallelized queries and
-			queries released from lock wait */
-	SRV_MASTER	/**< the master thread, (whose type number must
-			be biggest) */
+	SRV_NONE,			/*!< None */
+	SRV_WORKER,			/*!< threads serving parallelized
+					queries and queries released from
+					lock wait */
+	SRV_PURGE,			/*!< Purge coordinator thread */
+	SRV_MASTER			/*!< the master thread, (whose type
+					number must be biggest) */
 };
 
 /*********************************************************************//**
@@ -478,21 +525,6 @@ void
 srv_general_init(void);
 /*==================*/
 /*********************************************************************//**
-Gets the number of threads in the system.
-@return	sum of srv_n_threads[] */
-UNIV_INTERN
-ulint
-srv_get_n_threads(void);
-/*===================*/
-/*********************************************************************//**
-Check whether thread type has reserved a slot.
-@return	slot number or UNDEFINED if not found*/
-UNIV_INTERN
-ulint
-srv_thread_has_reserved_slot(
-/*=========================*/
-	enum srv_thread_type	type);	/*!< in: thread type to check */
-/*********************************************************************//**
 Sets the info describing an i/o thread current state. */
 UNIV_INTERN
 void
@@ -501,32 +533,16 @@ srv_set_io_thread_op_info(
 	ulint		i,	/*!< in: the 'segment' of the i/o thread */
 	const char*	str);	/*!< in: constant char string describing the
 				state */
-/*********************************************************************//**
-Releases threads of the type given from suspension in the thread table.
-NOTE! The server mutex has to be reserved by the caller!
-@return number of threads released: this may be less than n if not
-enough threads were suspended at the moment */
-UNIV_INTERN
-ulint
-srv_release_threads(
-/*================*/
-	enum srv_thread_type	type,	/*!< in: thread type */
-	ulint			n);	/*!< in: number of threads to release */
-/*********************************************************************//**
-The master thread controlling the server.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_master_thread(
-/*==============*/
-	void*	arg);	/*!< in: a dummy parameter required by
-			os_thread_create */
 /*******************************************************************//**
-Wakes up the purge thread if it's not already awake. */
+Tells the purge thread that there has been activity in the database
+and wakes up the purge thread if it is suspended (not sleeping).  Note
+that there is a small chance that the purge thread stays suspended
+(we do not protect our operation with the srv_sys_t:mutex, for
+performance reasons). */
 UNIV_INTERN
 void
-srv_wake_purge_thread(void);
-/*=======================*/
+srv_wake_purge_thread_if_not_active(void);
+/*=====================================*/
 /*******************************************************************//**
 Tells the Innobase server that there has been activity in the database
 and wakes up the master thread if it is suspended (not sleeping). Used
@@ -543,150 +559,159 @@ UNIV_INTERN
 void
 srv_wake_master_thread(void);
 /*========================*/
-/*******************************************************************//**
-Tells the purge thread that there has been activity in the database
-and wakes up the purge thread if it is suspended (not sleeping).  Note
-that there is a small chance that the purge thread stays suspended
-(we do not protect our operation with the kernel mutex, for
-performace reasons). */
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
 UNIV_INTERN
-void
-srv_wake_purge_thread_if_not_active(void);
-/*=====================================*/
-/*********************************************************************//**
-Puts an OS thread to wait if there are too many concurrent threads
-(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+	FILE*	file,		/*!< in: output stream */
+	ibool	nowait,		/*!< in: whether to wait for the
+				lock_sys_t::mutex */
+	ulint*	trx_start,	/*!< out: file position of the start of
+				the list of active transactions */
+	ulint*	trx_end);	/*!< out: file position of the end of
+				the list of active transactions */
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
 UNIV_INTERN
 void
-srv_conc_enter_innodb(
-/*==================*/
-	trx_t*	trx);	/*!< in: transaction object associated with the
-			thread */
-/*********************************************************************//**
-This lets a thread enter InnoDB regardless of the number of threads inside
-InnoDB. This must be called when a thread ends a lock wait. */
+srv_export_innodb_status(void);
+/*==========================*/
+/*******************************************************************//**
+Get current server activity count. We don't hold srv_sys::mutex while
+reading this value as it is only used in heuristics.
+@return activity count. */
 UNIV_INTERN
-void
-srv_conc_force_enter_innodb(
+ulint
+srv_get_activity_count(void);
 /*========================*/
-	trx_t*	trx);	/*!< in: transaction object associated with the
-			thread */
-/*********************************************************************//**
-This must be called when a thread exits InnoDB in a lock wait or at the
-end of an SQL statement. */
+/*******************************************************************//**
+Check if there has been any activity.
+@return FALSE if no change in activity counter. */
 UNIV_INTERN
-void
-srv_conc_force_exit_innodb(
-/*=======================*/
-	trx_t*	trx);	/*!< in: transaction object associated with the
-			thread */
-/*********************************************************************//**
-This must be called when a thread exits InnoDB. */
+ibool
+srv_check_activity(
+/*===============*/
+	ulint		old_activity_count);	/*!< old activity count */
+/******************************************************************//**
+Increment the server activity counter. */
 UNIV_INTERN
 void
-srv_conc_exit_innodb(
-/*=================*/
-	trx_t*	trx);	/*!< in: transaction object associated with the
-			thread */
-/***************************************************************//**
-Puts a MySQL OS thread to wait for a lock to be released. If an error
-occurs during the wait trx->error_state associated with thr is
-!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
-are possible errors. DB_DEADLOCK is returned if selective deadlock
-resolution chose this transaction as a victim. */
+srv_inc_activity_count(void);
+/*=========================*/
+
+/**********************************************************************//**
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
 UNIV_INTERN
 void
-srv_suspend_mysql_thread(
+srv_que_task_enqueue_low(
 /*=====================*/
-	que_thr_t*	thr);	/*!< in: query thread associated with the MySQL
-				OS thread */
-/********************************************************************//**
-Releases a MySQL OS thread waiting for a lock to be released, if the
-thread is already suspended. */
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/**********************************************************************//**
+Check whether any background thread is active. If so, return the thread
+type.
+@return SRV_NONE if all are are suspended or have exited, thread
+type if any are still active. */
 UNIV_INTERN
-void
-srv_release_mysql_thread_if_suspended(
-/*==================================*/
-	que_thr_t*	thr);	/*!< in: query thread associated with the
-				MySQL OS thread	 */
+enum srv_thread_type
+srv_get_active_thread_type(void);
+/*============================*/
+
+extern "C" {
+
 /*********************************************************************//**
-A thread which wakes up threads whose lock wait may have lasted too long.
+A thread which prints the info output by various InnoDB monitors.
 @return	a dummy parameter */
 UNIV_INTERN
 os_thread_ret_t
-srv_lock_timeout_thread(
-/*====================*/
+DECLARE_THREAD(srv_monitor_thread)(
+/*===============================*/
 	void*	arg);	/*!< in: a dummy parameter required by
 			os_thread_create */
+
 /*********************************************************************//**
-A thread which prints the info output by various InnoDB monitors.
+The master thread controlling the server.
 @return	a dummy parameter */
 UNIV_INTERN
 os_thread_ret_t
-srv_monitor_thread(
-/*===============*/
+DECLARE_THREAD(srv_master_thread)(
+/*==============================*/
 	void*	arg);	/*!< in: a dummy parameter required by
 			os_thread_create */
+
 /*************************************************************************
 A thread which prints warnings about semaphore waits which have lasted
 too long. These can be used to track bugs which cause hangs.
 @return	a dummy parameter */
 UNIV_INTERN
 os_thread_ret_t
-srv_error_monitor_thread(
-/*=====================*/
+DECLARE_THREAD(srv_error_monitor_thread)(
+/*=====================================*/
 	void*	arg);	/*!< in: a dummy parameter required by
 			os_thread_create */
-/******************************************************************//**
-Outputs to a file the output of the InnoDB Monitor.
-@return FALSE if not all information printed
-due to failure to obtain necessary mutex */
-UNIV_INTERN
-ibool
-srv_printf_innodb_monitor(
-/*======================*/
-	FILE*	file,		/*!< in: output stream */
-	ibool	nowait,		/*!< in: whether to wait for kernel mutex */
-	ulint*	trx_start,	/*!< out: file position of the start of
-				the list of active transactions */
-	ulint*	trx_end);	/*!< out: file position of the end of
-				the list of active transactions */
 
-/******************************************************************//**
-Function to pass InnoDB status variables to MySQL */
+/*********************************************************************//**
+Purge coordinator thread that schedules the purge tasks.
+@return	a dummy parameter */
 UNIV_INTERN
-void
-srv_export_innodb_status(void);
-/*==========================*/
+os_thread_ret_t
+DECLARE_THREAD(srv_purge_coordinator_thread)(
+/*=========================================*/
+	void*	arg __attribute__((unused)));	/*!< in: a dummy parameter
+						required by os_thread_create */
 
 /*********************************************************************//**
-Asynchronous purge thread.
+Worker thread that reads tasks from the work queue and executes them.
 @return	a dummy parameter */
 UNIV_INTERN
 os_thread_ret_t
-srv_purge_thread(
-/*=============*/
-	void*	arg __attribute__((unused))); /*!< in: a dummy parameter
-					      required by os_thread_create */
+DECLARE_THREAD(srv_worker_thread)(
+/*==============================*/
+	void*	arg __attribute__((unused)));	/*!< in: a dummy parameter
+						required by os_thread_create */
+} /* extern "C" */
 
 /**********************************************************************//**
-Enqueues a task to server task queue and releases a worker thread, if there
-is a suspended one. */
+Get count of tasks in the queue.
+@return number of tasks in queue  */
 UNIV_INTERN
-void
-srv_que_task_enqueue_low(
-/*=====================*/
-	que_thr_t*	thr);	/*!< in: query thread */
+ulint
+srv_get_task_queue_length(void);
+/*===========================*/
 
-/**********************************************************************//**
-Check whether any background thread is active. If so, return the thread
-type.
-@return ULINT_UNDEFINED if all are are suspended or have exited, thread
-type if any are still active. */
+/*********************************************************************//**
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller!
+@return number of threads released: this may be less than n if not
+enough threads were suspended at the moment */
 UNIV_INTERN
 ulint
-srv_get_active_thread_type(void);
-/*============================*/
+srv_release_threads(
+/*================*/
+	enum srv_thread_type	type,	/*!< in: thread type */
+	ulint			n);	/*!< in: number of threads to release */
+
+/**********************************************************************//**
+Check whether any background thread are active. If so print which thread
+is active. Send the threads wakeup signal.
+@return name of thread that is active or NULL */
+UNIV_INTERN
+const char*
+srv_any_background_threads_are_active(void);
+/*=======================================*/
+
+/**********************************************************************//**
+Wakeup the purge threads. */
+UNIV_INTERN
+void
+srv_purge_wakeup(void);
+/*==================*/
 
 /** Status variables to be passed to MySQL */
 struct export_var_struct{
@@ -698,6 +723,8 @@ struct export_var_struct{
 	ulint innodb_data_writes;		/*!< I/O write requests */
 	ulint innodb_data_written;		/*!< Data bytes written */
 	ulint innodb_data_reads;		/*!< I/O read requests */
+	char  innodb_buffer_pool_dump_status[512];/*!< Buf pool dump status */
+	char  innodb_buffer_pool_load_status[512];/*!< Buf pool load status */
 	ulint innodb_buffer_pool_pages_total;	/*!< Buffer pool size */
 	ulint innodb_buffer_pool_pages_data;	/*!< Data pages */
 	ulint innodb_buffer_pool_pages_dirty;	/*!< Dirty data pages */
@@ -720,7 +747,7 @@ struct export_var_struct{
 	ulint innodb_log_waits;			/*!< srv_log_waits */
 	ulint innodb_log_write_requests;	/*!< srv_log_write_requests */
 	ulint innodb_log_writes;		/*!< srv_log_writes */
-	ulint innodb_os_log_written;		/*!< srv_os_log_written */
+	lsn_t innodb_os_log_written;		/*!< srv_os_log_written */
 	ulint innodb_os_log_fsyncs;		/*!< fil_n_log_flushes */
 	ulint innodb_os_log_pending_writes;	/*!< srv_os_log_pending_writes */
 	ulint innodb_os_log_pending_fsyncs;	/*!< fil_n_pending_log_flushes */
@@ -741,26 +768,38 @@ struct export_var_struct{
 	ulint innodb_rows_inserted;		/*!< srv_n_rows_inserted */
 	ulint innodb_rows_updated;		/*!< srv_n_rows_updated */
 	ulint innodb_rows_deleted;		/*!< srv_n_rows_deleted */
+	ulint innodb_num_open_files;		/*!< fil_n_file_opened */
 	ulint innodb_truncated_status_writes;	/*!< srv_truncated_status_writes */
+	ulint innodb_available_undo_logs;       /*!< srv_available_undo_logs */
 };
 
-/** Thread slot in the thread table */
-typedef struct srv_slot_struct	srv_slot_t;
-
-/** Thread table is an array of slots */
-typedef srv_slot_t	srv_table_t;
-
-/** The server system struct */
-struct srv_sys_struct{
-	srv_table_t*	threads;	/*!< server thread table */
-	UT_LIST_BASE_NODE_T(que_thr_t)
-			tasks;		/*!< task queue */
+/** Thread slot in the thread table.  */
+struct srv_slot_struct{
+	srv_thread_type type;			/*!< thread type: user,
+						utility etc. */
+	ibool		in_use;			/*!< TRUE if this slot
+						is in use */
+	ibool		suspended;		/*!< TRUE if the thread is
+						waiting for the event of this
+						slot */
+	ib_time_t	suspend_time;		/*!< time when the thread was
+						suspended. Initialized by
+						lock_wait_table_reserve_slot()
+						for lock wait */
+	ulong		wait_timeout;		/*!< wait time that if exceeded
+						the thread will be timed out.
+						Initialized by
+						lock_wait_table_reserve_slot()
+						for lock wait */
+	os_event_t	event;			/*!< event used in suspending
+						the thread when it has nothing
+						to do */
+	que_thr_t*	thr;			/*!< suspended query thread
+						(only used for user threads) */
 };
 
-extern ulint	srv_n_threads_active[];
 #else /* !UNIV_HOTBACKUP */
 # define srv_use_adaptive_hash_indexes		FALSE
-# define srv_use_checksums			TRUE
 # define srv_use_native_aio			FALSE
 # define srv_force_recovery			0UL
 # define srv_set_io_thread_op_info(t,info)	((void) 0)
diff --git a/storage/innobase/include/srv0srv.ic b/storage/innobase/include/srv0srv.ic
index 8a1a678a016..53405c06f97 100644
--- a/storage/innobase/include/srv0srv.ic
+++ b/storage/innobase/include/srv0srv.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
index 796d2cade3b..9d948675011 100644
--- a/storage/innobase/include/srv0start.h
+++ b/storage/innobase/include/srv0start.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -27,8 +27,15 @@ Created 10/10/1995 Heikki Tuuri
 #define srv0start_h
 
 #include "univ.i"
+#include "log0log.h"
 #include "ut0byte.h"
 
+#ifdef __WIN__
+#define SRV_PATH_SEPARATOR	'\\'
+#else
+#define SRV_PATH_SEPARATOR	'/'
+#endif
+
 /*********************************************************************//**
 Normalizes a directory path for Windows: converts slashes to backslashes. */
 UNIV_INTERN
@@ -85,11 +92,19 @@ Shuts down the Innobase database.
 UNIV_INTERN
 int
 innobase_shutdown_for_mysql(void);
+
+/********************************************************************
+Signal all per-table background threads to shutdown, and wait for them to do
+so. */
+
+void
+srv_shutdown_table_bg_threads(void);
+
 /*=============================*/
 /** Log sequence number at shutdown */
-extern	ib_uint64_t	srv_shutdown_lsn;
+extern	lsn_t	srv_shutdown_lsn;
 /** Log sequence number immediately after startup */
-extern	ib_uint64_t	srv_start_lsn;
+extern	lsn_t	srv_start_lsn;
 
 #ifdef HAVE_DARWIN_THREADS
 /** TRUE if the F_FULLFSYNC option is available */
@@ -113,6 +128,11 @@ enum srv_shutdown_state {
 	SRV_SHUTDOWN_NONE = 0,	/*!< Database running normally */
 	SRV_SHUTDOWN_CLEANUP,	/*!< Cleaning up in
 				logs_empty_and_mark_files_at_shutdown() */
+	SRV_SHUTDOWN_FLUSH_PHASE,/*!< At this phase the master and the
+				purge threads must have completed their
+				work. Once we enter this phase the
+				page_cleaner can clean up the buffer
+				pool and exit */
 	SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that
 				the buffer pool can be freed: flush
 				all file spaces and close all files */
diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h
index 6e931346238..56f9ff78c49 100644
--- a/storage/innobase/include/sync0arr.h
+++ b/storage/innobase/include/sync0arr.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -36,32 +36,6 @@ typedef struct sync_cell_struct		sync_cell_t;
 /** Synchronization wait array */
 typedef struct sync_array_struct	sync_array_t;
 
-/** Parameters for sync_array_create() @{ */
-#define SYNC_ARRAY_OS_MUTEX	1	/*!< protected by os_mutex_t */
-#define SYNC_ARRAY_MUTEX	2	/*!< protected by mutex_t */
-/* @} */
-
-/*******************************************************************//**
-Creates a synchronization wait array. It is protected by a mutex
-which is automatically reserved when the functions operating on it
-are called.
-@return	own: created wait array */
-UNIV_INTERN
-sync_array_t*
-sync_array_create(
-/*==============*/
-	ulint	n_cells,	/*!< in: number of cells in the array
-				to create */
-	ulint	protection);	/*!< in: either SYNC_ARRAY_OS_MUTEX or
-				SYNC_ARRAY_MUTEX: determines the type
-				of mutex protecting the data structure */
-/******************************************************************//**
-Frees the resources in a wait array. */
-UNIV_INTERN
-void
-sync_array_free(
-/*============*/
-	sync_array_t*	arr);	/*!< in, own: sync wait array */
 /******************************************************************//**
 Reserves a wait array cell for waiting for an object.
 The event of the cell is reset to nonsignalled state. */
@@ -99,9 +73,9 @@ sync_array_free_cell(
 Note that one of the wait objects was signalled. */
 UNIV_INTERN
 void
-sync_array_object_signalled(
-/*========================*/
-	sync_array_t*	arr);	/*!< in: wait array */
+sync_array_object_signalled(void);
+/*=============================*/
+
 /**********************************************************************//**
 If the wakeup algorithm does not work perfectly at semaphore relases,
 this function will do the waking (see the comment in mutex_exit). This
@@ -132,11 +106,30 @@ sync_array_validate(
 Prints info of the wait array. */
 UNIV_INTERN
 void
-sync_array_print_info(
+sync_array_print(
+/*=============*/
+	FILE*		file);	/*!< in: file where to print */
+
+/**********************************************************************//**
+Create the primary system wait array(s), they are protected by an OS mutex */
+UNIV_INTERN
+void
+sync_array_init(
+/*============*/
+	ulint		n_threads);	/*!< in: Number of slots to create */
+/**********************************************************************//**
+Close sync array wait sub-system. */
+UNIV_INTERN
+void
+sync_array_close(void);
 /*==================*/
-	FILE*		file,	/*!< in: file where to print */
-	sync_array_t*	arr);	/*!< in: wait array */
 
+/**********************************************************************//**
+Get an instance of the sync wait array. */
+UNIV_INTERN
+sync_array_t*
+sync_array_get(void);
+/*================*/
 
 #ifndef UNIV_NONINL
 #include "sync0arr.ic"
diff --git a/storage/innobase/include/sync0arr.ic b/storage/innobase/include/sync0arr.ic
index bf57f5b2dc2..0114a1ff5a2 100644
--- a/storage/innobase/include/sync0arr.ic
+++ b/storage/innobase/include/sync0arr.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -24,4 +24,3 @@ Inline code
 
 Created 9/5/1995 Heikki Tuuri
 *******************************************************/
-
diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h
index 2cab266d86a..b0c21d0c76b 100644
--- a/storage/innobase/include/sync0rw.h
+++ b/storage/innobase/include/sync0rw.h
@@ -18,8 +18,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -117,12 +117,17 @@ extern	mysql_pfs_key_t	buf_block_lock_key;
 extern	mysql_pfs_key_t	buf_block_debug_latch_key;
 # endif /* UNIV_SYNC_DEBUG */
 extern	mysql_pfs_key_t	dict_operation_lock_key;
-extern	mysql_pfs_key_t	fil_space_latch_key;
 extern	mysql_pfs_key_t	checkpoint_lock_key;
+extern	mysql_pfs_key_t	fil_space_latch_key;
+extern	mysql_pfs_key_t	fts_cache_rw_lock_key;
+extern	mysql_pfs_key_t	fts_cache_init_rw_lock_key;
+extern	mysql_pfs_key_t	index_tree_rw_lock_key;
 extern	mysql_pfs_key_t	trx_i_s_cache_lock_key;
 extern	mysql_pfs_key_t	trx_purge_latch_key;
 extern	mysql_pfs_key_t	index_tree_rw_lock_key;
 extern	mysql_pfs_key_t	dict_table_stats_latch_key;
+extern  mysql_pfs_key_t trx_sys_rw_lock_key;
+extern  mysql_pfs_key_t hash_table_rw_lock_key;
 #endif /* UNIV_PFS_RWLOCK */
 
 
@@ -154,9 +159,6 @@ unlocking, not the corresponding function. */
 # define rw_lock_s_lock(M)					\
 	rw_lock_s_lock_func((M), 0, __FILE__, __LINE__)
 
-# define rw_lock_s_lock_inline(M, P, F, L)			\
-	rw_lock_s_lock_func((M), (P), (F), (L))
-
 # define rw_lock_s_lock_gen(M, P)				\
 	rw_lock_s_lock_func((M), (P), __FILE__, __LINE__)
 
@@ -173,18 +175,12 @@ unlocking, not the corresponding function. */
 # define rw_lock_x_lock(M)					\
 	rw_lock_x_lock_func((M), 0, __FILE__, __LINE__)
 
-# define rw_lock_x_lock_inline(M, P, F, L)			\
-	rw_lock_x_lock_func((M), (P), (F), (L))
-
 # define rw_lock_x_lock_gen(M, P)				\
 	rw_lock_x_lock_func((M), (P), __FILE__, __LINE__)
 
 # define rw_lock_x_lock_nowait(M)				\
 	rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__)
 
-# define rw_lock_x_lock_func_nowait_inline(M, F, L)		\
-	rw_lock_x_lock_func_nowait((M), (F), (L))
-
 # ifdef UNIV_SYNC_DEBUG
 #  define rw_lock_x_unlock_gen(L, P)	rw_lock_x_unlock_func(P, L)
 # else
@@ -216,9 +212,6 @@ unlocking, not the corresponding function. */
 # define rw_lock_s_lock(M)					\
 	pfs_rw_lock_s_lock_func((M), 0, __FILE__, __LINE__)
 
-# define rw_lock_s_lock_inline(M, P, F, L)			\
-	pfs_rw_lock_s_lock_func((M), (P), (F), (L))
-
 # define rw_lock_s_lock_gen(M, P)				\
 	pfs_rw_lock_s_lock_func((M), (P), __FILE__, __LINE__)
 
@@ -234,18 +227,12 @@ unlocking, not the corresponding function. */
 # define rw_lock_x_lock(M)					\
 	pfs_rw_lock_x_lock_func((M), 0, __FILE__, __LINE__)
 
-# define rw_lock_x_lock_inline(M, P, F, L)			\
-	pfs_rw_lock_x_lock_func((M), (P), (F), (L))
-
 # define rw_lock_x_lock_gen(M, P)				\
 	pfs_rw_lock_x_lock_func((M), (P), __FILE__, __LINE__)
 
 # define rw_lock_x_lock_nowait(M)				\
 	pfs_rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__)
 
-# define rw_lock_x_lock_func_nowait_inline(M, F, L)		\
-	pfs_rw_lock_x_lock_func_nowait((M), (F), (L))
-
 # ifdef UNIV_SYNC_DEBUG
 #  define rw_lock_x_unlock_gen(L, P)	pfs_rw_lock_x_unlock_func(P, L)
 # else
@@ -419,22 +406,6 @@ rw_lock_x_lock_move_ownership(
 	rw_lock_t*	lock);	/*!< in: lock which was x-locked in the
 				buffer read */
 /******************************************************************//**
-Releases a shared mode lock when we know there are no waiters and none
-else will access the lock during the time this function is executed. */
-UNIV_INLINE
-void
-rw_lock_s_unlock_direct(
-/*====================*/
-	rw_lock_t*	lock);	/*!< in/out: rw-lock */
-/******************************************************************//**
-Releases an exclusive mode lock when we know there are no waiters, and
-none else will access the lock durint the time this function is executed. */
-UNIV_INLINE
-void
-rw_lock_x_unlock_direct(
-/*====================*/
-	rw_lock_t*	lock);	/*!< in/out: rw-lock */
-/******************************************************************//**
 Returns the value of writer_count for the lock. Does not reserve the lock
 mutex, so the caller must be sure it is not changed during the call.
 @return	value of writer_count */
@@ -607,7 +578,7 @@ struct rw_lock_struct {
 				/*!< Thread id of writer thread. Is only
 				guaranteed to have sane and non-stale
 				value iff recursive flag is set. */
-	os_event_t	event;	/*!< Used by sync0arr.c for thread queueing */
+	os_event_t	event;	/*!< Used by sync0arr.cc for thread queueing */
 	os_event_t	wait_ex_event;
 				/*!< Event for next-writer to wait on. A thread
 				must decrement lock_word before waiting. */
@@ -689,9 +660,6 @@ rw_lock_s_lock_gen()
 rw_lock_s_lock_nowait()
 rw_lock_s_unlock_gen()
 rw_lock_free()
-
-Two function APIs rw_lock_x_unlock_direct() and rw_lock_s_unlock_direct()
-do not have any caller/user, they are not instrumented.
 */
 
 #ifdef UNIV_PFS_RWLOCK
diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic
index a5a7cda14f9..eab89e2619e 100644
--- a/storage/innobase/include/sync0rw.ic
+++ b/storage/innobase/include/sync0rw.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -18,8 +18,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -90,7 +90,7 @@ rw_lock_set_waiter_flag(
 	rw_lock_t*	lock)	/*!< in/out: rw-lock */
 {
 #ifdef INNODB_RW_LOCKS_USE_ATOMICS
-	(void) os_compare_and_swap_ulint(&lock->waiters, 0, 1);
+	os_compare_and_swap_ulint(&lock->waiters, 0, 1);
 #else /* INNODB_RW_LOCKS_USE_ATOMICS */
 	lock->waiters = 1;
 #endif /* INNODB_RW_LOCKS_USE_ATOMICS */
@@ -107,7 +107,7 @@ rw_lock_reset_waiter_flag(
 	rw_lock_t*	lock)	/*!< in/out: rw-lock */
 {
 #ifdef INNODB_RW_LOCKS_USE_ATOMICS
-	(void) os_compare_and_swap_ulint(&lock->waiters, 1, 0);
+	os_compare_and_swap_ulint(&lock->waiters, 1, 0);
 #else /* INNODB_RW_LOCKS_USE_ATOMICS */
 	lock->waiters = 0;
 #endif /* INNODB_RW_LOCKS_USE_ATOMICS */
@@ -131,7 +131,7 @@ rw_lock_get_writer(
 	} else if (((-lock_word) % X_LOCK_DECR) == 0) {
 		return(RW_LOCK_EX);
 	} else {
-                ut_ad(lock_word > -X_LOCK_DECR);
+		ut_ad(lock_word > -X_LOCK_DECR);
 		return(RW_LOCK_WAIT_EX);
 	}
 }
@@ -200,7 +200,7 @@ rw_lock_lock_word_decr(
 	ulint		amount)		/*!< in: amount to decrement */
 {
 #ifdef INNODB_RW_LOCKS_USE_ATOMICS
-        lint local_lock_word = lock->lock_word;
+	lint local_lock_word = lock->lock_word;
 	while (local_lock_word > 0) {
 		if (os_compare_and_swap_lint(&lock->lock_word,
 					     local_lock_word,
@@ -244,7 +244,7 @@ rw_lock_lock_word_incr(
 
 	mutex_exit(&(lock->mutex));
 
-        return(local_lock_word);
+	return(local_lock_word);
 #endif /* INNODB_RW_LOCKS_USE_ATOMICS */
 }
 
@@ -308,7 +308,6 @@ rw_lock_s_lock_low(
 	const char*	file_name, /*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	/* TODO: study performance of UNIV_LIKELY branch prediction hints. */
 	if (!rw_lock_lock_word_decr(lock, 1)) {
 		/* Locking did not succeed */
 		return(FALSE);
@@ -318,7 +317,7 @@ rw_lock_s_lock_low(
 	rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line);
 #endif
 	/* These debugging values are not set safely: they may be incorrect
-        or even refer to a line that is invalid for the file name. */
+	or even refer to a line that is invalid for the file name. */
 	lock->last_s_file_name = file_name;
 	lock->last_s_line = line;
 
@@ -409,7 +408,6 @@ rw_lock_s_lock_func(
 	ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
-	/* TODO: study performance of UNIV_LIKELY branch prediction hints. */
 	if (rw_lock_s_lock_low(lock, pass, file_name, line)) {
 
 		return; /* Success */
@@ -462,8 +460,12 @@ rw_lock_x_lock_func_nowait(
 		there is an exclusive writer and this is the writer thread. */
 		lock->lock_word -= X_LOCK_DECR;
 
+		/* Recursive x-locks must be multiples of X_LOCK_DECR. */
 		ut_ad(((-lock->lock_word) % X_LOCK_DECR) == 0);
 
+		/* Watch for too many recursive locks */
+		ut_ad(lock->lock_word < 0);
+
 	} else {
 		/* Failure */
 		return(FALSE);
@@ -502,10 +504,10 @@ rw_lock_s_unlock_func(
 	if (rw_lock_lock_word_incr(lock, 1) == 0) {
 
 		/* wait_ex waiter exists. It may not be asleep, but we signal
-                anyway. We do not wake other waiters, because they can't
-                exist without wait_ex waiter and wait_ex waiter goes first.*/
+		anyway. We do not wake other waiters, because they can't
+		exist without wait_ex waiter and wait_ex waiter goes first.*/
 		os_event_set(lock->wait_ex_event);
-		sync_array_object_signalled(sync_primary_wait_array);
+		sync_array_object_signalled();
 
 	}
 
@@ -517,31 +519,6 @@ rw_lock_s_unlock_func(
 }
 
 /******************************************************************//**
-Releases a shared mode lock when we know there are no waiters and none
-else will access the lock during the time this function is executed. */
-UNIV_INLINE
-void
-rw_lock_s_unlock_direct(
-/*====================*/
-	rw_lock_t*	lock)	/*!< in/out: rw-lock */
-{
-	ut_ad(lock->lock_word < X_LOCK_DECR);
-
-#ifdef UNIV_SYNC_DEBUG
-	rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED);
-#endif
-
-	/* Decrease reader count by incrementing lock_word */
-	lock->lock_word++;
-
-	ut_ad(!lock->waiters);
-	ut_ad(rw_lock_validate(lock));
-#ifdef UNIV_SYNC_PERF_STAT
-	rw_s_exit_count++;
-#endif
-}
-
-/******************************************************************//**
 Releases an exclusive mode lock. */
 UNIV_INLINE
 void
@@ -564,6 +541,8 @@ rw_lock_x_unlock_func(
 	if (lock->lock_word == 0) {
 		/* Last caller in a possible recursive chain. */
 		lock->recursive = FALSE;
+		UNIV_MEM_INVALID(&lock->writer_thread,
+				 sizeof lock->writer_thread);
 	}
 
 #ifdef UNIV_SYNC_DEBUG
@@ -572,12 +551,12 @@ rw_lock_x_unlock_func(
 
 	if (rw_lock_lock_word_incr(lock, X_LOCK_DECR) == X_LOCK_DECR) {
 		/* Lock is now free. May have to signal read/write waiters.
-                We do not need to signal wait_ex waiters, since they cannot
-                exist when there is a writer. */
+		We do not need to signal wait_ex waiters, since they cannot
+		exist when there is a writer. */
 		if (lock->waiters) {
 			rw_lock_reset_waiter_flag(lock);
 			os_event_set(lock->event);
-			sync_array_object_signalled(sync_primary_wait_array);
+			sync_array_object_signalled();
 		}
 	}
 
@@ -588,38 +567,6 @@ rw_lock_x_unlock_func(
 #endif
 }
 
-/******************************************************************//**
-Releases an exclusive mode lock when we know there are no waiters, and
-none else will access the lock during the time this function is executed. */
-UNIV_INLINE
-void
-rw_lock_x_unlock_direct(
-/*====================*/
-	rw_lock_t*	lock)	/*!< in/out: rw-lock */
-{
-	/* Reset the exclusive lock if this thread no longer has an x-mode
-	lock */
-
-	ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
-
-#ifdef UNIV_SYNC_DEBUG
-	rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX);
-#endif
-
-	if (lock->lock_word == 0) {
-		lock->recursive = FALSE;
-	}
-
-	lock->lock_word += X_LOCK_DECR;
-
-	ut_ad(!lock->waiters);
-	ut_ad(rw_lock_validate(lock));
-
-#ifdef UNIV_SYNC_PERF_STAT
-	rw_x_exit_count++;
-#endif
-}
-
 #ifdef UNIV_PFS_RWLOCK
 
 /******************************************************************//**
@@ -643,9 +590,7 @@ pfs_rw_lock_create_func(
 	ulint		cline)		/*!< in: file line where created */
 {
 	/* Initialize the rwlock for performance schema */
-	lock->pfs_psi = (PSI_server && PFS_IS_INSTRUMENTED(key))
-				? PSI_server->init_rwlock(key, lock)
-				: NULL;
+	lock->pfs_psi = PSI_CALL(init_rwlock)(key, lock);
 
 	/* The actual function to initialize an rwlock */
 	rw_lock_create_func(lock,
@@ -656,7 +601,7 @@ pfs_rw_lock_create_func(
 			    cmutex_name,
 # endif /* UNIV_DEBUG */
 			    cfile_name,
-		            cline);
+			    cline);
 }
 /******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_x_lock_func()
@@ -672,24 +617,23 @@ pfs_rw_lock_x_lock_func(
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	struct PSI_rwlock_locker*	locker = NULL;
-	PSI_rwlock_locker_state		state;
+	if (lock->pfs_psi != NULL)
+	{
+		PSI_rwlock_locker*	locker;
+		PSI_rwlock_locker_state	state;
 
-	/* Record the entry of rw x lock request in performance schema */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		locker = PSI_server->get_thread_rwlock_locker(
-			&state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK);
-
-		if (locker) {
-			PSI_server->start_rwlock_wrwait(locker,
-							file_name, line);
-		}
-	}
+		/* Record the entry of rw x lock request in performance schema */
+		locker = PSI_CALL(start_rwlock_wrwait)(
+			&state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK, file_name, line);
 
-	rw_lock_x_lock_func(lock, pass, file_name, line);
+		rw_lock_x_lock_func(lock, pass, file_name, line);
 
-	if (locker) {
-		PSI_server->end_rwlock_wrwait(locker, 0);
+		if (locker != NULL)
+			PSI_CALL(end_rwlock_wrwait)(locker, 0);
+	}
+	else
+	{
+		rw_lock_x_lock_func(lock, pass, file_name, line);
 	}
 }
 /******************************************************************//**
@@ -707,25 +651,25 @@ pfs_rw_lock_x_lock_func_nowait(
 				requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	struct PSI_rwlock_locker*	locker = NULL;
-	PSI_rwlock_locker_state		state;
 	ibool	ret;
 
-	/* Record the entry of rw x lock request in performance schema */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		locker = PSI_server->get_thread_rwlock_locker(
-			&state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK);
+	if (lock->pfs_psi != NULL)
+	{
+		PSI_rwlock_locker*	locker;
+		PSI_rwlock_locker_state		state;
 
-		if (locker) {
-			PSI_server->start_rwlock_wrwait(locker,
-							file_name, line);
-		}
-	}
+		/* Record the entry of rw x lock request in performance schema */
+		locker = PSI_CALL(start_rwlock_wrwait)(
+			&state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK, file_name, line);
 
-	ret = rw_lock_x_lock_func_nowait(lock, file_name, line);
+		ret = rw_lock_x_lock_func_nowait(lock, file_name, line);
 
-	if (locker) {
-		PSI_server->end_rwlock_wrwait(locker, 0);
+		if (locker != NULL)
+			PSI_CALL(end_rwlock_wrwait)(locker, ret);
+	}
+	else
+	{
+		ret = rw_lock_x_lock_func_nowait(lock, file_name, line);
 	}
 
 	return(ret);
@@ -740,8 +684,9 @@ pfs_rw_lock_free_func(
 /*==================*/
 	rw_lock_t*	lock)	/*!< in: pointer to rw-lock */
 {
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		PSI_server->destroy_rwlock(lock->pfs_psi);
+	if (lock->pfs_psi != NULL)
+	{
+		PSI_CALL(destroy_rwlock)(lock->pfs_psi);
 		lock->pfs_psi = NULL;
 	}
 
@@ -763,24 +708,26 @@ pfs_rw_lock_s_lock_func(
 				requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	struct PSI_rwlock_locker*	locker = NULL;
-	PSI_rwlock_locker_state		state;
-
-	/* Instrumented to inform we are aquiring a shared rwlock */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		locker = PSI_server->get_thread_rwlock_locker(
-			&state, lock->pfs_psi, PSI_RWLOCK_READLOCK);
-		if (locker) {
-			PSI_server->start_rwlock_rdwait(locker,
-							file_name, line);
-		}
-	}
+	if (lock->pfs_psi != NULL)
+	{
+		PSI_rwlock_locker*	locker;
+		PSI_rwlock_locker_state	state;
 
-	rw_lock_s_lock_func(lock, pass, file_name, line);
+		/* Instrumented to inform we are aquiring a shared rwlock */
+		locker = PSI_CALL(start_rwlock_rdwait)(
+			&state, lock->pfs_psi, PSI_RWLOCK_READLOCK, file_name, line);
 
-	if (locker) {
-		PSI_server->end_rwlock_rdwait(locker, 0);
+		rw_lock_s_lock_func(lock, pass, file_name, line);
+
+		if (locker != NULL)
+			PSI_CALL(end_rwlock_rdwait)(locker, 0);
+	}
+	else
+	{
+		rw_lock_s_lock_func(lock, pass, file_name, line);
 	}
+
+	return;
 }
 /******************************************************************//**
 Performance schema instrumented wrap function for rw_lock_s_lock_func()
@@ -798,24 +745,25 @@ pfs_rw_lock_s_lock_low(
 	const char*	file_name, /*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	struct PSI_rwlock_locker*	locker = NULL;
-	PSI_rwlock_locker_state		state;
 	ibool	ret;
 
-	/* Instrumented to inform we are aquiring a shared rwlock */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		locker = PSI_server->get_thread_rwlock_locker(
-			&state, lock->pfs_psi, PSI_RWLOCK_READLOCK);
-		if (locker) {
-			PSI_server->start_rwlock_rdwait(locker,
-							file_name, line);
-		}
-	}
+	if (lock->pfs_psi != NULL)
+	{
+		PSI_rwlock_locker*	locker;
+		PSI_rwlock_locker_state	state;
+
+		/* Instrumented to inform we are aquiring a shared rwlock */
+		locker = PSI_CALL(start_rwlock_rdwait)(
+			&state, lock->pfs_psi, PSI_RWLOCK_READLOCK, file_name, line);
 
-	ret = rw_lock_s_lock_low(lock, pass, file_name, line);
+		ret = rw_lock_s_lock_low(lock, pass, file_name, line);
 
-	if (locker) {
-		PSI_server->end_rwlock_rdwait(locker, 0);
+		if (locker != NULL)
+			PSI_CALL(end_rwlock_rdwait)(locker, ret);
+	}
+	else
+	{
+		ret = rw_lock_s_lock_low(lock, pass, file_name, line);
 	}
 
 	return(ret);
@@ -837,9 +785,8 @@ pfs_rw_lock_x_unlock_func(
 	rw_lock_t*	lock)	/*!< in/out: rw-lock */
 {
 	/* Inform performance schema we are unlocking the lock */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		PSI_server->unlock_rwlock(lock->pfs_psi);
-	}
+	if (lock->pfs_psi != NULL)
+		PSI_CALL(unlock_rwlock)(lock->pfs_psi);
 
 	rw_lock_x_unlock_func(
 #ifdef UNIV_SYNC_DEBUG
@@ -864,9 +811,8 @@ pfs_rw_lock_s_unlock_func(
 	rw_lock_t*	lock)	/*!< in/out: rw-lock */
 {
 	/* Inform performance schema we are unlocking the lock */
-	if (UNIV_LIKELY(PSI_server && lock->pfs_psi)) {
-		PSI_server->unlock_rwlock(lock->pfs_psi);
-	}
+	if (lock->pfs_psi != NULL)
+		PSI_CALL(unlock_rwlock)(lock->pfs_psi);
 
 	rw_lock_s_unlock_func(
 #ifdef UNIV_SYNC_DEBUG
diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h
index 9b07c4758c9..1adcf938903 100644
--- a/storage/innobase/include/sync0sync.h
+++ b/storage/innobase/include/sync0sync.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -42,7 +42,7 @@ Created 9/5/1995 Heikki Tuuri
 #include "sync0arr.h"
 
 #if  defined(UNIV_DEBUG) && !defined(UNIV_HOTBACKUP)
-extern my_bool	timed_mutexes;
+extern "C" my_bool	timed_mutexes;
 #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
 
 #ifdef HAVE_WINDOWS_ATOMICS
@@ -53,25 +53,19 @@ typedef byte lock_word_t;
 #endif
 
 #if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
-/* There are mutexes/rwlocks that we want to exclude from
-instrumentation even if their corresponding performance schema
-define is set. And this PFS_NOT_INSTRUMENTED is used
-as the key value to dentify those objects that would
-be excluded from instrumentation. */
-# define PFS_NOT_INSTRUMENTED		ULINT32_UNDEFINED
-
-# define PFS_IS_INSTRUMENTED(key)	((key) != PFS_NOT_INSTRUMENTED)
 
 /* By default, buffer mutexes and rwlocks will be excluded from
 instrumentation due to their large number of instances. */
 # define PFS_SKIP_BUFFER_MUTEX_RWLOCK
 
+/* By default, event->mutex will also be excluded from instrumentation */
+# define PFS_SKIP_EVENT_MUTEX
+
 #endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
 
 #ifdef UNIV_PFS_MUTEX
 /* Key defines to register InnoDB mutexes with performance schema */
 extern mysql_pfs_key_t	autoinc_mutex_key;
-extern mysql_pfs_key_t	btr_search_enabled_mutex_key;
 extern mysql_pfs_key_t	buffer_block_mutex_key;
 extern mysql_pfs_key_t	buf_pool_mutex_key;
 extern mysql_pfs_key_t	buf_pool_zip_mutex_key;
@@ -81,13 +75,19 @@ extern mysql_pfs_key_t	dict_sys_mutex_key;
 extern mysql_pfs_key_t	file_format_max_mutex_key;
 extern mysql_pfs_key_t	fil_system_mutex_key;
 extern mysql_pfs_key_t	flush_list_mutex_key;
+extern mysql_pfs_key_t	fts_bg_threads_mutex_key;
+extern mysql_pfs_key_t	fts_delete_mutex_key;
+extern mysql_pfs_key_t	fts_optimize_mutex_key;
+extern mysql_pfs_key_t	fts_doc_id_mutex_key;
 extern mysql_pfs_key_t	hash_table_mutex_key;
 extern mysql_pfs_key_t	ibuf_bitmap_mutex_key;
 extern mysql_pfs_key_t	ibuf_mutex_key;
 extern mysql_pfs_key_t	ibuf_pessimistic_insert_mutex_key;
 extern mysql_pfs_key_t	log_sys_mutex_key;
 extern mysql_pfs_key_t	log_flush_order_mutex_key;
-extern mysql_pfs_key_t	kernel_mutex_key;
+# ifndef HAVE_ATOMIC_BUILTINS
+extern mysql_pfs_key_t	server_mutex_key;
+# endif /* !HAVE_ATOMIC_BUILTINS */
 # ifdef UNIV_MEM_DEBUG
 extern mysql_pfs_key_t	mem_hash_mutex_key;
 # endif /* UNIV_MEM_DEBUG */
@@ -104,13 +104,25 @@ extern mysql_pfs_key_t	rw_lock_mutex_key;
 extern mysql_pfs_key_t	srv_dict_tmpfile_mutex_key;
 extern mysql_pfs_key_t	srv_innodb_monitor_mutex_key;
 extern mysql_pfs_key_t	srv_misc_tmpfile_mutex_key;
+extern mysql_pfs_key_t	srv_threads_mutex_key;
 extern mysql_pfs_key_t	srv_monitor_file_mutex_key;
-extern mysql_pfs_key_t	syn_arr_mutex_key;
 # ifdef UNIV_SYNC_DEBUG
 extern mysql_pfs_key_t	sync_thread_mutex_key;
 # endif /* UNIV_SYNC_DEBUG */
-extern mysql_pfs_key_t	trx_doublewrite_mutex_key;
+extern mysql_pfs_key_t	buf_dblwr_mutex_key;
 extern mysql_pfs_key_t	trx_undo_mutex_key;
+extern mysql_pfs_key_t	trx_mutex_key;
+extern mysql_pfs_key_t	lock_sys_mutex_key;
+extern mysql_pfs_key_t	lock_sys_wait_mutex_key;
+extern mysql_pfs_key_t	trx_sys_mutex_key;
+extern mysql_pfs_key_t	srv_sys_mutex_key;
+extern mysql_pfs_key_t	srv_sys_tasks_mutex_key;
+#ifndef HAVE_ATOMIC_BUILTINS
+extern mysql_pfs_key_t	srv_conc_mutex_key;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+extern mysql_pfs_key_t	event_os_mutex_key;
+extern mysql_pfs_key_t	ut_list_mutex_key;
+extern mysql_pfs_key_t	os_mutex_key;
 #endif /* UNIV_PFS_MUTEX */
 
 /******************************************************************//**
@@ -591,10 +603,23 @@ V
 File system pages
 |
 V
-Kernel mutex				If a kernel operation needs a file
-|					page allocation, it must reserve the
-|					fsp x-latch before acquiring the kernel
-|					mutex.
+lock_sys_wait_mutex			Mutex protecting lock timeout data
+|
+V
+lock_sys_mutex				Mutex protecting lock_sys_t
+|
+V
+trx_sys->mutex				Mutex protecting trx_sys_t
+|
+V
+Threads mutex				Background thread scheduling mutex
+|
+V
+query_thr_mutex				Mutex protecting query threads
+|
+V
+trx_mutex				Mutex protecting trx_t fields
+|
 V
 Search system mutex
 |
@@ -609,7 +634,8 @@ Any other latch
 V
 Memory pool mutex */
 
-/* Latching order levels */
+/* Latching order levels. If you modify these, you have to also update
+sync_thread_add_level(). */
 
 /* User transaction locks are higher than any of the latch levels below:
 no latches are allowed when a thread goes to wait for a normal table
@@ -629,10 +655,11 @@ or row lock! */
 					trx_i_s_cache_t::last_read_mutex */
 #define SYNC_FILE_FORMAT_TAG	1200	/* Used to serialize access to the
 					file format tag */
-#define	SYNC_DICT_OPERATION	1001	/* table create, drop, etc. reserve
+#define	SYNC_DICT_OPERATION	1010	/* table create, drop, etc. reserve
 					this in X-mode; implicit or backround
 					operations purge, rollback, foreign
 					key checks reserve this in S-mode */
+#define SYNC_FTS_CACHE		1005	/* FTS cache rwlock */
 #define SYNC_DICT		1000
 #define SYNC_DICT_AUTOINC_MUTEX	999
 #define SYNC_DICT_HEADER	995
@@ -663,14 +690,20 @@ or row lock! */
 /*------------------------------------- MySQL query cache mutex */
 /*------------------------------------- MySQL binlog mutex */
 /*-------------------------------*/
-#define	SYNC_KERNEL		300
-#define SYNC_REC_LOCK		299
-#define	SYNC_TRX_LOCK_HEAP	298
+#define SYNC_LOCK_WAIT_SYS	300
+#define SYNC_LOCK_SYS		299
+#define SYNC_TRX_SYS		298
+#define SYNC_TRX		297
+#define SYNC_THREADS		295
+#define SYNC_REC_LOCK		294
 #define SYNC_TRX_SYS_HEADER	290
 #define	SYNC_PURGE_QUEUE	200
 #define SYNC_LOG		170
 #define SYNC_LOG_FLUSH_ORDER	147
 #define SYNC_RECV		168
+#define SYNC_FTS_CACHE_INIT	166	/* Used for FTS cache initialization */
+#define SYNC_FTS_BG_THREADS	165
+#define SYNC_FTS_OPTIMIZE       164     // FIXME: is this correct number, test
 #define	SYNC_WORK_QUEUE		162
 #define	SYNC_SEARCH_SYS		160	/* NOTE that if we have a memory
 					heap that can be extended to the
@@ -679,6 +712,7 @@ or row lock! */
 					can call routines there! Otherwise
 					the level is SYNC_MEM_HASH. */
 #define	SYNC_BUF_POOL		150	/* Buffer pool mutex */
+#define	SYNC_BUF_PAGE_HASH	149	/* buf_pool->page_hash rw_lock */
 #define	SYNC_BUF_BLOCK		146	/* Block mutex */
 #define	SYNC_BUF_FLUSH_LIST	145	/* Buffer flush list mutex */
 #define SYNC_DOUBLEWRITE	140
@@ -700,7 +734,7 @@ implementation of a mutual exclusion semaphore. */
 
 /** InnoDB mutex */
 struct mutex_struct {
-	os_event_t	event;	/*!< Used by sync0arr.c for the wait queue */
+	os_event_t	event;	/*!< Used by sync0arr.cc for the wait queue */
 	volatile lock_word_t	lock_word;	/*!< lock_word is the target
 				of the atomic test-and-set instruction when
 				atomic operations are enabled. */
@@ -747,11 +781,6 @@ struct mutex_struct {
 #endif
 };
 
-/** The global array of wait cells for implementation of the databases own
-mutexes and read-write locks. */
-extern sync_array_t*	sync_primary_wait_array;/* Appears here for
-						debugging purposes only! */
-
 /** Constant determining how long spin wait is continued before suspending
 the thread. A value 600 rounds on a 1995 100 MHz Pentium seems to correspond
 to 20 microseconds. */
@@ -777,6 +806,30 @@ extern ut_list_base_node_t  mutex_list;
 /** Mutex protecting the mutex_list variable */
 extern mutex_t mutex_list_mutex;
 
+#ifndef HAVE_ATOMIC_BUILTINS
+/**********************************************************//**
+Function that uses a mutex to decrement a variable atomically */
+UNIV_INLINE
+void
+os_atomic_dec_ulint_func(
+/*=====================*/
+	mutex_t*		mutex,		/*!< in: mutex guarding the
+						decrement */
+	volatile ulint*		var,		/*!< in/out: variable to
+						decrement */
+	ulint			delta);		/*!< in: delta to decrement */
+/**********************************************************//**
+Function that uses a mutex to increment a variable atomically */
+UNIV_INLINE
+void
+os_atomic_inc_ulint_func(
+/*=====================*/
+	mutex_t*		mutex,		/*!< in: mutex guarding the
+						increment */
+	volatile ulint*		var,		/*!< in/out: variable to
+						increment */
+	ulint			delta);		/*!< in: delta to increment */
+#endif /* !HAVE_ATOMIC_BUILTINS */
 
 #ifndef UNIV_NONINL
 #include "sync0sync.ic"
diff --git a/storage/innobase/include/sync0sync.ic b/storage/innobase/include/sync0sync.ic
index eb21f44c65e..746e73ebee7 100644
--- a/storage/innobase/include/sync0sync.ic
+++ b/storage/innobase/include/sync0sync.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -18,8 +18,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -94,7 +94,7 @@ mutex_test_and_set(
 		mutex->lock_word = 1;
 	}
 
-	return((byte)ret);
+	return((byte) ret);
 #endif
 }
 
@@ -236,22 +236,22 @@ pfs_mutex_enter_func(
 	const char*	file_name,	/*!< in: file name where locked */
 	ulint		line)		/*!< in: line where locked */
 {
-	struct PSI_mutex_locker*	locker = NULL;
-	PSI_mutex_locker_state		state;
-	int	result = 0;
-
-	if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
-		locker = PSI_server->get_thread_mutex_locker(
-				&state, mutex->pfs_psi, PSI_MUTEX_LOCK);
-		if (locker) {
-			PSI_server->start_mutex_wait(locker, file_name, line);
-		}
-	}
+	if (mutex->pfs_psi != NULL)
+	{
+		PSI_mutex_locker*	locker;
+		PSI_mutex_locker_state	state;
+
+		locker = PSI_CALL(start_mutex_wait)(&state, mutex->pfs_psi,
+			PSI_MUTEX_LOCK, file_name, line);
 
-	mutex_enter_func(mutex, file_name, line);
+		mutex_enter_func(mutex, file_name, line);
 
-	if (locker) {
-		PSI_server->end_mutex_wait(locker, result);
+		if (locker != NULL)
+			PSI_CALL(end_mutex_wait)(locker, 0);
+	}
+	else
+	{
+		mutex_enter_func(mutex, file_name, line);
 	}
 }
 /********************************************************************//**
@@ -270,21 +270,23 @@ pfs_mutex_enter_nowait_func(
 	ulint		line)		/*!< in: line where requested */
 {
 	ulint	ret;
-	struct PSI_mutex_locker*	locker = NULL;
-	PSI_mutex_locker_state		state;
-
-	if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
-		locker = PSI_server->get_thread_mutex_locker(
-				&state, mutex->pfs_psi, PSI_MUTEX_TRYLOCK);
-		if (locker) {
-			PSI_server->start_mutex_wait(locker, file_name, line);
-		}
-	}
 
-	ret = mutex_enter_nowait_func(mutex, file_name, line);
+	if (mutex->pfs_psi != NULL)
+	{
+		PSI_mutex_locker*	locker;
+		PSI_mutex_locker_state		state;
+
+		locker = PSI_CALL(start_mutex_wait)(&state, mutex->pfs_psi,
+			PSI_MUTEX_TRYLOCK, file_name, line);
+
+		ret = mutex_enter_nowait_func(mutex, file_name, line);
 
-	if (locker) {
-		PSI_server->end_mutex_wait(locker, ret);
+		if (locker != NULL)
+			PSI_CALL(end_mutex_wait)(locker, (int) ret);
+	}
+	else
+	{
+		ret = mutex_enter_nowait_func(mutex, file_name, line);
 	}
 
 	return(ret);
@@ -300,9 +302,8 @@ pfs_mutex_exit_func(
 /*================*/
 	mutex_t*	mutex)	/*!< in: pointer to mutex */
 {
-	if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
-		PSI_server->unlock_mutex(mutex->pfs_psi);
-	}
+	if (mutex->pfs_psi != NULL)
+		PSI_CALL(unlock_mutex)(mutex->pfs_psi);
 
 	mutex_exit_func(mutex);
 }
@@ -328,9 +329,7 @@ pfs_mutex_create_func(
 	const char*	cfile_name,	/*!< in: file name where created */
 	ulint		cline)		/*!< in: file line where created */
 {
-	mutex->pfs_psi = (PSI_server && PFS_IS_INSTRUMENTED(key))
-				? PSI_server->init_mutex(key, mutex)
-				: NULL;
+	mutex->pfs_psi = PSI_CALL(init_mutex)(key, mutex);
 
 	mutex_create_func(mutex,
 # ifdef UNIV_DEBUG
@@ -353,8 +352,9 @@ pfs_mutex_free_func(
 /*================*/
 	mutex_t*	mutex)	/*!< in: mutex */
 {
-	if (UNIV_LIKELY(PSI_server && mutex->pfs_psi)) {
-		PSI_server->destroy_mutex(mutex->pfs_psi);
+	if (mutex->pfs_psi != NULL)
+	{
+		PSI_CALL(destroy_mutex)(mutex->pfs_psi);
 		mutex->pfs_psi = NULL;
 	}
 
@@ -362,3 +362,43 @@ pfs_mutex_free_func(
 }
 
 #endif /* UNIV_PFS_MUTEX */
+
+#ifndef HAVE_ATOMIC_BUILTINS
+/**********************************************************//**
+Function that uses a mutex to decrement a variable atomically */
+UNIV_INLINE
+void
+os_atomic_dec_ulint_func(
+/*=====================*/
+	mutex_t*	mutex,		/*!< in: mutex guarding the dec */
+	volatile ulint*	var,		/*!< in/out: variable to decrement */
+	ulint		delta)		/*!< in: delta to decrement */
+{
+	mutex_enter(mutex);
+
+	/* I don't think we will encounter a situation where
+	this check will not be required. */
+	ut_ad(*var >= delta);
+
+	*var -= delta;
+
+	mutex_exit(mutex);
+}
+
+/**********************************************************//**
+Function that uses a mutex to increment a variable atomically */
+UNIV_INLINE
+void
+os_atomic_inc_ulint_func(
+/*=====================*/
+	mutex_t*	mutex,		/*!< in: mutex guarding the increment */
+	volatile ulint*	var,		/*!< in/out: variable to increment */
+	ulint		delta)		/*!< in: delta to increment */
+{
+	mutex_enter(mutex);
+
+	*var += delta;
+
+	mutex_exit(mutex);
+}
+#endif /* !HAVE_ATOMIC_BUILTINS */
diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h
index 1911bbac7fd..679cf6a9074 100644
--- a/storage/innobase/include/sync0types.h
+++ b/storage/innobase/include/sync0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h
index 73896a3cb76..c286fc4d9ae 100644
--- a/storage/innobase/include/trx0i_s.h
+++ b/storage/innobase/include/trx0i_s.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -66,13 +66,15 @@ do {								\
 		strncpy(buff, data, constraint);		\
 		buff[constraint] = '\0';			\
 								\
-		field = ha_storage_put_memlim(			\
+		field = static_cast<const char*>(		\
+			ha_storage_put_memlim(			\
 			(tcache)->storage, buff, constraint + 1,\
-			MAX_ALLOWED_FOR_STORAGE(tcache));	\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
 	} else {						\
-		field = ha_storage_put_str_memlim(		\
+		field = static_cast<const char*>(		\
+			ha_storage_put_str_memlim(		\
 			(tcache)->storage, data,		\
-			MAX_ALLOWED_FOR_STORAGE(tcache));	\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
 	}							\
 } while (0)
 
@@ -173,6 +175,11 @@ struct i_s_trx_row_struct {
 	ulint		trx_search_latch_timeout;
 					/*!< search_latch_timeout in
 					trx_struct */
+	ulint		trx_is_read_only;
+					/*!< trx_t::read_only */
+	ulint		trx_is_autocommit_non_locking;
+					/*!< trx_is_autocommit_non_locking(trx)
+					*/
 };
 
 /** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
index 2bd9e64476b..0199083467c 100644
--- a/storage/innobase/include/trx0purge.h
+++ b/storage/innobase/include/trx0purge.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -52,17 +52,6 @@ trx_purge_get_log_from_hist(
 /*========================*/
 	fil_addr_t	node_addr);	/*!< in: file address of the history
 					list node of the log */
-/*****************************************************************//**
-Checks if trx_id is >= purge_view: then it is guaranteed that its update
-undo log still exists in the system.
-@return TRUE if is sure that it is preserved, also if the function
-returns FALSE, it is possible that the undo log still exists in the
-system */
-UNIV_INTERN
-ibool
-trx_purge_update_undo_must_exist(
-/*=============================*/
-	trx_id_t	trx_id);/*!< in: transaction id */
 /********************************************************************//**
 Creates the global purge system control structure and inits the history
 mutex. */
@@ -70,7 +59,8 @@ UNIV_INTERN
 void
 trx_purge_sys_create(
 /*=================*/
-	ib_bh_t*	ib_bh);	/*!< in/own: UNDO log min binary heap*/
+	ulint		n_purge_threads,/*!< in: number of purge threads */
+	ib_bh_t*	ib_bh);		/*!< in/own: UNDO log min binary heap*/
 /********************************************************************//**
 Frees the global purge system control structure. */
 UNIV_INTERN
@@ -88,26 +78,6 @@ trx_purge_add_update_undo_to_history(
 	page_t*	undo_page,	/*!< in: update undo log header page,
 				x-latched */
 	mtr_t*	mtr);		/*!< in: mtr */
-/********************************************************************//**
-Fetches the next undo log record from the history list to purge. It must be
-released with the corresponding release function.
-@return copy of an undo log record or pointer to trx_purge_dummy_rec,
-if the whole undo log can skipped in purge; NULL if none left */
-UNIV_INTERN
-trx_undo_rec_t*
-trx_purge_fetch_next_rec(
-/*=====================*/
-	roll_ptr_t*	roll_ptr,/*!< out: roll pointer to undo record */
-	trx_undo_inf_t** cell,	/*!< out: storage cell for the record in the
-				purge array */
-	mem_heap_t*	heap);	/*!< in: memory heap where copied */
-/*******************************************************************//**
-Releases a reserved purge undo record. */
-UNIV_INTERN
-void
-trx_purge_rec_release(
-/*==================*/
-	trx_undo_inf_t*	cell);	/*!< in: storage cell */
 /*******************************************************************//**
 This function runs a purge batch.
 @return	number of undo log pages handled in the batch */
@@ -115,47 +85,92 @@ UNIV_INTERN
 ulint
 trx_purge(
 /*======*/
-	ulint	limit);		/*!< in: the maximum number of records to
-				purge in one batch */
-/******************************************************************//**
-Prints information of the purge system to stderr. */
+	ulint	n_purge_threads,	/*!< in: number of purge tasks to
+					submit to task queue. */
+	ulint	limit,			/*!< in: the maximum number of
+					records to purge in one batch */
+	bool	truncate);		/*!< in: truncate history if true */
+/*******************************************************************//**
+Stop purge and wait for it to stop, move to PURGE_STATE_STOP. */
 UNIV_INTERN
 void
-trx_purge_sys_print(void);
-/*======================*/
+trx_purge_stop(void);
+/*================*/
+/*******************************************************************//**
+Resume purge, move to PURGE_STATE_RUN. */
+UNIV_INTERN
+void
+trx_purge_run(void);
+/*================*/
+
+/** Purge states */
+enum purge_state_t {
+	PURGE_STATE_INIT,		/*!< Purge instance created */
+	PURGE_STATE_RUN,		/*!< Purge should be running */
+	PURGE_STATE_STOP,		/*!< Purge should be stopped */
+	PURGE_STATE_EXIT		/*!< Purge has been shutdown */
+};
+
+/*******************************************************************//**
+Get the purge state.
+@return purge state. */
+UNIV_INTERN
+purge_state_t
+trx_purge_state(void);
+/*=================*/
+
+/** This is the purge pointer/iterator. We need both the undo no and the
+transaction no up to which purge has parsed and applied the records. */
+typedef struct purge_iter_struct {
+	trx_id_t	trx_no;		/*!< Purge has advanced past all
+					transactions whose number is less
+					than this */
+	undo_no_t	undo_no;	/*!< Purge has advanced past all records
+					whose undo number is less than this */
+} purge_iter_t;
 
 /** The control structure used in the purge operation */
 struct trx_purge_struct{
-	ulint		state;		/*!< Purge system state */
 	sess_t*		sess;		/*!< System session running the purge
 					query */
 	trx_t*		trx;		/*!< System transaction running the
-					purge
-					query: this trx is not in the trx list
-					of the trx system and it never ends */
-	que_t*		query;		/*!< The query graph which will do the
-					parallelized purge operation */
+					purge query: this trx is not in the
+					trx list of the trx system and it
+					never ends */
 	rw_lock_t	latch;		/*!< The latch protecting the purge
-					view.  A purge operation must acquire
-					an x-latch here for the instant at which
+					view. A purge operation must acquire an
+					x-latch here for the instant at which
 					it changes the purge view: an undo
 					log operation can prevent this by
-					obtaining an s-latch here. */
+					obtaining an s-latch here. It also
+					protects state and running */
+	os_event_t	event;		/*!< State signal event */
+	ulint		n_stop;		/*!< Counter to track number stops */
+	bool		running;	/*!< true, if purge is active */
+	volatile purge_state_t	state;	/*!< Purge coordinator thread states,
+					we check this in several places
+					without holding the latch. */
+	que_t*		query;		/*!< The query graph which will do the
+					parallelized purge operation */
 	read_view_t*	view;		/*!< The purge will not remove undo logs
 					which are >= this view (purge view) */
-	ulonglong	n_pages_handled;/*!< Approximate number of undo log
-					pages processed in purge */
-	ulonglong	handle_limit;	/*!< Target of how many pages to get
-					processed in the current purge */
+	volatile ulint	n_submitted;	/*!< Count of total tasks submitted
+					to the task queue */
+	volatile ulint	n_completed;	/*!< Count of total tasks completed */
+
 	/*------------------------------*/
 	/* The following two fields form the 'purge pointer' which advances
 	during a purge, and which is used in history list truncation */
 
-	trx_id_t	purge_trx_no;	/*!< Purge has advanced past all
-					transactions whose number is less
-					than this */
-	undo_no_t	purge_undo_no;	/*!< Purge has advanced past all records
-					whose undo number is less than this */
+	purge_iter_t	iter;		/* Limit up to which we have read and
+					parsed the UNDO log records.  Not
+					necessarily purged from the indexes.
+					Note that this can never be less than
+					the limit below, we check for this
+					invariant in trx0purge.cc */
+	purge_iter_t	limit;		/* The 'purge pointer' which advances
+					during a purge, and which is used in
+					history list truncation */
 	/*-----------------------------*/
 	ibool		next_stored;	/*!< TRUE if the info of the next record
 					to purge is stored below: if yes, then
@@ -174,9 +189,6 @@ struct trx_purge_struct{
 					the next record to purge belongs */
 	ulint		hdr_offset;	/*!< Header byte offset on the page */
 	/*-----------------------------*/
-	trx_undo_arr_t*	arr;		/*!< Array of transaction numbers and
-					undo numbers of the undo records
-					currently under processing in purge */
 	mem_heap_t*	heap;		/*!< Temporary storage used during a
 					purge: can be emptied after purge
 					completes */
@@ -187,9 +199,14 @@ struct trx_purge_struct{
 	mutex_t		bh_mutex;	/*!< Mutex protecting ib_bh */
 };
 
-#define TRX_PURGE_ON		1	/* purge operation is running */
-#define TRX_STOP_PURGE		2	/* purge operation is stopped, or
-					it should be stopped */
+/** Info required to purge a record */
+struct trx_purge_rec_struct {
+	trx_undo_rec_t*	undo_rec;	/*!< Record to purge */
+	roll_ptr_t	roll_ptr;	/*!< File pointr to UNDO record */
+};
+
+typedef struct trx_purge_rec_struct trx_purge_rec_t;
+
 #ifndef UNIV_NONINL
 #include "trx0purge.ic"
 #endif
diff --git a/storage/innobase/include/trx0purge.ic b/storage/innobase/include/trx0purge.ic
index de09e393654..ca9cc1fb894 100644
--- a/storage/innobase/include/trx0purge.ic
+++ b/storage/innobase/include/trx0purge.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -41,3 +41,22 @@ trx_purge_get_log_from_hist(
 	return(node_addr);
 }
 
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+address of its history list node.
+@return	TRUE if purge_sys_t::limit <= purge_sys_t::iter*/
+UNIV_INLINE
+ibool
+trx_purge_check_limit(void)
+/*=======================*/
+{
+	ut_ad(purge_sys->limit.trx_no <= purge_sys->iter.trx_no);
+
+	if (purge_sys->limit.trx_no == purge_sys->iter.trx_no) {
+		ut_ad(purge_sys->limit.undo_no <= purge_sys->iter.undo_no);
+	}
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h
index 477748f6f89..c9fae45dad4 100644
--- a/storage/innobase/include/trx0rec.h
+++ b/storage/innobase/include/trx0rec.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -239,32 +239,13 @@ trx_undo_get_undo_rec_low(
 /*======================*/
 	roll_ptr_t	roll_ptr,	/*!< in: roll pointer to record */
 	mem_heap_t*	heap);		/*!< in: memory heap where copied */
-/******************************************************************//**
-Copies an undo record to heap.
-
-NOTE: the caller must have latches on the clustered index page and
-purge_view.
-
-@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been
-truncated and we cannot fetch the old version */
-UNIV_INTERN
-ulint
-trx_undo_get_undo_rec(
-/*==================*/
-	roll_ptr_t	roll_ptr,	/*!< in: roll pointer to record */
-	trx_id_t	trx_id,		/*!< in: id of the trx that generated
-					the roll pointer: it points to an
-					undo log of this transaction */
-	trx_undo_rec_t** undo_rec,	/*!< out, own: copy of the record */
-	mem_heap_t*	heap);		/*!< in: memory heap where copied */
 /*******************************************************************//**
-Build a previous version of a clustered index record. This function checks
-that the caller has a latch on the index page of the clustered index record
-and an s-latch on the purge_view. This guarantees that the stack of versions
-is locked.
+Build a previous version of a clustered index record. The caller must
+hold a latch on the index page of the clustered index record, to
+guarantee that the stack of versions is locked all the way down to the
+purge_sys->view.
 @return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is
-earlier than purge_view, which means that it may have been removed,
-DB_ERROR if corrupted record */
+earlier than purge_view, which means that it may have been removed */
 UNIV_INTERN
 ulint
 trx_undo_prev_version_build(
diff --git a/storage/innobase/include/trx0rec.ic b/storage/innobase/include/trx0rec.ic
index 4fc5a7147f9..847c26f03a8 100644
--- a/storage/innobase/include/trx0rec.ic
+++ b/storage/innobase/include/trx0rec.ic
@@ -108,6 +108,6 @@ trx_undo_rec_copy(
 	len = mach_read_from_2(undo_rec)
 		- ut_align_offset(undo_rec, UNIV_PAGE_SIZE);
 	ut_ad(len < UNIV_PAGE_SIZE);
-	return(mem_heap_dup(heap, undo_rec, len));
+	return((trx_undo_rec_t*) mem_heap_dup(heap, undo_rec, len));
 }
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h
index 1dee5655c8c..3b724e03830 100644
--- a/storage/innobase/include/trx0roll.h
+++ b/storage/innobase/include/trx0roll.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -32,8 +32,6 @@ Created 3/26/1996 Heikki Tuuri
 #include "mtr0mtr.h"
 #include "trx0sys.h"
 
-#define trx_roll_free_all_savepoints(s) trx_roll_savepoints_free((s), NULL)
-
 /*******************************************************************//**
 Determines if this transaction is rolling back an incomplete transaction
 in crash recovery.
@@ -53,12 +51,6 @@ trx_savept_take(
 /*============*/
 	trx_t*	trx);	/*!< in: transaction */
 /*******************************************************************//**
-Creates an undo number array. */
-UNIV_INTERN
-trx_undo_arr_t*
-trx_undo_arr_create(void);
-/*=====================*/
-/*******************************************************************//**
 Frees an undo number array. */
 UNIV_INTERN
 void
@@ -74,13 +66,6 @@ trx_undo_arr_get_nth_info(
 /*======================*/
 	trx_undo_arr_t*	arr,	/*!< in: undo number array */
 	ulint		n);	/*!< in: position */
-/***********************************************************************//**
-Tries truncate the undo logs. */
-UNIV_INTERN
-void
-trx_roll_try_truncate(
-/*==================*/
-	trx_t*	trx);	/*!< in/out: transaction */
 /********************************************************************//**
 Pops the topmost record when the two undo logs of a transaction are seen
 as a single stack of records ordered by their undo numbers. Inserts the
@@ -116,19 +101,6 @@ trx_undo_rec_release(
 /*=================*/
 	trx_t*		trx,	/*!< in/out: transaction */
 	undo_no_t	undo_no);/*!< in: undo number */
-/*********************************************************************//**
-Starts a rollback operation. */
-UNIV_INTERN
-void
-trx_rollback(
-/*=========*/
-	trx_t*		trx,	/*!< in: transaction */
-	trx_sig_t*	sig,	/*!< in: signal starting the rollback */
-	que_thr_t**	next_thr);/*!< in/out: next query thread to run;
-				if the value which is passed in is
-				a pointer to a NULL pointer, then the
-				calling function can start running
-				a new query thread */
 /*******************************************************************//**
 Rollback or clean up any incomplete transactions which were
 encountered in crash recovery.  If the transaction already was
@@ -147,38 +119,13 @@ committed, then we clean up a possible insert undo log. If the
 transaction was not yet committed, then we roll it back.
 Note: this is done in a background thread.
 @return	a dummy parameter */
-UNIV_INTERN
+extern "C" UNIV_INTERN
 os_thread_ret_t
-trx_rollback_or_clean_all_recovered(
-/*================================*/
+DECLARE_THREAD(trx_rollback_or_clean_all_recovered)(
+/*================================================*/
 	void*	arg __attribute__((unused)));
 			/*!< in: a dummy parameter required by
 			os_thread_create */
-/****************************************************************//**
-Finishes a transaction rollback. */
-UNIV_INTERN
-void
-trx_finish_rollback_off_kernel(
-/*===========================*/
-	que_t*		graph,	/*!< in: undo graph which can now be freed */
-	trx_t*		trx,	/*!< in: transaction */
-	que_thr_t**	next_thr);/*!< in/out: next query thread to run;
-				if the value which is passed in is
-				a pointer to a NULL pointer, then the
-				calling function can start running
-				a new query thread; if this parameter is
-				NULL, it is ignored */
-/****************************************************************//**
-Builds an undo 'query' graph for a transaction. The actual rollback is
-performed by executing this query graph like a query subprocedure call.
-The reply about the completion of the rollback will be sent by this
-graph.
-@return	own: the query graph */
-UNIV_INTERN
-que_t*
-trx_roll_graph_build(
-/*=================*/
-	trx_t*	trx);	/*!< in: trx handle */
 /*********************************************************************//**
 Creates a rollback command node struct.
 @return	own: rollback node struct */
@@ -202,7 +149,7 @@ UNIV_INTERN
 int
 trx_rollback_for_mysql(
 /*===================*/
-	trx_t*	trx);	/*!< in: transaction handle */
+	trx_t*	trx);	/*!< in/out: transaction */
 /*******************************************************************//**
 Rollback the latest SQL statement for MySQL.
 @return	error code or DB_SUCCESS */
@@ -210,14 +157,14 @@ UNIV_INTERN
 int
 trx_rollback_last_sql_stat_for_mysql(
 /*=================================*/
-	trx_t*	trx);	/*!< in: transaction handle */
+	trx_t*	trx);	/*!< in/out: transaction */
 /*******************************************************************//**
-Rollback a transaction used in MySQL.
+Rollback a transaction to a given savepoint or do a complete rollback.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
 int
-trx_general_rollback_for_mysql(
-/*===========================*/
+trx_rollback_to_savepoint(
+/*======================*/
 	trx_t*		trx,	/*!< in: transaction handle */
 	trx_savept_t*	savept);/*!< in: pointer to savepoint undo number, if
 				partial rollback requested, or NULL for
@@ -273,17 +220,7 @@ trx_release_savepoint_for_mysql(
 	const char*	savepoint_name);	/*!< in: savepoint name */
 
 /*******************************************************************//**
-Frees a single savepoint struct. */
-UNIV_INTERN
-void
-trx_roll_savepoint_free(
-/*=====================*/
-	trx_t*			trx,	/*!< in: transaction handle */
-	trx_named_savept_t*	savep);	/*!< in: savepoint to free */
-
-/*******************************************************************//**
-Frees savepoint structs starting from savep, if savep == NULL then
-free all savepoints. */
+Frees savepoint structs starting from savep. */
 UNIV_INTERN
 void
 trx_roll_savepoints_free(
@@ -295,10 +232,10 @@ trx_roll_savepoints_free(
 
 /** A cell of trx_undo_arr_struct; used during a rollback and a purge */
 struct	trx_undo_inf_struct{
+	ibool		in_use;	/*!< true if cell is being used */
 	trx_id_t	trx_no;	/*!< transaction number: not defined during
 				a rollback */
 	undo_no_t	undo_no;/*!< undo number of an undo record */
-	ibool		in_use;	/*!< TRUE if the cell is in use */
 };
 
 /** During a rollback and a purge, undo numbers of undo records currently being
@@ -306,17 +243,18 @@ processed are stored in this array */
 
 struct trx_undo_arr_struct{
 	ulint		n_cells;	/*!< number of cells in the array */
-	ulint		n_used;		/*!< number of cells currently in use */
+	ulint		n_used;		/*!< number of cells in use */
 	trx_undo_inf_t*	infos;		/*!< the array of undo infos */
 	mem_heap_t*	heap;		/*!< memory heap from which allocated */
 };
 
 /** Rollback node states */
 enum roll_node_state {
-	ROLL_NODE_SEND = 1,	/*!< about to send a rollback signal to
-				the transaction */
-	ROLL_NODE_WAIT		/*!< rollback signal sent to the transaction,
-				waiting for completion */
+	ROLL_NODE_NONE = 0,		/*!< Unknown state */
+	ROLL_NODE_SEND,			/*!< about to send a rollback signal to
+					the transaction */
+	ROLL_NODE_WAIT			/*!< rollback signal sent to the
+				       	transaction, waiting for completion */
 };
 
 /** Rollback command node in a query graph */
@@ -328,6 +266,7 @@ struct roll_node_struct{
 	trx_savept_t		savept;	/*!< savepoint to which to
 					roll back, in the case of a
 					partial rollback */
+	que_thr_t*		undo_thr;/*!< undo query graph */
 };
 
 /** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */
diff --git a/storage/innobase/include/trx0roll.ic b/storage/innobase/include/trx0roll.ic
index 3460832b18c..178e9bb730a 100644
--- a/storage/innobase/include/trx0roll.ic
+++ b/storage/innobase/include/trx0roll.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
index 5acde05de3d..66e5449cf57 100644
--- a/storage/innobase/include/trx0rseg.h
+++ b/storage/innobase/include/trx0rseg.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -29,6 +29,7 @@ Created 3/26/1996 Heikki Tuuri
 #include "univ.i"
 #include "trx0types.h"
 #include "trx0sys.h"
+#include "ut0bh.h"
 
 /******************************************************************//**
 Gets a rollback segment header.
@@ -86,11 +87,11 @@ trx_rsegf_undo_find_free(
 /******************************************************************//**
 Looks for a rollback segment, based on the rollback segment id.
 @return	rollback segment */
-UNIV_INTERN
+UNIV_INLINE
 trx_rseg_t*
 trx_rseg_get_on_id(
 /*===============*/
-	ulint	id);	/*!< in: rollback segment id */
+	ulint	id);		/*!< in: rollback segment id */
 /****************************************************************//**
 Creates a rollback segment header. This function is called only when
 a new rollback segment is created in the database.
@@ -107,30 +108,42 @@ trx_rseg_header_create(
 	mtr_t*	mtr);		/*!< in: mtr */
 /*********************************************************************//**
 Creates the memory copies for rollback segments and initializes the
-rseg list and array in trx_sys at a database startup. */
+rseg array in trx_sys at a database startup. */
 UNIV_INTERN
 void
-trx_rseg_list_and_array_init(
-/*=========================*/
-	trx_sysf_t*	sys_header,	/*!< in: trx system header */
+trx_rseg_array_init(
+/*================*/
+	trx_sysf_t*	sys_header,	/*!< in/out: trx system header */
 	ib_bh_t*	ib_bh,		/*!< in: rseg queue */
-	mtr_t*		mtr);		/*!< in: mtr */
-
+	mtr_t*		mtr);		/*!< in/out: mtr */
 /***************************************************************************
 Free's an instance of the rollback segment in memory. */
 UNIV_INTERN
 void
 trx_rseg_mem_free(
 /*==============*/
-	trx_rseg_t*	rseg);		/* in, own: instance to free */
+	trx_rseg_t*	rseg);		/*!< in, own: instance to free */
 
 /*********************************************************************
 Creates a rollback segment. */
 UNIV_INTERN
 trx_rseg_t*
-trx_rseg_create(void);
-/*==================*/
-
+trx_rseg_create(
+/*============*/
+	ulint	space);			/*!< in: id of UNDO tablespace */
+
+/********************************************************************
+Get the number of unique rollback tablespaces in use except space id 0.
+The last space id will be the sentinel value ULINT_UNDEFINED. The array
+will be sorted on space id. Note: space_ids should have have space for
+TRX_SYS_N_RSEGS + 1 elements.
+@return number of unique rollback tablespaces in use. */
+UNIV_INTERN
+ulint
+trx_rseg_get_n_undo_tablespaces(
+/*============================*/
+	ulint*		space_ids);	/*!< out: array of space ids of
+					UNDO tablespaces */
 /* Number of undo log slots in a rollback segment file copy */
 #define TRX_RSEG_N_SLOTS	(UNIV_PAGE_SIZE / 16)
 
@@ -176,16 +189,12 @@ struct trx_rseg_struct{
 					yet purged log */
 	ibool		last_del_marks;	/*!< TRUE if the last not yet purged log
 					needs purging */
-	/*--------------------------------------------------------*/
-	UT_LIST_NODE_T(trx_rseg_t) rseg_list;
-					/* the list of the rollback segment
-					memory objects */
 };
 
 /** For prioritising the rollback segments for purge. */
 struct rseg_queue_struct {
-	trx_id_t	trx_no;		/*!< trx_rseg_t::last_trx_no */
-	trx_rseg_t*	rseg;		/*!< Rollback segment */
+        trx_id_t	trx_no;         /*!< trx_rseg_t::last_trx_no */
+        trx_rseg_t*     rseg;           /*!< Rollback segment */
 };
 
 typedef struct rseg_queue_struct rseg_queue_t;
diff --git a/storage/innobase/include/trx0rseg.ic b/storage/innobase/include/trx0rseg.ic
index 5e8d2b41120..30743da9b8c 100644
--- a/storage/innobase/include/trx0rseg.ic
+++ b/storage/innobase/include/trx0rseg.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -86,7 +86,7 @@ trx_rsegf_get_nth_undo(
 	ulint		n,	/*!< in: index of slot */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) {
+	if (n >= TRX_RSEG_N_SLOTS) {
 		fprintf(stderr,
 			"InnoDB: Error: trying to get slot %lu of rseg\n",
 			(ulong) n);
@@ -108,7 +108,7 @@ trx_rsegf_set_nth_undo(
 	ulint		page_no,/*!< in: page number of the undo log segment */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) {
+	if (n >= TRX_RSEG_N_SLOTS) {
 		fprintf(stderr,
 			"InnoDB: Error: trying to set slot %lu of rseg\n",
 			(ulong) n);
@@ -150,3 +150,18 @@ trx_rsegf_undo_find_free(
 
 	return(ULINT_UNDEFINED);
 }
+
+/******************************************************************//**
+Looks for a rollback segment, based on the rollback segment id.
+@return	rollback segment */
+UNIV_INLINE
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+	ulint	id)	/*!< in: rollback segment id */
+{
+	ut_a(id < TRX_SYS_N_RSEGS);
+
+	return(trx_sys->rseg_array[id]);
+}
+
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
index 3913792d594..a454c682f89 100644
--- a/storage/innobase/include/trx0sys.h
+++ b/storage/innobase/include/trx0sys.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -41,6 +41,9 @@ Created 3/26/1996 Heikki Tuuri
 #include "ut0bh.h"
 #include "read0types.h"
 #include "page0types.h"
+#include "ut0bh.h"
+
+typedef UT_LIST_BASE_NODE_T(trx_t) trx_list_t;
 
 /** In a MySQL replication slave, in crash recovery we store the master log
 file name and position here. */
@@ -66,53 +69,6 @@ extern ib_int64_t	trx_sys_mysql_bin_log_pos;
 /** The transaction system */
 extern trx_sys_t*	trx_sys;
 
-/** Doublewrite system */
-extern trx_doublewrite_t*	trx_doublewrite;
-/** The following is set to TRUE when we are upgrading from pre-4.1
-format data files to the multiple tablespaces format data files */
-extern ibool			trx_doublewrite_must_reset_space_ids;
-/** Set to TRUE when the doublewrite buffer is being created */
-extern ibool			trx_doublewrite_buf_is_being_created;
-/** The following is TRUE when we are using the database in the
-post-4.1 format, i.e., we have successfully upgraded, or have created
-a new database installation */
-extern ibool			trx_sys_multiple_tablespace_format;
-
-/****************************************************************//**
-Creates the doublewrite buffer to a new InnoDB installation. The header of the
-doublewrite buffer is placed on the trx system header page. */
-UNIV_INTERN
-void
-trx_sys_create_doublewrite_buf(void);
-/*================================*/
-/****************************************************************//**
-At a database startup initializes the doublewrite buffer memory structure if
-we already have a doublewrite buffer created in the data files. If we are
-upgrading to an InnoDB version which supports multiple tablespaces, then this
-function performs the necessary update operations. If we are in a crash
-recovery, this function uses a possible doublewrite buffer to restore
-half-written pages in the data files. */
-UNIV_INTERN
-void
-trx_sys_doublewrite_init_or_restore_pages(
-/*======================================*/
-	ibool	restore_corrupt_pages);	/*!< in: TRUE=restore pages */
-/****************************************************************//**
-Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
-multiple tablespace format. */
-UNIV_INTERN
-void
-trx_sys_mark_upgraded_to_multiple_tablespaces(void);
-/*===============================================*/
-/****************************************************************//**
-Determines if a page number is located inside the doublewrite buffer.
-@return TRUE if the location is inside the two blocks of the
-doublewrite buffer */
-UNIV_INTERN
-ibool
-trx_doublewrite_page_inside(
-/*========================*/
-	ulint	page_no);	/*!< in: page number */
 /***************************************************************//**
 Checks if a page address is the trx sys header page.
 @return	TRUE if trx sys header page */
@@ -124,17 +80,24 @@ trx_sys_hdr_page(
 	ulint	page_no);/*!< in: page number */
 /*****************************************************************//**
 Creates and initializes the central memory structures for the transaction
-system. This is called when the database is started. */
+system. This is called when the database is started.
+@return min binary heap of rsegs to purge */
 UNIV_INTERN
-void
+ib_bh_t*
 trx_sys_init_at_db_start(void);
 /*==========================*/
 /*****************************************************************//**
-Creates and initializes the transaction system at the database creation. */
+Creates the trx_sys instance and initializes ib_bh and mutex. */
 UNIV_INTERN
 void
 trx_sys_create(void);
 /*================*/
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+UNIV_INTERN
+void
+trx_sys_create_sys_pages(void);
+/*==========================*/
 /****************************************************************//**
 Looks for a free slot for a rollback segment in the trx system file copy.
 @return	slot index or ULINT_UNDEFINED if not found */
@@ -152,16 +115,6 @@ trx_sys_get_nth_rseg(
 /*=================*/
 	trx_sys_t*	sys,	/*!< in: trx system */
 	ulint		n);	/*!< in: index of slot */
-/***************************************************************//**
-Sets the pointer in the nth slot of the rseg array. */
-UNIV_INLINE
-void
-trx_sys_set_nth_rseg(
-/*=================*/
-	trx_sys_t*	sys,	/*!< in: trx system */
-	ulint		n,	/*!< in: index of slot */
-	trx_rseg_t*	rseg);	/*!< in: pointer to rseg object, NULL if slot
-				not in use */
 /**********************************************************************//**
 Gets a pointer to the transaction system file copy and x-locks its page.
 @return	pointer to system file copy, page x-locked */
@@ -222,6 +175,14 @@ UNIV_INLINE
 trx_id_t
 trx_sys_get_new_trx_id(void);
 /*========================*/
+/*****************************************************************//**
+Determines the maximum transaction id.
+@return maximum currently allocated trx id; will be stale after the
+next call to trx_sys_get_new_trx_id() */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_max_trx_id(void);
+/*========================*/
 #endif /* !UNIV_HOTBACKUP */
 
 #ifdef UNIV_DEBUG
@@ -251,39 +212,65 @@ trx_read_trx_id(
 /*============*/
 	const byte*	ptr);	/*!< in: pointer to memory from where to read */
 /****************************************************************//**
-Looks for the trx handle with the given id in trx_list.
-@return	the trx handle or NULL if not found */
+Looks for the trx instance with the given id in the rw trx_list.
+The caller must be holding trx_sys->mutex.
+@return	the trx handle or NULL if not found;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
 UNIV_INLINE
 trx_t*
-trx_get_on_id(
-/*==========*/
+trx_get_rw_trx_by_id(
+/*=================*/
 	trx_id_t	trx_id);/*!< in: trx id to search for */
 /****************************************************************//**
-Returns the minumum trx id in trx list. This is the smallest id for which
-the trx can possibly be active. (But, you must look at the trx->conc_state to
+Returns the minimum trx id in rw trx list. This is the smallest id for which
+the trx can possibly be active. (But, you must look at the trx->state to
 find out if the minimum trx id transaction itself is active, or already
 committed.)
 @return	the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
 UNIV_INLINE
 trx_id_t
-trx_list_get_min_trx_id(void);
-/*=========================*/
+trx_rw_min_trx_id(void);
+/*===================*/
 /****************************************************************//**
-Checks if a transaction with the given id is active.
-@return	TRUE if active */
+Checks if a rw transaction with the given id is active. Caller must hold
+trx_sys->mutex in shared mode. If the caller is not holding
+lock_sys->mutex, the transaction may already have been committed.
+@return	transaction instance if active, or NULL;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
 UNIV_INLINE
-ibool
-trx_is_active(
-/*==========*/
-	trx_id_t	trx_id);/*!< in: trx id of the transaction */
+trx_t*
+trx_rw_is_active_low(
+/*=================*/
+	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
+	ibool*		corrupt);	/*!< in: NULL or pointer to a flag
+					that will be set if corrupt */
 /****************************************************************//**
-Checks that trx is in the trx list.
+Checks if a rw transaction with the given id is active. If the caller is
+not holding lock_sys->mutex, the transaction may already have been
+committed.
+@return	transaction instance if active, or NULL;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
+UNIV_INLINE
+trx_t*
+trx_rw_is_active(
+/*=============*/
+	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
+	ibool*		corrupt);	/*!< in: NULL or pointer to a flag
+					that will be set if corrupt */
+#ifdef UNIV_DEBUG
+/****************************************************************//**
+Checks whether a trx is in one of rw_trx_list or ro_trx_list.
 @return	TRUE if is in */
 UNIV_INTERN
 ibool
 trx_in_trx_list(
 /*============*/
-	trx_t*	in_trx);/*!< in: trx */
+	const trx_t*	in_trx)		/*!< in: transaction */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 /***********************************************************//**
 Assert that a transaction has been recovered.
@@ -344,14 +331,12 @@ UNIV_INTERN
 void
 trx_sys_file_format_tag_init(void);
 /*==============================*/
-#ifndef UNIV_HOTBACKUP
 /*****************************************************************//**
 Shutdown/Close the transaction system. */
 UNIV_INTERN
 void
 trx_sys_close(void);
 /*===============*/
-#endif /* !UNIV_HOTBACKUP */
 /*****************************************************************//**
 Get the name representation of the file format from its id.
 @return	pointer to the name */
@@ -371,31 +356,30 @@ trx_sys_file_format_max_set(
 	ulint		format_id,	/*!< in: file format id */
 	const char**	name);		/*!< out: max file format name or
 					NULL if not needed. */
-/*****************************************************************//**
-Get the name representation of the file format from its id.
-@return	pointer to the max format name */
+/*********************************************************************
+Creates the rollback segments
+@return number of rollback segments that are active. */
 UNIV_INTERN
-const char*
-trx_sys_file_format_max_get(void);
-/*=============================*/
+ulint
+trx_sys_create_rsegs(
+/*=================*/
+	ulint	n_spaces,	/*!< number of tablespaces for UNDO logs */
+	ulint	n_rsegs);	/*!< number of rollback segments to create */
 /*****************************************************************//**
-Check for the max file format tag stored on disk.
-@return	DB_SUCCESS or error code */
-UNIV_INTERN
+Get the number of transaction in the system, independent of their state.
+@return count of transactions in trx_sys_t::trx_list */
+UNIV_INLINE
 ulint
-trx_sys_file_format_max_check(
-/*==========================*/
-	ulint		max_format_id);	/*!< in: the max format id to check */
-/********************************************************************//**
-Update the file format tag in the system tablespace only if the given
-format id is greater than the known max id.
-@return	TRUE if format_id was bigger than the known max id */
+trx_sys_get_n_rw_trx(void);
+/*======================*/
+
+/*********************************************************************
+Check if there are any active (non-prepared) transactions.
+@return total number of active transactions or 0 if none */
 UNIV_INTERN
-ibool
-trx_sys_file_format_max_upgrade(
-/*============================*/
-	const char**	name,		/*!< out: max file format name */
-	ulint		format_id);	/*!< in: file format identifier */
+ulint
+trx_sys_any_active_transactions(void);
+/*=================================*/
 #else /* !UNIV_HOTBACKUP */
 /*****************************************************************//**
 Prints to stderr the MySQL binlog info in the system header if the
@@ -432,6 +416,32 @@ trx_sys_read_pertable_file_format_id(
 				datafile */
 	ulint *format_id);	/*!< out: file format of the per-table
 				data file */
+#endif /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return	pointer to the max format name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_max_get(void);
+/*=============================*/
+/*****************************************************************//**
+Check for the max file format tag stored on disk.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_sys_file_format_max_check(
+/*==========================*/
+	ulint		max_format_id);	/*!< in: the max format id to check */
+/********************************************************************//**
+Update the file format tag in the system tablespace only if the given
+format id is greater than the known max id.
+@return	TRUE if format_id was bigger than the known max id */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_upgrade(
+/*============================*/
+	const char**	name,		/*!< out: max file format name */
+	ulint		format_id);	/*!< in: file format identifier */
 /*****************************************************************//**
 Get the name representation of the file format from its id.
 @return	pointer to the name */
@@ -441,14 +451,14 @@ trx_sys_file_format_id_to_name(
 /*===========================*/
 	const ulint	id);	/*!< in: id of the file format */
 
-#endif /* !UNIV_HOTBACKUP */
-/*********************************************************************
-Creates the rollback segments */
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Validate the trx_sys_t::trx_list. */
 UNIV_INTERN
-void
-trx_sys_create_rsegs(
-/*=================*/
-	ulint	n_rsegs);	/*!< number of rollback segments to create */
+ibool
+trx_sys_validate_trx_list(void);
+/*===========================*/
+#endif /* UNIV_DEBUG */
 
 /* The automatically created system rollback segment has this id */
 #define TRX_SYS_SYSTEM_RSEG_ID	0
@@ -502,8 +512,8 @@ We must remember this limit in order to keep file compatibility. */
 /** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
 #define TRX_SYS_MYSQL_LOG_MAGIC_N	873422344
 
-#if UNIV_PAGE_SIZE < 4096
-# error "UNIV_PAGE_SIZE < 4096"
+#if UNIV_PAGE_SIZE_MIN < 4096
+# error "UNIV_PAGE_SIZE_MIN < 4096"
 #endif
 /** The offset of the MySQL replication info in the trx system header;
 this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
@@ -559,7 +569,7 @@ this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
 /** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
 we must reset the doublewrite buffer, because starting from 4.1.x the
 space id of a data page is stored into
-FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO. */
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
 #define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
 
 /*-------------------------------------------------------------*/
@@ -572,7 +582,6 @@ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO. */
 #define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE	FSP_EXTENT_SIZE
 /* @} */
 
-#ifndef UNIV_HOTBACKUP
 /** File format tag */
 /* @{ */
 /** The offset of the file format tag on the trx system header page
@@ -591,48 +600,50 @@ identifier is added to this 64-bit constant. */
 	 | TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW)
 /* @} */
 
-/** Doublewrite control struct */
-struct trx_doublewrite_struct{
-	mutex_t	mutex;		/*!< mutex protecting the first_free field and
-				write_buf */
-	ulint	block1;		/*!< the page number of the first
-				doublewrite block (64 pages) */
-	ulint	block2;		/*!< page number of the second block */
-	ulint	first_free;	/*!< first free position in write_buf measured
-				in units of UNIV_PAGE_SIZE */
-	byte*	write_buf;	/*!< write buffer used in writing to the
-				doublewrite buffer, aligned to an
-				address divisible by UNIV_PAGE_SIZE
-				(which is required by Windows aio) */
-	byte*	write_buf_unaligned;
-				/*!< pointer to write_buf, but unaligned */
-	buf_page_t**
-		buf_block_arr;	/*!< array to store pointers to the buffer
-				blocks which have been cached to write_buf */
-};
-
-/** The transaction system central memory data structure; protected by the
-kernel mutex */
+#ifndef UNIV_HOTBACKUP
+/** The transaction system central memory data structure. */
 struct trx_sys_struct{
+
+	mutex_t		mutex;		/*!< mutex protecting most fields in
+					this structure except when noted
+					otherwise */
+	ulint		n_mysql_trx;	/*!< Number of transactions currently
+					allocated for MySQL */
+	ulint		n_prepared_trx;	/*!< Number of transactions currently
+					in the XA PREPARED state */
 	trx_id_t	max_trx_id;	/*!< The smallest number not yet
 					assigned as a transaction id or
 					transaction number */
-	UT_LIST_BASE_NODE_T(trx_t) trx_list;
-					/*!< List of active and committed in
-					memory transactions, sorted on trx id,
-					biggest first */
-	UT_LIST_BASE_NODE_T(trx_t) mysql_trx_list;
-					/*!< List of transactions created
-					for MySQL */
-	UT_LIST_BASE_NODE_T(trx_rseg_t) rseg_list;
-					/*!< List of rollback segment
-					objects */
-	trx_rseg_t*	latest_rseg;	/*!< Latest rollback segment in the
-					round-robin assignment of rollback
-					segments to transactions */
-	trx_rseg_t*	rseg_array[TRX_SYS_N_RSEGS];
+	trx_list_t	rw_trx_list;	/*!< List of active and committed in
+					memory read-write transactions, sorted
+					on trx id, biggest first. Recovered
+					transactions are always on this list. */
+	trx_list_t	ro_trx_list;	/*!< List of active and committed in
+					memory read-only transactions, sorted
+					on trx id, biggest first. NOTE:
+					The order for read-only transactions
+					is not necessary. We should exploit
+					this and increase concurrency during
+					add/remove. */
+	trx_list_t	mysql_trx_list;	/*!< List of transactions created
+					for MySQL. All transactions on
+					ro_trx_list are on mysql_trx_list. The
+					rw_trx_list can contain system
+					transactions and recovered transactions
+					that will not be in the mysql_trx_list.
+					There can be active non-locking
+					auto-commit read only transactions that
+					are on this list but not on ro_trx_list.
+					mysql_trx_list may additionally contain
+					transactions that have not yet been
+					started in InnoDB. */
+	trx_rseg_t*	const rseg_array[TRX_SYS_N_RSEGS];
 					/*!< Pointer array to rollback
-					segments; NULL if slot not in use */
+					segments; NULL if slot not in use;
+					created and destroyed in
+					single-threaded mode; not protected
+					by any mutex, because it is read-only
+					during multi-threaded operation */
 	ulint		rseg_history_len;/*!< Length of the TRX_RSEG_HISTORY
 					list (update undo logs for committed
 					transactions), protected by
diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic
index 5e702b25325..e097e29b551 100644
--- a/storage/innobase/include/trx0sys.ic
+++ b/storage/innobase/include/trx0sys.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -80,28 +80,11 @@ trx_sys_get_nth_rseg(
 	trx_sys_t*	sys,	/*!< in: trx system */
 	ulint		n)	/*!< in: index of slot */
 {
-	ut_ad(mutex_own(&(kernel_mutex)));
 	ut_ad(n < TRX_SYS_N_RSEGS);
 
 	return(sys->rseg_array[n]);
 }
 
-/***************************************************************//**
-Sets the pointer in the nth slot of the rseg array. */
-UNIV_INLINE
-void
-trx_sys_set_nth_rseg(
-/*=================*/
-	trx_sys_t*	sys,	/*!< in: trx system */
-	ulint		n,	/*!< in: index of slot */
-	trx_rseg_t*	rseg)	/*!< in: pointer to rseg object, NULL if slot
-				not in use */
-{
-	ut_ad(n < TRX_SYS_N_RSEGS);
-
-	sys->rseg_array[n] = rseg;
-}
-
 /**********************************************************************//**
 Gets a pointer to the transaction system header and x-latches its page.
 @return	pointer to system header, page x-latched. */
@@ -137,7 +120,6 @@ trx_sysf_rseg_get_space(
 	ulint		i,		/*!< in: slot index == rseg id */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ut_ad(mutex_own(&(kernel_mutex)));
 	ut_ad(sys_header);
 	ut_ad(i < TRX_SYS_N_RSEGS);
 
@@ -159,7 +141,6 @@ trx_sysf_rseg_get_page_no(
 	mtr_t*		mtr)		/*!< in: mtr */
 {
 	ut_ad(sys_header);
-	ut_ad(mutex_own(&(kernel_mutex)));
 	ut_ad(i < TRX_SYS_N_RSEGS);
 
 	return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS
@@ -179,7 +160,6 @@ trx_sysf_rseg_set_space(
 	ulint		space,		/*!< in: space id */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ut_ad(mutex_own(&(kernel_mutex)));
 	ut_ad(sys_header);
 	ut_ad(i < TRX_SYS_N_RSEGS);
 
@@ -203,7 +183,6 @@ trx_sysf_rseg_set_page_no(
 					slot is reset to unused */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	ut_ad(mutex_own(&(kernel_mutex)));
 	ut_ad(sys_header);
 	ut_ad(i < TRX_SYS_N_RSEGS);
 
@@ -251,30 +230,96 @@ trx_read_trx_id(
 }
 
 /****************************************************************//**
-Looks for the trx handle with the given id in trx_list.
-@return	the trx handle or NULL if not found */
+Looks for the trx handle with the given id in rw_trx_list.
+The caller must be holding trx_sys->mutex.
+@return	the trx handle or NULL if not found;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
 UNIV_INLINE
 trx_t*
-trx_get_on_id(
-/*==========*/
+trx_get_rw_trx_by_id(
+/*=================*/
 	trx_id_t	trx_id)	/*!< in: trx id to search for */
 {
-	trx_t*	trx;
+	trx_t*		trx;
+	ulint		len;
+	trx_t*		first;
 
-	ut_ad(mutex_own(&(kernel_mutex)));
+	ut_ad(mutex_own(&trx_sys->mutex));
 
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	len = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
 
-	while (trx != NULL) {
-		if (trx_id == trx->id) {
+	if (len == 0) {
+		return(NULL);
+	}
+
+	/* Because the list is ordered on trx id in descending order,
+	we try to speed things up a bit. */
+
+	trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+	assert_trx_in_rw_list(trx);
+
+	if (trx_id == trx->id) {
+		return(trx);
+	} else if (len == 1 || trx_id > trx->id) {
+		return(NULL);
+	}
+
+	first = trx;
 
-			return(trx);
+	trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list);
+	assert_trx_in_rw_list(trx);
+
+	if (trx_id == trx->id) {
+		return(trx);
+	} else if (len == 2 || trx_id < trx->id) {
+		return(NULL);
+	}
+
+	/* Search the list from the lower end (tail). */
+	if (trx_id < (first->id + trx->id) >> 1) {
+		for (trx = UT_LIST_GET_PREV(trx_list, trx);
+		     trx != NULL && trx_id > trx->id;
+		     trx = UT_LIST_GET_PREV(trx_list, trx)) {
+			assert_trx_in_rw_list(trx);
 		}
+	} else {
+		for (trx = UT_LIST_GET_NEXT(trx_list, first);
+		     trx != NULL && trx_id < trx->id;
+		     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+			assert_trx_in_rw_list(trx);
+		}
+	}
 
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
+	return((trx != NULL && trx->id == trx_id) ? trx : NULL);
+}
+
+/****************************************************************//**
+Returns the minimum trx id in trx list. This is the smallest id for which
+the trx can possibly be active. (But, you must look at the trx->state
+to find out if the minimum trx id transaction itself is active, or already
+committed.). The caller must be holding the trx_sys_t::mutex in shared mode.
+@return	the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
+UNIV_INLINE
+trx_id_t
+trx_rw_min_trx_id_low(void)
+/*=======================*/
+{
+	trx_id_t	id;
+	const trx_t*	trx;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list);
+
+	if (trx == NULL) {
+		id = trx_sys->max_trx_id;
+	} else {
+		assert_trx_in_rw_list(trx);
+		id = trx->id;
 	}
 
-	return(NULL);
+	return(id);
 }
 
 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
@@ -287,79 +332,109 @@ trx_assert_recovered(
 /*=================*/
 	trx_id_t	trx_id)		/*!< in: transaction identifier */
 {
-	trx_t*		trx;
+	const trx_t*	trx;
 
-	mutex_enter(&kernel_mutex);
-	trx = trx_get_on_id(trx_id);
-	ut_a(trx);
+	mutex_enter(&trx_sys->mutex);
+
+	trx = trx_get_rw_trx_by_id(trx_id);
 	ut_a(trx->is_recovered);
-	mutex_exit(&kernel_mutex);
+
+	mutex_exit(&trx_sys->mutex);
 
 	return(TRUE);
 }
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 /****************************************************************//**
-Returns the minumum trx id in trx list. This is the smallest id for which
-the trx can possibly be active. (But, you must look at the trx->conc_state to
-find out if the minimum trx id transaction itself is active, or already
+Returns the minimum trx id in rw trx list. This is the smallest id for which
+the rw trx can possibly be active. (But, you must look at the trx->state
+to find out if the minimum trx id transaction itself is active, or already
 committed.)
-@return	the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
+@return	the minimum trx id, or trx_sys->max_trx_id if rw trx list is empty */
 UNIV_INLINE
 trx_id_t
-trx_list_get_min_trx_id(void)
-/*=========================*/
+trx_rw_min_trx_id(void)
+/*===================*/
 {
-	trx_t*	trx;
+	trx_id_t	id;
 
-	ut_ad(mutex_own(&(kernel_mutex)));
+	mutex_enter(&trx_sys->mutex);
 
-	trx = UT_LIST_GET_LAST(trx_sys->trx_list);
+	id = trx_rw_min_trx_id_low();
 
-	if (trx == NULL) {
+	mutex_exit(&trx_sys->mutex);
 
-		return(trx_sys->max_trx_id);
-	}
-
-	return(trx->id);
+	return(id);
 }
 
 /****************************************************************//**
-Checks if a transaction with the given id is active.
-@return	TRUE if active */
+Checks if a rw transaction with the given id is active. Caller must hold
+trx_sys->mutex. If the caller is not holding lock_sys->mutex, the
+transaction may already have been committed.
+@return	transaction instance if active, or NULL;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
 UNIV_INLINE
-ibool
-trx_is_active(
-/*==========*/
-	trx_id_t	trx_id)	/*!< in: trx id of the transaction */
+trx_t*
+trx_rw_is_active_low(
+/*=================*/
+	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
+	ibool*		corrupt)	/*!< in: NULL or pointer to a flag
+					that will be set if corrupt */
 {
-	trx_t*	trx;
+	trx_t*		trx;
 
-	ut_ad(mutex_own(&(kernel_mutex)));
+	ut_ad(mutex_own(&trx_sys->mutex));
 
-	if (trx_id < trx_list_get_min_trx_id()) {
+	if (trx_id < trx_rw_min_trx_id_low()) {
 
-		return(FALSE);
-	}
+		trx = NULL;
+	} else if (trx_id >= trx_sys->max_trx_id) {
 
-	if (UNIV_UNLIKELY(trx_id >= trx_sys->max_trx_id)) {
+		/* There must be corruption: we let the caller handle the
+		diagnostic prints in this case. */
 
-		/* There must be corruption: we return TRUE because this
-		function is only called by lock_clust_rec_some_has_impl()
-		and row_vers_impl_x_locked_off_kernel() and they have
-		diagnostic prints in this case */
+		trx = NULL;
+		if (corrupt != NULL) {
+			*corrupt = TRUE;
+		}
+	} else {
+		trx = trx_get_rw_trx_by_id(trx_id);
 
-		return(TRUE);
+		if (trx != NULL
+		    && trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) {
+
+			trx = NULL;
+		}
 	}
 
-	trx = trx_get_on_id(trx_id);
-	if (trx && (trx->conc_state == TRX_ACTIVE
-		    || trx->conc_state == TRX_PREPARED)) {
+	return(trx);
+}
 
-		return(TRUE);
-	}
+/****************************************************************//**
+Checks if a rw transaction with the given id is active. If the caller is
+not holding lock_sys->mutex, the transaction may already have been
+committed.
+@return	transaction instance if active, or NULL;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
+UNIV_INLINE
+trx_t*
+trx_rw_is_active(
+/*=============*/
+	trx_id_t	trx_id,		/*!< in: trx id of the transaction */
+	ibool*		corrupt)	/*!< in: NULL or pointer to a flag
+					that will be set if corrupt */
+{
+	trx_t*		trx;
 
-	return(FALSE);
+	mutex_enter(&trx_sys->mutex);
+
+	trx = trx_rw_is_active_low(trx_id, corrupt);
+
+	mutex_exit(&trx_sys->mutex);
+
+	return(trx);
 }
 
 /*****************************************************************//**
@@ -370,9 +445,7 @@ trx_id_t
 trx_sys_get_new_trx_id(void)
 /*========================*/
 {
-	trx_id_t	id;
-
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(mutex_own(&trx_sys->mutex));
 
 	/* VERY important: after the database is started, max_trx_id value is
 	divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if
@@ -381,14 +454,59 @@ trx_sys_get_new_trx_id(void)
 	Thus trx id values will not overlap when the database is
 	repeatedly started! */
 
-	if ((ulint) trx_sys->max_trx_id % TRX_SYS_TRX_ID_WRITE_MARGIN == 0) {
+	if (!(trx_sys->max_trx_id % (trx_id_t) TRX_SYS_TRX_ID_WRITE_MARGIN)) {
 
 		trx_sys_flush_max_trx_id();
 	}
 
-	id = trx_sys->max_trx_id++;
+	return(trx_sys->max_trx_id++);
+}
 
-	return(id);
+/*****************************************************************//**
+Determines the maximum transaction id.
+@return maximum currently allocated trx id; will be stale after the
+next call to trx_sys_get_new_trx_id() */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_max_trx_id(void)
+/*========================*/
+{
+#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN
+	trx_id_t	max_trx_id;
+#endif
+
+	ut_ad(!mutex_own(&trx_sys->mutex));
+
+#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN
+	/* Avoid torn reads. */
+	mutex_enter(&trx_sys->mutex);
+	max_trx_id = trx_sys->max_trx_id;
+	mutex_exit(&trx_sys->mutex);
+	return(max_trx_id);
+#else
+	/* Perform a dirty read. Callers should be prepared for stale
+	values, and we know that the value fits in a machine word, so
+	that it will be read and written atomically. */
+	return(trx_sys->max_trx_id);
+#endif
 }
 
+/*****************************************************************//**
+Get the number of transaction in the system, independent of their state.
+@return count of transactions in trx_sys_t::rw_trx_list */
+UNIV_INLINE
+ulint
+trx_sys_get_n_rw_trx(void)
+/*======================*/
+{
+	ulint	n_trx;
+
+	mutex_enter(&trx_sys->mutex);
+
+	n_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+	mutex_exit(&trx_sys->mutex);
+
+	return(n_trx);
+}
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index 7572c766301..3e6cfc7d0da 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -31,30 +31,25 @@ Created 3/26/1996 Heikki Tuuri
 #include "dict0types.h"
 #ifndef UNIV_HOTBACKUP
 #include "lock0types.h"
+#include "log0log.h"
 #include "usr0types.h"
 #include "que0types.h"
 #include "mem0mem.h"
 #include "read0types.h"
 #include "trx0xa.h"
 #include "ut0vec.h"
+#include "fts0fts.h"
 
 /** Dummy session used currently in MySQL interface */
 extern sess_t*	trx_dummy_sess;
 
-/** Number of transactions currently allocated for MySQL: protected by
-the kernel mutex */
-extern ulint	trx_n_mysql_transactions;
-/** Number of transactions currently in the XA PREPARED state: protected by
-the kernel mutex */
-extern ulint	trx_n_prepared;
-
 /********************************************************************//**
 Releases the search latch if trx has reserved it. */
-UNIV_INTERN
+UNIV_INLINE
 void
 trx_search_latch_release_if_reserved(
 /*=================================*/
-	trx_t*	   trx); /*!< in: transaction */
+	trx_t*		trx); /*!< in: transaction */
 /******************************************************************//**
 Set detailed error message for the transaction. */
 UNIV_INTERN
@@ -80,15 +75,6 @@ const dict_index_t*
 trx_get_error_info(
 /*===============*/
 	const trx_t*	trx);	/*!< in: trx object */
-/****************************************************************//**
-Creates and initializes a transaction object.
-@return	own: the transaction */
-UNIV_INTERN
-trx_t*
-trx_create(
-/*=======*/
-	sess_t*	sess)	/*!< in: session */
-	__attribute__((nonnull));
 /********************************************************************//**
 Creates a transaction object for MySQL.
 @return	own: transaction object */
@@ -104,11 +90,11 @@ trx_t*
 trx_allocate_for_background(void);
 /*=============================*/
 /********************************************************************//**
-Frees a transaction object. */
+Frees a transaction object of a background operation of the master thread. */
 UNIV_INTERN
 void
-trx_free(
-/*=====*/
+trx_free_for_background(
+/*====================*/
 	trx_t*	trx);	/*!< in, own: trx object */
 /********************************************************************//**
 At shutdown, frees a transaction object that is in the PREPARED state. */
@@ -125,13 +111,6 @@ void
 trx_free_for_mysql(
 /*===============*/
 	trx_t*	trx);	/*!< in, own: trx object */
-/********************************************************************//**
-Frees a transaction object of a background operation of the master thread. */
-UNIV_INTERN
-void
-trx_free_for_background(
-/*====================*/
-	trx_t*	trx);	/*!< in, own: trx object */
 /****************************************************************//**
 Creates trx objects for transactions and initializes the trx list of
 trx_sys at database start. Rollback segment and undo log lists must
@@ -142,50 +121,26 @@ UNIV_INTERN
 void
 trx_lists_init_at_db_start(void);
 /*============================*/
-/****************************************************************//**
-Starts a new transaction.
-@return TRUE if success, FALSE if the rollback segment could not
-support this many transactions */
-UNIV_INTERN
-ibool
-trx_start(
-/*======*/
-	trx_t*	trx,	/*!< in: transaction */
-	ulint	rseg_id);/*!< in: rollback segment id; if ULINT_UNDEFINED
-			is passed, the system chooses the rollback segment
-			automatically in a round-robin fashion */
-/****************************************************************//**
-Starts a new transaction.
-@return	TRUE */
-UNIV_INTERN
-ibool
-trx_start_low(
-/*==========*/
-	trx_t*	trx,	/*!< in: transaction */
-	ulint	rseg_id);/*!< in: rollback segment id; if ULINT_UNDEFINED
-			is passed, the system chooses the rollback segment
-			automatically in a round-robin fashion */
 /*************************************************************//**
 Starts the transaction if it is not yet started. */
-UNIV_INLINE
+UNIV_INTERN
 void
-trx_start_if_not_started(
-/*=====================*/
+trx_start_if_not_started_xa(
+/*========================*/
 	trx_t*	trx);	/*!< in: transaction */
 /*************************************************************//**
-Starts the transaction if it is not yet started. Assumes we have reserved
-the kernel mutex! */
-UNIV_INLINE
+Starts the transaction if it is not yet started. */
+UNIV_INTERN
 void
-trx_start_if_not_started_low(
-/*=========================*/
+trx_start_if_not_started(
+/*=====================*/
 	trx_t*	trx);	/*!< in: transaction */
 /****************************************************************//**
 Commits a transaction. */
 UNIV_INTERN
 void
-trx_commit_off_kernel(
-/*==================*/
+trx_commit(
+/*=======*/
 	trx_t*	trx);	/*!< in: transaction */
 /****************************************************************//**
 Cleans up a transaction at database startup. The cleanup is needed if
@@ -203,15 +158,14 @@ UNIV_INTERN
 ulint
 trx_commit_for_mysql(
 /*=================*/
-	trx_t*	trx);	/*!< in: trx handle */
+	trx_t*	trx);	/*!< in/out: transaction */
 /**********************************************************************//**
-Does the transaction prepare for MySQL.
-@return	0 or error number */
+Does the transaction prepare for MySQL. */
 UNIV_INTERN
-ulint
+void
 trx_prepare_for_mysql(
 /*==================*/
-	trx_t*	trx);	/*!< in: trx handle */
+	trx_t*	trx);	/*!< in/out: trx handle */
 /**********************************************************************//**
 This function is used to find number of prepared transactions and
 their transaction objects for a recovery.
@@ -225,7 +179,9 @@ trx_recover_for_mysql(
 /*******************************************************************//**
 This function is used to find one X/Open XA distributed transaction
 which is in the prepared state
-@return	trx or NULL; on match, the trx->xid will be invalidated */
+@return	trx or NULL; on match, the trx->xid will be invalidated;
+note that the trx may have been committed, unless the caller is
+holding lock_sys->mutex */
 UNIV_INTERN
 trx_t *
 trx_get_trx_by_xid(
@@ -257,86 +213,20 @@ read_view_t*
 trx_assign_read_view(
 /*=================*/
 	trx_t*	trx);	/*!< in: active transaction */
-/***********************************************************//**
-The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
-the TRX_QUE_RUNNING state and releases query threads which were
-waiting for a lock in the wait_thrs list. */
-UNIV_INTERN
-void
-trx_end_lock_wait(
-/*==============*/
-	trx_t*	trx);	/*!< in: transaction */
 /****************************************************************//**
-Sends a signal to a trx object. */
+Prepares a transaction for commit/rollback. */
 UNIV_INTERN
 void
-trx_sig_send(
-/*=========*/
-	trx_t*		trx,		/*!< in: trx handle */
-	ulint		type,		/*!< in: signal type */
-	ulint		sender,		/*!< in: TRX_SIG_SELF or
-					TRX_SIG_OTHER_SESS */
-	que_thr_t*	receiver_thr,	/*!< in: query thread which wants the
-					reply, or NULL; if type is
-					TRX_SIG_END_WAIT, this must be NULL */
-	trx_savept_t*	savept,		/*!< in: possible rollback savepoint, or
-					NULL */
-	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread; if the parameter
-					is NULL, it is ignored */
-/****************************************************************//**
-Send the reply message when a signal in the queue of the trx has
-been handled. */
-UNIV_INTERN
-void
-trx_sig_reply(
-/*==========*/
-	trx_sig_t*	sig,		/*!< in: signal */
-	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread */
-/****************************************************************//**
-Removes the signal object from a trx signal queue. */
-UNIV_INTERN
-void
-trx_sig_remove(
-/*===========*/
-	trx_t*		trx,	/*!< in: trx handle */
-	trx_sig_t*	sig);	/*!< in, own: signal */
-/****************************************************************//**
-Starts handling of a trx signal. */
-UNIV_INTERN
-void
-trx_sig_start_handle(
-/*=================*/
-	trx_t*		trx,		/*!< in: trx handle */
-	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread */
-/****************************************************************//**
-Ends signal handling. If the session is in the error state, and
-trx->graph_before_signal_handling != NULL, returns control to the error
-handling routine of the graph (currently only returns the control to the
-graph root which then sends an error message to the client). */
-UNIV_INTERN
-void
-trx_end_signal_handling(
-/*====================*/
-	trx_t*	trx);	/*!< in: trx */
+trx_commit_or_rollback_prepare(
+/*===========================*/
+	trx_t*	trx);	/*!< in/out: transaction */
 /*********************************************************************//**
 Creates a commit command node struct.
 @return	own: commit node struct */
 UNIV_INTERN
 commit_node_t*
-commit_node_create(
-/*===============*/
+trx_commit_node_create(
+/*===================*/
 	mem_heap_t*	heap);	/*!< in: mem heap where created */
 /***********************************************************//**
 Performs an execution step for a commit type node in a query graph.
@@ -348,16 +238,53 @@ trx_commit_step(
 	que_thr_t*	thr);	/*!< in: query thread */
 
 /**********************************************************************//**
-Prints info about a transaction to the given file. The caller must own the
-kernel mutex. */
+Prints info about a transaction.
+Caller must hold trx_sys->mutex. */
+UNIV_INTERN
+void
+trx_print_low(
+/*==========*/
+	FILE*		f,
+			/*!< in: output stream */
+	const trx_t*	trx,
+			/*!< in: transaction */
+	ulint		max_query_len,
+			/*!< in: max query length to print,
+			or 0 to use the default max length */
+	ulint		n_lock_rec,
+			/*!< in: lock_number_of_rows_locked(&trx->lock) */
+	ulint		n_lock_struct,
+			/*!< in: length of trx->lock.trx_locks */
+	ulint		heap_size)
+			/*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Prints info about a transaction.
+The caller must hold lock_sys->mutex and trx_sys->mutex.
+When possible, use trx_print() instead. */
+UNIV_INTERN
+void
+trx_print_latched(
+/*==============*/
+	FILE*		f,		/*!< in: output stream */
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len)	/*!< in: max query length to print,
+					or 0 to use the default max length */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys->mutex and trx_sys->mutex. */
 UNIV_INTERN
 void
 trx_print(
 /*======*/
-	FILE*	f,		/*!< in: output stream */
-	trx_t*	trx,		/*!< in: transaction */
-	ulint	max_query_len);	/*!< in: max query length to print, or 0 to
-				   use the default max length */
+	FILE*		f,		/*!< in: output stream */
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len)	/*!< in: max query length to print,
+					or 0 to use the default max length */
+	__attribute__((nonnull));
 
 /** Type of data dictionary operation */
 typedef enum trx_dict_op {
@@ -395,6 +322,37 @@ trx_set_dict_operation(
 
 #ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
+Determines if a transaction is in the given state.
+The caller must hold trx_sys->mutex, or it must be the thread
+that is serving a running transaction.
+A running transaction must be in trx_sys->ro_trx_list or trx_sys->rw_trx_list
+unless it is a non-locking autocommit read only transaction, which is only
+in trx_sys->mysql_trx_list.
+@return	TRUE if trx->state == state */
+UNIV_INLINE
+ibool
+trx_state_eq(
+/*=========*/
+	const trx_t*	trx,	/*!< in: transaction */
+	trx_state_t	state)	/*!< in: state;
+				if state != TRX_STATE_NOT_STARTED
+				asserts that
+				trx->state != TRX_STATE_NOT_STARTED */
+	__attribute__((nonnull, warn_unused_result));
+# ifdef UNIV_DEBUG
+/**********************************************************************//**
+Asserts that a transaction has been started.
+The caller must hold trx_sys->mutex.
+@return TRUE if started */
+UNIV_INTERN
+ibool
+trx_assert_started(
+/*===============*/
+	const trx_t*	trx)	/*!< in: transaction */
+	__attribute__((nonnull, warn_unused_result));
+# endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
 Determines if the currently running transaction has been interrupted.
 @return	TRUE if interrupted */
 UNIV_INTERN
@@ -419,7 +377,7 @@ Calculates the "weight" of a transaction. The weight of one transaction
 is estimated as the number of altered rows + the number of locked rows.
 @param t	transaction
 @return		transaction weight */
-#define TRX_WEIGHT(t)	((t)->undo_no + UT_LIST_GET_LEN((t)->trx_locks))
+#define TRX_WEIGHT(t)	((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks))
 
 /*******************************************************************//**
 Compares the "weight" (or size) of two transactions. Transactions that
@@ -447,61 +405,322 @@ trx_get_que_state_str(
 /*==================*/
 	const trx_t*	trx);	/*!< in: transaction */
 
-/* Signal to a transaction */
-struct trx_sig_struct{
-	unsigned	type:3;		/*!< signal type */
-	unsigned	sender:1;	/*!< TRX_SIG_SELF or
-					TRX_SIG_OTHER_SESS */
-	que_thr_t*	receiver;	/*!< non-NULL if the sender of the signal
-					wants reply after the operation induced
-					by the signal is completed */
-	trx_savept_t	savept;		/*!< possible rollback savepoint */
-	UT_LIST_NODE_T(trx_sig_t)
-			signals;	/*!< queue of pending signals to the
-					transaction */
-	UT_LIST_NODE_T(trx_sig_t)
-			reply_signals;	/*!< list of signals for which the sender
-					transaction is waiting a reply */
+/*******************************************************************//**
+Transactions that aren't started by the MySQL server don't set
+the trx_t::mysql_thd field. For such transactions we set the lock
+wait timeout to 0 instead of the user configured value that comes
+from innodb_lock_wait_timeout via trx_t::mysql_thd.
+@param trx	transaction
+@return		lock wait timeout in seconds */
+#define trx_lock_wait_timeout_get(trx)					\
+	((trx)->mysql_thd != NULL					\
+	 ? thd_lock_wait_timeout((trx)->mysql_thd)			\
+	 : 0)
+
+/*******************************************************************//**
+Determine if the transaction is a non-locking autocommit select
+(implied read-only).
+@param t	transaction
+@return true	if non-locking autocommit select transaction. */
+#define trx_is_autocommit_non_locking(t)				\
+((t)->auto_commit && (t)->will_lock == 0)
+
+/*******************************************************************//**
+Determine if the transaction is a non-locking autocommit select
+with an explicit check for the read-only status.
+@param t	transaction
+@return true	if non-locking autocommit read-only transaction. */
+#define trx_is_ac_nl_ro(t)						\
+((t)->read_only && trx_is_autocommit_non_locking((t)))
+
+/*******************************************************************//**
+Assert that the transaction is in the trx_sys_t::rw_trx_list */
+#define assert_trx_in_rw_list(t) do {					\
+	ut_ad(!(t)->read_only);						\
+	assert_trx_in_list(t);						\
+} while (0)
+
+/*******************************************************************//**
+Assert that the transaction is either in trx_sys->ro_trx_list or
+trx_sys->rw_trx_list but not both and it cannot be an autocommit
+non-locking select */
+#define assert_trx_in_list(t) do {					\
+	ut_ad((t)->in_ro_trx_list == (t)->read_only);			\
+	ut_ad((t)->in_rw_trx_list == !(t)->read_only);			\
+	ut_ad(!trx_is_autocommit_non_locking((t)));			\
+	switch ((t)->state) {						\
+	case TRX_STATE_PREPARED:					\
+		ut_a(!(t)->read_only);					\
+		/* fall through */					\
+	case TRX_STATE_ACTIVE:						\
+	case TRX_STATE_COMMITTED_IN_MEMORY:				\
+		continue;						\
+	case TRX_STATE_NOT_STARTED:					\
+		break;							\
+	}								\
+	ut_error;							\
+} while (0)
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Assert that an autocommit non-locking slect cannot be in the
+ro_trx_list nor the rw_trx_list and that it is a read-only transaction.
+The tranasction must be in the mysql_trx_list. */
+# define assert_trx_nonlocking_or_in_list(t)				\
+	do {								\
+		if (trx_is_autocommit_non_locking(t)) {			\
+			trx_state_t	t_state = (t)->state;		\
+			ut_ad((t)->read_only);				\
+			ut_ad(!(t)->is_recovered);			\
+			ut_ad(!(t)->in_ro_trx_list);			\
+			ut_ad(!(t)->in_rw_trx_list);			\
+			ut_ad((t)->in_mysql_trx_list);			\
+			ut_ad(t_state == TRX_STATE_NOT_STARTED		\
+			      || t_state == TRX_STATE_ACTIVE);		\
+		} else {						\
+			assert_trx_in_list(t);				\
+		}							\
+	} while (0)
+#else /* UNIV_DEBUG */
+/*******************************************************************//**
+Assert that an autocommit non-locking slect cannot be in the
+ro_trx_list nor the rw_trx_list and that it is a read-only transaction.
+The tranasction must be in the mysql_trx_list. */
+# define assert_trx_nonlocking_or_in_list(trx) ((void)0)
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Latching protocol for trx_lock_t::que_state.  trx_lock_t::que_state
+captures the state of the query thread during the execution of a query.
+This is different from a transaction state. The query state of a transaction
+can be updated asynchronously by other threads.  The other threads can be
+system threads, like the timeout monitor thread or user threads executing
+other queries. Another thing to be mindful of is that there is a delay between
+when a query thread is put into LOCK_WAIT state and before it actually starts
+waiting.  Between these two events it is possible that the query thread is
+granted the lock it was waiting for, which implies that the state can be changed
+asynchronously.
+
+All these operations take place within the context of locking. Therefore state
+changes within the locking code must acquire both the lock mutex and the
+trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or
+trx->lock.wait_lock to non-NULL but when the lock wait ends it is sufficient
+to only acquire the trx->mutex.
+To query the state either of the mutexes is sufficient within the locking
+code and no mutex is required when the query thread is no longer waiting. */
+
+/** The locks and state of an active transaction. Protected by
+lock_sys->mutex, trx->mutex or both. */
+struct trx_lock_struct {
+	ulint		n_active_thrs;	/*!< number of active query threads */
+
+	trx_que_t	que_state;	/*!< valid when trx->state
+					== TRX_STATE_ACTIVE: TRX_QUE_RUNNING,
+					TRX_QUE_LOCK_WAIT, ... */
+
+	lock_t*		wait_lock;	/*!< if trx execution state is
+					TRX_QUE_LOCK_WAIT, this points to
+					the lock request, otherwise this is
+					NULL; set to non-NULL when holding
+					both trx->mutex and lock_sys->mutex;
+					set to NULL when holding
+					lock_sys->mutex; readers should
+					hold lock_sys->mutex, except when
+					they are holding trx->mutex and
+					wait_lock==NULL */
+	ib_uint64_t	deadlock_mark;	/*!< A mark field that is initialized
+					to and checked against lock_mark_counter
+					by lock_deadlock_recursive(). */
+	ibool		was_chosen_as_deadlock_victim;
+					/*!< when the transaction decides to
+					wait for a lock, it sets this to FALSE;
+					if another transaction chooses this
+					transaction as a victim in deadlock
+					resolution, it sets this to TRUE.
+					Protected by trx->mutex. */
+	time_t		wait_started;	/*!< lock wait started at this time,
+					protected only by lock_sys->mutex */
+
+	que_thr_t*	wait_thr;	/*!< query thread belonging to this
+					trx that is in QUE_THR_LOCK_WAIT
+					state. For threads suspended in a
+					lock wait, this is protected by
+					lock_sys->mutex. Otherwise, this may
+					only be modified by the thread that is
+					serving the running transaction. */
+
+	mem_heap_t*	lock_heap;	/*!< memory heap for trx_locks;
+					protected by lock_sys->mutex */
+
+	UT_LIST_BASE_NODE_T(lock_t)
+			trx_locks;	/*!< locks requested
+					by the transaction;
+					insertions are protected by trx->mutex
+					and lock_sys->mutex; removals are
+					protected by lock_sys->mutex */
+
+	ib_vector_t*	table_locks;	/*!< All table locks requested by this
+					transaction, including AUTOINC locks */
+
+	ibool		cancel;		/*!< TRUE if the transaction is being
+					rolled back either via deadlock
+					detection or due to lock timeout. The
+					caller has to acquire the trx_t::mutex
+					in order to cancel the locks. In
+					lock_trx_table_locks_remove() we
+					check for this cancel of a transaction's
+					locks and avoid reacquiring the trx
+					mutex to prevent recursive deadlocks.
+					Protected by both the lock sys mutex
+					and the trx_t::mutex. */
 };
 
 #define TRX_MAGIC_N	91118598
 
-/* The transaction handle; every session has a trx object which is freed only
-when the session is freed; in addition there may be session-less transactions
-rolling back after a database recovery */
+/** The transaction handle
+
+Normally, there is a 1:1 relationship between a transaction handle
+(trx) and a session (client connection). One session is associated
+with exactly one user transaction. There are some exceptions to this:
+
+* For DDL operations, a subtransaction is allocated that modifies the
+data dictionary tables. Lock waits and deadlocks are prevented by
+acquiring the dict_operation_lock before starting the subtransaction
+and releasing it after committing the subtransaction.
+
+* The purge system uses a special transaction that is not associated
+with any session.
+
+* If the system crashed or it was quickly shut down while there were
+transactions in the ACTIVE or PREPARED state, these transactions would
+no longer be associated with a session when the server is restarted.
+
+A session may be served by at most one thread at a time. The serving
+thread of a session might change in some MySQL implementations.
+Therefore we do not have os_thread_get_curr_id() assertions in the code.
+
+Normally, only the thread that is currently associated with a running
+transaction may access (read and modify) the trx object, and it may do
+so without holding any mutex. The following are exceptions to this:
+
+* trx_rollback_resurrected() may access resurrected (connectionless)
+transactions while the system is already processing new user
+transactions. The trx_sys->mutex prevents a race condition between it
+and lock_trx_release_locks() [invoked by trx_commit()].
+
+* trx_print_low() may access transactions not associated with the current
+thread. The caller must be holding trx_sys->mutex and lock_sys->mutex.
+
+* When a transaction handle is in the trx_sys->mysql_trx_list or
+trx_sys->trx_list, some of its fields must not be modified without
+holding trx_sys->mutex exclusively.
+
+* The locking code (in particular, lock_deadlock_recursive() and
+lock_rec_convert_impl_to_expl()) will access transactions associated
+to other connections. The locks of transactions are protected by
+lock_sys->mutex and sometimes by trx->mutex. */
 
 struct trx_struct{
 	ulint		magic_n;
 
+	mutex_t		mutex;		/*!< Mutex protecting the fields
+					state and lock
+					(except some fields of lock, which
+					are protected by lock_sys->mutex) */
+
+	/** State of the trx from the point of view of concurrency control
+	and the valid state transitions.
+
+	Possible states:
+
+	TRX_STATE_NOT_STARTED
+	TRX_STATE_ACTIVE
+	TRX_STATE_PREPARED
+	TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
+
+	Valid state transitions are:
+
+	Regular transactions:
+	* NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
+
+	Auto-commit non-locking read-only:
+	* NOT_STARTED -> ACTIVE -> NOT_STARTED
+
+	XA (2PC):
+	* NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
+
+	Recovered XA:
+	* NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
+
+	XA (2PC) (shutdown before ROLLBACK or COMMIT):
+	* NOT_STARTED -> PREPARED -> (freed)
+
+	Latching and various transaction lists membership rules:
+
+	XA (2PC) transactions are always treated as read-write and
+	non-autocommit.
+
+	Transitions to ACTIVE or NOT_STARTED occur when
+	!in_rw_trx_list and !in_ro_trx_list (no trx_sys->mutex needed).
+
+	Autocommit non-locking read-only transactions move between states
+	without holding any mutex. They are !in_rw_trx_list, !in_ro_trx_list.
+
+	When a transaction is NOT_STARTED, it can be in_mysql_trx_list if
+	it is a user transaction. It cannot be in ro_trx_list or rw_trx_list.
+
+	ACTIVE->PREPARED->COMMITTED is only possible when trx->in_rw_trx_list.
+	The transition ACTIVE->PREPARED is protected by trx_sys->mutex.
+
+	ACTIVE->COMMITTED is possible when the transaction is in
+	ro_trx_list or rw_trx_list.
+
+	Transitions to COMMITTED are protected by both lock_sys->mutex
+	and trx->mutex.
+
+	NOTE: Some of these state change constraints are an overkill,
+	currently only required for a consistent view for printing stats.
+	This unnecessarily adds a huge cost for the general case.
+
+	NOTE: In the future we should add read only transactions to the
+	ro_trx_list the first time they try to acquire a lock ie. by default
+	we treat all read-only transactions as non-locking.  */
+	trx_state_t	state;
+
+	trx_lock_t	lock;		/*!< Information about the transaction
+					locks and state. Protected by
+					trx->mutex or lock_sys->mutex
+					or both */
+	ulint		is_recovered;	/*!< 0=normal transaction,
+					1=recovered, must be rolled back,
+					protected by trx_sys->mutex when
+					trx->in_rw_trx_list holds */
+
 	/* These fields are not protected by any mutex. */
 	const char*	op_info;	/*!< English text describing the
 					current operation, or an empty
 					string */
-	ulint		conc_state;	/*!< state of the trx from the point
-					of view of concurrency control:
-					TRX_ACTIVE, TRX_COMMITTED_IN_MEMORY,
-					... */
+	ulint		isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */
+	ulint		check_foreigns;	/*!< normally TRUE, but if the user
+					wants to suppress foreign key checks,
+					(in table imports, for example) we
+					set this FALSE */
 	/*------------------------------*/
 	/* MySQL has a transaction coordinator to coordinate two phase
-       	commit between multiple storage engines and the binary log. When
-       	an engine participates in a transaction, it's responsible for
-       	registering itself using the trans_register_ha() API. */
+	commit between multiple storage engines and the binary log. When
+	an engine participates in a transaction, it's responsible for
+	registering itself using the trans_register_ha() API. */
 	unsigned	is_registered:1;/* This flag is set to 1 after the
-				       	transaction has been registered with
-				       	the coordinator using the XA API, and
-				       	is set to 0 after commit or rollback. */
+					transaction has been registered with
+					the coordinator using the XA API, and
+					is set to 0 after commit or rollback. */
 	unsigned	active_commit_ordered:1;/* 1 if owns prepare mutex, if
 					this is set to 1 then registered should
 					also be set to 1. This is used in the
 					XA code */
 	/*------------------------------*/
-	ulint		isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
-	ulint		check_foreigns;	/* normally TRUE, but if the user
-					wants to suppress foreign key checks,
-					(in table imports, for example) we
-					set this FALSE */
 	ulint		check_unique_secondary;
-					/* normally TRUE, but if the user
+					/*!< normally TRUE, but if the user
 					wants to speed up inserts by
 					suppressing unique key checks
 					for secondary indexes when we decide
@@ -518,104 +737,112 @@ struct trx_struct{
 					defer flush of the logs to disk
 					until after we release the
 					mutex. */
-	ulint		must_flush_log_later;/* this flag is set to TRUE in
-					trx_commit_off_kernel() if
-					flush_log_later was TRUE, and there
-					were modifications by the transaction;
-					in that case we must flush the log
-					in trx_commit_complete_for_mysql() */
+	ulint		must_flush_log_later;/*!< this flag is set to TRUE in
+					trx_commit() if flush_log_later was
+					TRUE, and there were modifications by
+					the transaction; in that case we must
+					flush the log in
+					trx_commit_complete_for_mysql() */
 	ulint		duplicates;	/*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
 	ulint		has_search_latch;
-					/* TRUE if this trx has latched the
+					/*!< TRUE if this trx has latched the
 					search system latch in S-mode */
-	ulint		deadlock_mark;	/*!< a mark field used in deadlock
-					checking algorithm.  */
+	ulint		search_latch_timeout;
+					/*!< If we notice that someone is
+					waiting for our S-lock on the search
+					latch to be released, we wait in
+					row0sel.cc for BTR_SEA_TIMEOUT new
+					searches until we try to keep
+					the search latch again over
+					calls from MySQL; this is intended
+					to reduce contention on the search
+					latch */
 	trx_dict_op_t	dict_operation;	/**< @see enum trx_dict_op */
 
 	/* Fields protected by the srv_conc_mutex. */
 	ulint		declared_to_be_inside_innodb;
-					/* this is TRUE if we have declared
+					/*!< this is TRUE if we have declared
 					this transaction in
 					srv_conc_enter_innodb to be inside the
 					InnoDB engine */
-
-	/* Fields protected by dict_operation_lock. The very latch
-	it is used to track. */
+	ulint		n_tickets_to_enter_innodb;
+					/*!< this can be > 0 only when
+					declared_to_... is TRUE; when we come
+					to srv_conc_innodb_enter, if the value
+					here is > 0, we decrement this by 1 */
 	ulint		dict_operation_lock_mode;
 					/*!< 0, RW_S_LATCH, or RW_X_LATCH:
 					the latch mode trx currently holds
-					on dict_operation_lock */
+					on dict_operation_lock. Protected
+					by dict_operation_lock. */
+
+	trx_id_t	no;		/*!< transaction serialization number:
+					max trx id shortly before the
+					transaction is moved to
+					COMMITTED_IN_MEMORY state.
+					Protected by trx_sys_t::mutex
+					when trx->in_rw_trx_list. Initially
+					set to IB_ULONGLONG_MAX. */
 
-	/* All the next fields are protected by the kernel mutex, except the
-	undo logs which are protected by undo_mutex */
-	ulint		is_purge;	/*!< 0=user transaction, 1=purge */
-	ulint		is_recovered;	/*!< 0=normal transaction,
-					1=recovered, must be rolled back */
-	ulint		que_state;	/*!< valid when conc_state
-					== TRX_ACTIVE: TRX_QUE_RUNNING,
-					TRX_QUE_LOCK_WAIT, ... */
-	ulint		handling_signals;/* this is TRUE as long as the trx
-					is handling signals */
 	time_t		start_time;	/*!< time the trx object was created
 					or the state last time became
-					TRX_ACTIVE */
+					TRX_STATE_ACTIVE */
 	trx_id_t	id;		/*!< transaction id */
 	XID		xid;		/*!< X/Open XA transaction
 					identification to identify a
 					transaction branch */
-	trx_id_t	no;		/*!< transaction serialization number ==
-					max trx id when the transaction is
-					moved to COMMITTED_IN_MEMORY state */
-	ib_uint64_t	commit_lsn;	/*!< lsn at the time of the commit */
+	lsn_t		commit_lsn;	/*!< lsn at the time of the commit */
 	table_id_t	table_id;	/*!< Table to drop iff dict_operation
 					is TRUE, or 0. */
 	/*------------------------------*/
 	void*		mysql_thd;	/*!< MySQL thread handle corresponding
 					to this trx, or NULL */
 	const char*	mysql_log_file_name;
-					/* if MySQL binlog is used, this field
+					/*!< if MySQL binlog is used, this field
 					contains a pointer to the latest file
 					name; this is NULL if binlog is not
 					used */
-	ib_int64_t	mysql_log_offset;/* if MySQL binlog is used, this field
-					contains the end offset of the binlog
-					entry */
+	ib_int64_t	mysql_log_offset;
+					/*!< if MySQL binlog is used, this
+					field contains the end offset of the
+					binlog entry */
 	/*------------------------------*/
-	ulint		n_mysql_tables_in_use; /* number of Innobase tables
+	ulint		n_mysql_tables_in_use; /*!< number of Innobase tables
 					used in the processing of the current
 					SQL statement in MySQL */
 	ulint		mysql_n_tables_locked;
-					/* how many tables the current SQL
+					/*!< how many tables the current SQL
 					statement uses, except those
 					in consistent read */
-	ulint		search_latch_timeout;
-					/* If we notice that someone is
-					waiting for our S-lock on the search
-					latch to be released, we wait in
-					row0sel.c for BTR_SEA_TIMEOUT new
-					searches until we try to keep
-					the search latch again over
-					calls from MySQL; this is intended
-					to reduce contention on the search
-					latch */
-	/*------------------------------*/
-	ulint		n_tickets_to_enter_innodb;
-					/* this can be > 0 only when
-					declared_to_... is TRUE; when we come
-					to srv_conc_innodb_enter, if the value
-					here is > 0, we decrement this by 1 */
 	/*------------------------------*/
 	UT_LIST_NODE_T(trx_t)
-			trx_list;	/*!< list of transactions */
+			trx_list;	/*!< list of transactions;
+					protected by trx_sys->mutex.
+					The same node is used for both
+					trx_sys_t::ro_trx_list and
+					trx_sys_t::rw_trx_list */
+#ifdef UNIV_DEBUG
+	/** The following two fields are mutually exclusive. */
+	/* @{ */
+
+	ibool		in_ro_trx_list;	/*!< TRUE if in trx_sys->ro_trx_list */
+	ibool		in_rw_trx_list;	/*!< TRUE if in trx_sys->rw_trx_list */
+	/* @} */
+#endif /* UNIV_DEBUG */
 	UT_LIST_NODE_T(trx_t)
 			mysql_trx_list;	/*!< list of transactions created for
-					MySQL */
+					MySQL; protected by trx_sys->mutex */
+#ifdef UNIV_DEBUG
+	ibool		in_mysql_trx_list;
+					/*!< TRUE if in
+					trx_sys->mysql_trx_list */
+#endif /* UNIV_DEBUG */
 	/*------------------------------*/
-	ulint		error_state;	/*!< 0 if no error, otherwise error
+	enum db_err	error_state;	/*!< 0 if no error, otherwise error
 					number; NOTE That ONLY the thread
 					doing the transaction is allowed to
 					set this field: this is NOT protected
-					by the kernel mutex */
+					by any mutex */
 	const dict_index_t*error_info;	/*!< if the error number indicates a
 					duplicate key error, a pointer to
 					the problematic index is stored here */
@@ -629,50 +856,11 @@ struct trx_struct{
 					survive over a transaction commit, if
 					it is a stored procedure with a COMMIT
 					WORK statement, for instance */
-	ulint		n_active_thrs;	/*!< number of active query threads */
-	que_t*		graph_before_signal_handling;
-					/* value of graph when signal handling
-					for this trx started: this is used to
-					return control to the original query
-					graph for error processing */
-	trx_sig_t	sig;		/*!< one signal object can be allocated
-					in this space, avoiding mem_alloc */
-	UT_LIST_BASE_NODE_T(trx_sig_t)
-			signals;	/*!< queue of processed or pending
-					signals to the trx */
-	UT_LIST_BASE_NODE_T(trx_sig_t)
-			reply_signals;	/*!< list of signals sent by the query
-					threads of this trx for which a thread
-					is waiting for a reply; if this trx is
-					killed, the reply requests in the list
-					must be canceled */
-	/*------------------------------*/
-	lock_t*		wait_lock;	/*!< if trx execution state is
-					TRX_QUE_LOCK_WAIT, this points to
-					the lock request, otherwise this is
-					NULL */
-	ibool		was_chosen_as_deadlock_victim;
-					/* when the transaction decides to wait
-					for a lock, it sets this to FALSE;
-					if another transaction chooses this
-					transaction as a victim in deadlock
-					resolution, it sets this to TRUE */
-	time_t		wait_started;	/*!< lock wait started at this time */
-	UT_LIST_BASE_NODE_T(que_thr_t)
-			wait_thrs;	/*!< query threads belonging to this
-					trx that are in the QUE_THR_LOCK_WAIT
-					state */
-	/*------------------------------*/
-	mem_heap_t*	lock_heap;	/*!< memory heap for the locks of the
-					transaction */
-	UT_LIST_BASE_NODE_T(lock_t)
-			trx_locks;	/*!< locks reserved by the transaction */
-	/*------------------------------*/
 	mem_heap_t*	global_read_view_heap;
-					/* memory heap for the global read
+					/*!< memory heap for the global read
 					view */
 	read_view_t*	global_read_view;
-					/* consistent read view associated
+					/*!< consistent read view associated
 					to a transaction or NULL */
 	read_view_t*	read_view;	/*!< consistent read view used in the
 					transaction or NULL, this read view
@@ -699,7 +887,7 @@ struct trx_struct{
 					the number of modified/inserted
 					rows in a transaction */
 	trx_savept_t	last_sql_stat_start;
-					/* undo_no when the last sql statement
+					/*!< undo_no when the last sql statement
 					was started: in case of an error, trx
 					is rolled back down to this undo
 					number; see note at undo_mutex! */
@@ -725,29 +913,32 @@ struct trx_struct{
 					transaction. Note that these are
 					also in the lock list trx_locks. This
 					vector needs to be freed explicitly
-					when the trx_t instance is desrtoyed */
+					when the trx instance is destroyed.
+					Protected by lock_sys->mutex. */
+	/*------------------------------*/
+	ibool		read_only;	/*!< TRUE if transaction is flagged
+					as a READ-ONLY transaction.
+					if !auto_commit || will_lock > 0
+					then it will added to the list
+					trx_sys_t::ro_trx_list. A read only
+					transaction will not be assigned an
+					UNDO log. Non-locking auto-commit
+					read-only transaction will not be on
+					either list. */
+	ibool		auto_commit;	/*!< TRUE if it is an autocommit */
+	ulint		will_lock;	/*!< Will acquire some locks. Increment
+					each time we determine that a lock will
+					be acquired by the MySQL layer. */
+	/*------------------------------*/
+	fts_trx_t*	fts_trx;	/* FTS information, or NULL if
+					transaction hasn't modified tables
+					with FTS indexes (yet). */
+	doc_id_t	fts_next_doc_id;/* The document id used for updates */
 	/*------------------------------*/
 	char detailed_error[256];	/*!< detailed error message for last
 					error, or empty. */
 };
 
-#define TRX_MAX_N_THREADS	32	/* maximum number of
-					concurrent threads running a
-					single operation of a
-					transaction, e.g., a parallel
-					query */
-/* Transaction concurrency states (trx->conc_state) */
-#define	TRX_NOT_STARTED		0
-#define	TRX_ACTIVE		1
-#define	TRX_COMMITTED_IN_MEMORY	2
-#define	TRX_PREPARED		3	/* Support for 2PC/XA */
-
-/* Transaction execution states when trx->conc_state == TRX_ACTIVE */
-#define TRX_QUE_RUNNING		0	/* transaction is running */
-#define TRX_QUE_LOCK_WAIT	1	/* transaction is waiting for a lock */
-#define TRX_QUE_ROLLING_BACK	2	/* transaction is rolling back */
-#define TRX_QUE_COMMITTING	3	/* transaction is committing */
-
 /* Transaction isolation levels (trx->isolation_level) */
 #define TRX_ISO_READ_UNCOMMITTED	0	/* dirty read: non-locking
 						SELECTs are performed so that
@@ -794,7 +985,6 @@ Multiple flags can be combined with bitwise OR. */
 #define TRX_SIG_TOTAL_ROLLBACK		1
 #define TRX_SIG_ROLLBACK_TO_SAVEPT	2
 #define TRX_SIG_COMMIT			3
-#define	TRX_SIG_ERROR_OCCURRED		4
 #define TRX_SIG_BREAK_EXECUTION		5
 
 /* Sender types of a signal */
@@ -820,6 +1010,36 @@ struct commit_node_struct{
 };
 
 
+/** Test if trx->mutex is owned. */
+#define trx_mutex_own(t) mutex_own(&t->mutex)
+
+/** Acquire the trx->mutex. */
+#define trx_mutex_enter(t) do {			\
+	mutex_enter(&t->mutex);			\
+} while (0)
+
+/** Release the trx->mutex. */
+#define trx_mutex_exit(t) do {			\
+	mutex_exit(&t->mutex);			\
+} while (0)
+
+/** @brief The latch protecting the adaptive search system
+
+This latch protects the
+(1) hash index;
+(2) columns of a record to which we have a pointer in the hash index;
+
+but does NOT protect:
+
+(3) next record offset field in a record;
+(4) next or previous records on the same page.
+
+Bear in mind (3) and (4) when using the hash index.
+*/
+extern rw_lock_t*	btr_search_latch_temp;
+
+/** The latch protecting the adaptive search system */
+#define btr_search_latch	(*btr_search_latch_temp)
 
 #ifndef UNIV_NONINL
 #include "trx0trx.ic"
diff --git a/storage/innobase/include/trx0trx.ic b/storage/innobase/include/trx0trx.ic
index 4a1d3bcde0b..ceeb121ab70 100644
--- a/storage/innobase/include/trx0trx.ic
+++ b/storage/innobase/include/trx0trx.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -23,37 +23,48 @@ The transaction
 Created 3/26/1996 Heikki Tuuri
 *******************************************************/
 
-/*************************************************************//**
-Starts the transaction if it is not yet started. */
-UNIV_INLINE
-void
-trx_start_if_not_started(
-/*=====================*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	ut_ad(trx->conc_state != TRX_COMMITTED_IN_MEMORY);
-
-	if (trx->conc_state == TRX_NOT_STARTED) {
-
-		trx_start(trx, ULINT_UNDEFINED);
-	}
-}
-
-/*************************************************************//**
-Starts the transaction if it is not yet started. Assumes we have reserved
-the kernel mutex! */
+/**********************************************************************//**
+Determines if a transaction is in the given state.
+The caller must hold trx_sys->mutex, or it must be the thread
+that is serving a running transaction.
+A running transaction must be in trx_sys->ro_trx_list or trx_sys->rw_trx_list
+unless it is a non-locking autocommit read only transaction, which is only
+in trx_sys->mysql_trx_list.
+@return	TRUE if trx->state == state */
 UNIV_INLINE
-void
-trx_start_if_not_started_low(
-/*=========================*/
-	trx_t*	trx)	/*!< in: transaction */
+ibool
+trx_state_eq(
+/*=========*/
+	const trx_t*	trx,	/*!< in: transaction */
+	trx_state_t	state)	/*!< in: state;
+				if state != TRX_STATE_NOT_STARTED
+				asserts that
+				trx->state != TRX_STATE_NOT_STARTED */
 {
-	ut_ad(trx->conc_state != TRX_COMMITTED_IN_MEMORY);
-
-	if (trx->conc_state == TRX_NOT_STARTED) {
-
-		trx_start_low(trx, ULINT_UNDEFINED);
+#ifdef UNIV_DEBUG
+	switch (trx->state) {
+	case TRX_STATE_PREPARED:
+		assert_trx_in_rw_list(trx);
+		return(trx->state == state);
+
+	case TRX_STATE_ACTIVE:
+		assert_trx_nonlocking_or_in_list(trx);
+		return(state == trx->state);
+
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		assert_trx_in_list(trx);
+		return(state == trx->state);
+
+	case TRX_STATE_NOT_STARTED:
+		/* This state is not allowed for running transactions. */
+		ut_a(state == TRX_STATE_NOT_STARTED);
+		ut_ad(!trx->in_rw_trx_list);
+		ut_ad(!trx->in_ro_trx_list);
+		return(state == trx->state);
 	}
+	ut_error;
+#endif /* UNIV_DEBUG */
+	return(trx->state == state);
 }
 
 /****************************************************************//**
@@ -79,7 +90,7 @@ trx_get_que_state_str(
 	const trx_t*	trx)	/*!< in: transaction */
 {
 	/* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */
-	switch (trx->que_state) {
+	switch (trx->lock.que_state) {
 	case TRX_QUE_RUNNING:
 		return("RUNNING");
 	case TRX_QUE_LOCK_WAIT:
@@ -113,7 +124,7 @@ trx_get_dict_operation(
 	}
 	ut_error;
 #endif /* UNIV_DEBUG */
-	return((enum trx_dict_op) UNIV_EXPECT(op, TRX_DICT_OP_NONE));
+	return((enum trx_dict_op) op);
 }
 /**********************************************************************//**
 Flag a transaction a dictionary operation. */
@@ -150,3 +161,19 @@ ok:
 
 	trx->dict_operation = op;
 }
+
+/********************************************************************//**
+Releases the search latch if trx has reserved it. */
+UNIV_INLINE
+void
+trx_search_latch_release_if_reserved(
+/*=================================*/
+	trx_t*	   trx) /*!< in: transaction */
+{
+	if (trx->has_search_latch) {
+		rw_lock_s_unlock(&btr_search_latch);
+
+		trx->has_search_latch = FALSE;
+	}
+}
+
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
index a4115b5aca7..650d5878e64 100644
--- a/storage/innobase/include/trx0types.h
+++ b/storage/innobase/include/trx0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -29,20 +29,37 @@ Created 3/26/1996 Heikki Tuuri
 #include "ut0byte.h"
 
 /** printf(3) format used for printing DB_TRX_ID and other system fields */
-#define TRX_ID_FMT		"%llX"
+#define TRX_ID_FMT		IB_ID_FMT
 
 /** maximum length that a formatted trx_t::id could take, not including
 the terminating NUL character. */
 #define TRX_ID_MAX_LEN		17
 
+/** Transaction execution states when trx->state == TRX_STATE_ACTIVE */
+enum trx_que_enum {
+	TRX_QUE_RUNNING,		/*!< transaction is running */
+	TRX_QUE_LOCK_WAIT,		/*!< transaction is waiting for
+					a lock */
+	TRX_QUE_ROLLING_BACK,		/*!< transaction is rolling back */
+	TRX_QUE_COMMITTING		/*!< transaction is committing */
+};
+
+/** Transaction states (trx_t::state) */
+enum trx_state_enum {
+	TRX_STATE_NOT_STARTED,
+	TRX_STATE_ACTIVE,
+	TRX_STATE_PREPARED,			/* Support for 2PC/XA */
+	TRX_STATE_COMMITTED_IN_MEMORY
+};
+
 /** Memory objects */
 /* @{ */
 /** Transaction */
 typedef struct trx_struct	trx_t;
+/** The locks and state of an active transaction */
+typedef struct trx_lock_struct	trx_lock_t;
 /** Transaction system */
 typedef struct trx_sys_struct	trx_sys_t;
-/** Doublewrite information */
-typedef struct trx_doublewrite_struct	trx_doublewrite_t;
 /** Signal */
 typedef struct trx_sig_struct	trx_sig_t;
 /** Rollback segment */
@@ -61,6 +78,10 @@ typedef struct roll_node_struct	roll_node_t;
 typedef struct commit_node_struct commit_node_t;
 /** SAVEPOINT command node in a query graph */
 typedef struct trx_named_savept_struct trx_named_savept_t;
+/** Transaction concurrency state */
+typedef enum trx_state_enum trx_state_t;
+/** Transaction query thread state */
+typedef enum trx_que_enum trx_que_t;
 /* @} */
 
 /** Rollback contexts */
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
index 4a1e40af505..ed2ce66bbb6 100644
--- a/storage/innobase/include/trx0undo.h
+++ b/storage/innobase/include/trx0undo.h
@@ -282,7 +282,7 @@ trx_undo_lists_init(
 Assigns an undo log for a transaction. A new undo log is created or a cached
 undo log reused.
 @return DB_SUCCESS if undo log assign successful, possible error codes
-are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE
+are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_READ_ONLY
 DB_OUT_OF_MEMORY */
 UNIV_INTERN
 ulint
@@ -412,8 +412,8 @@ struct trx_undo_struct{
 					TRX_UNDO_UPDATE */
 	ulint		state;		/*!< state of the corresponding undo log
 					segment */
-	ibool		del_marks;	/*!< relevant only in an update undo log:
-					this is TRUE if the transaction may
+	ibool		del_marks;	/*!< relevant only in an update undo
+					log: this is TRUE if the transaction may
 					have delete marked records, because of
 					a delete of a row or an update of an
 					indexed field; purge is then
@@ -435,8 +435,8 @@ struct trx_undo_struct{
 					in bytes, or 0 for uncompressed */
 	ulint		hdr_page_no;	/*!< page number of the header page in
 					the undo log */
-	ulint		hdr_offset;	/*!< header offset of the undo log on the
-					page */
+	ulint		hdr_offset;	/*!< header offset of the undo log on
+				       	the page */
 	ulint		last_page_no;	/*!< page number of the last page in the
 					undo log; this may differ from
 					top_page_no during a rollback */
@@ -582,8 +582,8 @@ quite a large overhead. */
 #define	TRX_UNDO_XA_XID		(TRX_UNDO_XA_BQUAL_LEN + 4)
 /*--------------------------------------------------------------*/
 #define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE)
-				/*!< Total size of the undo log header
-				with the XA XID */
+					/*!< Total size of the undo log header
+					with the XA XID */
 /* @} */
 
 #ifndef UNIV_NONINL
diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic
index b81330f7f8b..4b38e63297c 100644
--- a/storage/innobase/include/trx0undo.ic
+++ b/storage/innobase/include/trx0undo.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/trx0xa.h b/storage/innobase/include/trx0xa.h
index e0dd8a1af5b..7caddfb7ba4 100644
--- a/storage/innobase/include/trx0xa.h
+++ b/storage/innobase/include/trx0xa.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index ce7181e7bd5..60eb1fede91 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -1,8 +1,7 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
-Copyright (c) 2009, Sun Microsystems, Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
 Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -10,12 +9,6 @@ briefly in the InnoDB documentation. The contributions by Google are
 incorporated with their permission, and subject to the conditions contained in
 the file COPYING.Google.
 
-Portions of this file contain modifications contributed and copyrighted by
-Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
-are described briefly in the InnoDB documentation. The contributions by
-Sun Microsystems are incorporated with their permission, and subject to the
-conditions contained in the file COPYING.Sun_Microsystems.
-
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
 Foundation; version 2 of the License.
@@ -25,8 +18,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -50,8 +43,8 @@ Created 1/20/1994 Heikki Tuuri
 #define IB_TO_STR(s)	_IB_TO_STR(s)
 
 #define INNODB_VERSION_MAJOR	1
-#define INNODB_VERSION_MINOR	1
-#define INNODB_VERSION_BUGFIX	8
+#define INNODB_VERSION_MINOR	2
+#define INNODB_VERSION_BUGFIX	MYSQL_VERSION_PATCH
 
 /* The following is the InnoDB version as shown in
 SELECT plugin_version FROM information_schema.plugins;
@@ -68,8 +61,8 @@ component, i.e. we show M.N.P as M.N */
 	IB_TO_STR(INNODB_VERSION_BUGFIX)
 
 #define REFMAN "http://dev.mysql.com/doc/refman/"	\
-	IB_TO_STR(MYSQL_MAJOR_VERSION) "."		\
-	IB_TO_STR(MYSQL_MINOR_VERSION) "/en/"
+	IB_TO_STR(MYSQL_VERSION_MAJOR) "."		\
+	IB_TO_STR(MYSQL_VERSION_MINOR) "/en/"
 
 #ifdef MYSQL_DYNAMIC_PLUGIN
 /* In the dynamic plugin, redefine some externally visible symbols
@@ -103,10 +96,10 @@ if we are compiling on Windows. */
 # include <my_pthread.h>
 #endif /* UNIV_HOTBACKUP */
 
-/* Include <sys/stat.h> to get S_I... macros defined for os0file.c */
+/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */
 # include <sys/stat.h>
 # if !defined(__WIN__)
-#  include <sys/mman.h> /* mmap() for os0proc.c */
+#  include <sys/mman.h> /* mmap() for os0proc.cc */
 # endif
 
 /* Include the header file generated by GNU autoconf */
@@ -123,21 +116,21 @@ if we are compiling on Windows. */
 /* We only try to do explicit inlining of functions with gcc and
 Sun Studio */
 
-# if !defined(__GNUC__) && !(defined(__SUNPRO_C) || defined(__SUNPRO_CC))
-#  undef  UNIV_MUST_NOT_INLINE			/* Remove compiler warning */
-#  define UNIV_MUST_NOT_INLINE
-# endif
-
 # ifdef HAVE_PREAD
 #  define HAVE_PWRITE
 # endif
 
 #endif /* #if (defined(WIN32) || ... */
 
+#ifndef __WIN__
+#define __STDC_FORMAT_MACROS    /* Enable C99 printf format macros */
+#include <inttypes.h>
+#endif /* !__WIN__ */
+
 /* Following defines are to enable performance schema
 instrumentation in each of four InnoDB modules if
 HAVE_PSI_INTERFACE is defined. */
-#ifdef HAVE_PSI_INTERFACE
+#if defined HAVE_PSI_INTERFACE && !defined UNIV_HOTBACKUP
 # define UNIV_PFS_MUTEX
 # define UNIV_PFS_RWLOCK
 /* For I/O instrumentation, performance schema rely
@@ -149,8 +142,22 @@ resolved */
 #  define UNIV_PFS_IO
 # endif
 # define UNIV_PFS_THREAD
+
+/* There are mutexes/rwlocks that we want to exclude from
+instrumentation even if their corresponding performance schema
+define is set. And this PFS_NOT_INSTRUMENTED is used
+as the key value to identify those objects that would
+be excluded from instrumentation. */
+# define PFS_NOT_INSTRUMENTED		ULINT32_UNDEFINED
+
+# define PFS_IS_INSTRUMENTED(key)	((key) != PFS_NOT_INSTRUMENTED)
+
 #endif /* HAVE_PSI_INTERFACE */
 
+#ifdef __WIN__
+# define YY_NO_UNISTD_H 1
+#endif /* __WIN__ */
+
 /*			DEBUG VERSION CONTROL
 			===================== */
 
@@ -178,8 +185,6 @@ command. Not tested on Windows. */
 						debugging without UNIV_DEBUG */
 #define UNIV_BLOB_LIGHT_DEBUG			/* Enable off-page column
 						debugging without UNIV_DEBUG */
-#define UNIV_BLOB_NULL_DEBUG			/* Enable deep off-page
-						column debugging */
 #define UNIV_DEBUG				/* Enable ut_ad() assertions
 						and disable UNIV_INLINE */
 #define UNIV_DEBUG_LOCK_VALIDATE		/* Enable
@@ -200,6 +205,9 @@ assumes that no BLOBs survive server restart */
 #define UNIV_IBUF_COUNT_DEBUG			/* debug the insert buffer;
 this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES,
 and the insert buffer must be empty when the database is started */
+#define UNIV_PERF_DEBUG                         /* debug flag that enables
+                                                light weight performance
+                                                related stuff. */
 #define UNIV_SYNC_DEBUG				/* debug mutex and latch
 operations (very slow); also UNIV_DEBUG must be defined */
 #define UNIV_SEARCH_DEBUG			/* debug B-tree comparisons */
@@ -208,7 +216,7 @@ operations (very slow); also UNIV_DEBUG must be defined */
 #define UNIV_SEARCH_PERF_STAT			/* statistics for the
 						adaptive hash index */
 #define UNIV_SRV_PRINT_LATCH_WAITS		/* enable diagnostic output
-						in sync0sync.c */
+						in sync0sync.cc */
 #define UNIV_BTR_PRINT				/* enable functions for
 						printing B-trees */
 #define UNIV_ZIP_DEBUG				/* extensive consistency checks
@@ -218,6 +226,11 @@ operations (very slow); also UNIV_DEBUG must be defined */
 #define UNIV_AIO_DEBUG				/* prints info about
 						submitted and reaped AIO
 						requests to the log. */
+#define UNIV_STATS_DEBUG			/* prints various stats
+						related debug info from
+						dict0stats.c */
+#define FTS_INTERNAL_DIAG_PRINT                 /* FTS internal debugging
+                                                info output */
 #endif
 
 #define UNIV_BTR_DEBUG				/* check B-tree links */
@@ -240,7 +253,9 @@ easy way to get it to work. See http://bugs.mysql.com/bug.php?id=52263. */
 #else
 # define UNIV_INTERN
 #endif
-#if defined __GNUC__ && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 3)
+#if defined(INNODB_COMPILER_HINTS)      \
+    && defined __GNUC__                 \
+    && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 3)
 /** Starting with GCC 4.3, the "cold" attribute is used to inform the
 compiler that a function is unlikely executed.  The function is
 optimized for size rather than speed and on many targets it is placed
@@ -257,37 +272,31 @@ rarely invoked function for size instead for speed. */
 #ifndef UNIV_MUST_NOT_INLINE
 /* Definition for inline version */
 
-#ifdef __WIN__
-# define UNIV_INLINE	__inline
-#elif defined(__SUNPRO_CC) || defined(__SUNPRO_C)
-# define UNIV_INLINE static inline
-#else
-# define UNIV_INLINE static __inline__
-#endif
+#define UNIV_INLINE static inline
 
-#else
+#else /* !UNIV_MUST_NOT_INLINE */
 /* If we want to compile a noninlined version we use the following macro
 definitions: */
 
 #define UNIV_NONINL
 #define UNIV_INLINE	UNIV_INTERN
 
-#endif	/* UNIV_DEBUG */
+#endif /* !UNIV_MUST_NOT_INLINE */
 
 #ifdef _WIN32
 #define UNIV_WORD_SIZE		4
 #elif defined(_WIN64)
 #define UNIV_WORD_SIZE		8
 #else
-/* MySQL config.h generated by GNU autoconf will define SIZEOF_LONG in Posix */
+/** MySQL config.h generated by GNU autoconf will define SIZEOF_LONG in Posix */
 #define UNIV_WORD_SIZE		SIZEOF_LONG
 #endif
 
-/* The following alignment is used in memory allocations in memory heap
+/** The following alignment is used in memory allocations in memory heap
 management to ensure correct alignment for doubles etc. */
-#define UNIV_MEM_ALIGNMENT      8
+#define UNIV_MEM_ALIGNMENT	8
 
-/* The following alignment is used in aligning lints etc. */
+/** The following alignment is used in aligning lints etc. */
 #define UNIV_WORD_ALIGNMENT	UNIV_WORD_SIZE
 
 /*
@@ -295,31 +304,93 @@ management to ensure correct alignment for doubles etc. */
 			========================
 */
 
-/* The 2-logarithm of UNIV_PAGE_SIZE: */
-#define UNIV_PAGE_SIZE_SHIFT	14
-/* The universal page size of the database */
-#define UNIV_PAGE_SIZE		(1 << UNIV_PAGE_SIZE_SHIFT)
-
-/* Maximum number of parallel threads in a parallelized operation */
+/** There are currently two InnoDB file formats which are used to group
+features with similar restrictions and dependencies. Using an enum allows
+switch statements to give a compiler warning when a new one is introduced. */
+enum innodb_file_formats_enum {
+	/** Antelope File Format: InnoDB/MySQL up to 5.1.
+	This format includes REDUNDANT and COMPACT row formats */
+	UNIV_FORMAT_A		= 0,
+
+	/** Barracuda File Format: Introduced in InnoDB plugin for 5.1:
+	This format includes COMPRESSED and DYNAMIC row formats.  It
+	includes the ability to create secondary indexes from data that
+	is not on the clustered index page and the ability to store more
+	data off the clustered index page. */
+	UNIV_FORMAT_B		= 1
+};
+
+typedef enum innodb_file_formats_enum innodb_file_formats_t;
+
+/** Minimum supported file format */
+#define UNIV_FORMAT_MIN		UNIV_FORMAT_A
+
+/** Maximum supported file format */
+#define UNIV_FORMAT_MAX		UNIV_FORMAT_B
+
+/** The 2-logarithm of UNIV_PAGE_SIZE: */
+#define UNIV_PAGE_SIZE_SHIFT	srv_page_size_shift
+
+/** The universal page size of the database */
+#define UNIV_PAGE_SIZE		srv_page_size
+
+/** log2 of smallest compressed page size (1<<10 == 1024 bytes)
+Note: This must never change! */
+#define UNIV_ZIP_SIZE_SHIFT_MIN		10
+
+/** log2 of largest compressed page size (1<<14 == 16384 bytes).
+A compressed page directory entry reserves 14 bits for the start offset
+and 2 bits for flags. This limits the uncompressed page size to 16k.
+Even though a 16k uncompressed page can theoretically be compressed
+into a larger compressed page, it is not a useful feature so we will
+limit both with this same constant. */
+#define UNIV_ZIP_SIZE_SHIFT_MAX		14
+
+/* Define the Min, Max, Default page sizes. */
+/** Minimum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MIN	12
+/** Maximum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MAX	14
+/** Default Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_DEF	14
+/** Original 16k InnoDB Page Size Shift, in case the default changes */
+#define UNIV_PAGE_SIZE_SHIFT_ORIG	14
+
+/** Minimum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MIN	(1 << UNIV_PAGE_SIZE_SHIFT_MIN)
+/** Maximum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MAX	(1 << UNIV_PAGE_SIZE_SHIFT_MAX)
+/** Default page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_DEF	(1 << UNIV_PAGE_SIZE_SHIFT_DEF)
+/** Original 16k page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_ORIG	(1 << UNIV_PAGE_SIZE_SHIFT_ORIG)
+
+/** Smallest compressed page size */
+#define UNIV_ZIP_SIZE_MIN	(1 << UNIV_ZIP_SIZE_SHIFT_MIN)
+
+/** Largest compressed page size */
+#define UNIV_ZIP_SIZE_MAX	(1 << UNIV_ZIP_SIZE_SHIFT_MAX)
+
+/** Number of supported page sizes (The convention 'ssize' is used
+for 'log2 minus 9' or the number of shifts starting with 512.)
+This number varies depending on UNIV_PAGE_SIZE. */
+#define UNIV_PAGE_SSIZE_MAX					\
+	(UNIV_PAGE_SIZE_SHIFT - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
+
+/** Maximum number of parallel threads in a parallelized operation */
 #define UNIV_MAX_PARALLELISM	32
 
-/** This is the "mbmaxlen" for my_charset_filename (defined in
-strings/ctype-utf8.c), which is used to encode File and Database names. */
-#define FILENAME_CHARSET_MAXNAMLEN	5
-
-/** The maximum length of an encode table name in bytes.  The max
-table and database names are NAME_CHAR_LEN (64) characters. After the
-encoding, the max length would be NAME_CHAR_LEN (64) *
-FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a
-terminating '\0'. InnoDB can handle longer names internally */
-#define MAX_TABLE_NAME_LEN	320
-
+/** The maximum length of a table name. This is the MySQL limit and is
+defined in mysql_com.h like NAME_CHAR_LEN*SYSTEM_CHARSET_MBMAXLEN, the
+number does not include a terminating '\0'. InnoDB probably can handle
+longer names internally */
+#define MAX_TABLE_NAME_LEN	192
 
-/* The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is
+/** The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is
 the MySQL's NAME_LEN, see check_and_convert_db_name(). */
 #define MAX_DATABASE_NAME_LEN	MAX_TABLE_NAME_LEN
 
-/* MAX_FULL_NAME_LEN defines the full name path including the
+/** MAX_FULL_NAME_LEN defines the full name path including the
 database name and table name. In addition, 14 bytes is added for:
 	2 for surrounding quotes around table name
 	1 for the separating dot (.)
@@ -335,39 +406,41 @@ database name and table name. In addition, 14 bytes is added for:
 /* Note that inside MySQL 'byte' is defined as char on Linux! */
 #define byte			unsigned char
 
-/* Define an unsigned integer type that is exactly 32 bits. */
-
-#if SIZEOF_INT == 4
-typedef unsigned int		ib_uint32_t;
-#elif SIZEOF_LONG == 4
-typedef unsigned long		ib_uint32_t;
-#else
-#error "Neither int or long is 4 bytes"
-#endif
-
 /* Another basic type we use is unsigned long integer which should be equal to
 the word size of the machine, that is on a 32-bit platform 32 bits, and on a
 64-bit platform 64 bits. We also give the printf format for the type as a
 macro ULINTPF. */
 
+
+#ifdef __WIN__
+/* Use the integer types and formatting strings defined in Visual Studio. */
+# define UINT32PF	"%I32u"
+# define INT64PF	"%I64d"
+# define UINT64PF	"%I64u"
+typedef __int64 ib_int64_t;
+typedef unsigned __int64 ib_uint64_t;
+typedef unsigned __int32 ib_uint32_t;
+#else
+/* Use the integer types and formatting strings defined in the C99 standard. */
+# define UINT32PF	"%"PRIu32
+# define INT64PF	"%"PRId64
+# define UINT64PF	"%"PRIu64
+typedef int64_t ib_int64_t;
+typedef uint64_t ib_uint64_t;
+typedef uint32_t ib_uint32_t;
+# endif /* __WIN__ */
+
+# define IB_ID_FMT	UINT64PF
+
 #ifdef _WIN64
 typedef unsigned __int64	ulint;
-#define ULINTPF			"%I64u"
 typedef __int64			lint;
+# define ULINTPF		UINT64PF
 #else
 typedef unsigned long int	ulint;
-#define ULINTPF			"%lu"
 typedef long int		lint;
-#endif
-
-#ifdef __WIN__
-typedef __int64			ib_int64_t;
-typedef unsigned __int64	ib_uint64_t;
-#elif !defined(UNIV_HOTBACKUP)
-/* Note: longlong and ulonglong come from MySQL headers. */
-typedef longlong		ib_int64_t;
-typedef ulonglong		ib_uint64_t;
-#endif
+# define ULINTPF		"%lu"
+#endif /* _WIN64 */
 
 #ifndef UNIV_HOTBACKUP
 typedef unsigned long long int	ullint;
@@ -379,27 +452,33 @@ typedef unsigned long long int	ullint;
 #endif
 #endif
 
-/* The 'undefined' value for a ulint */
+/** The 'undefined' value for a ulint */
 #define ULINT_UNDEFINED		((ulint)(-1))
 
+#define ULONG_UNDEFINED		((ulong)(-1))
+
+/** The 'undefined' value for a ib_uint64_t */
+#define UINT64_UNDEFINED	((ib_uint64_t)(-1))
+
 /** The bitmask of 32-bit unsigned integer */
 #define ULINT32_MASK		0xFFFFFFFF
-/* The undefined 32-bit unsigned integer */
+/** The undefined 32-bit unsigned integer */
 #define	ULINT32_UNDEFINED	ULINT32_MASK
 
-/* Maximum value for a ulint */
+/** Maximum value for a ulint */
 #define ULINT_MAX		((ulint)(-2))
 
-/* Maximum value for ib_uint64_t */
+/** Maximum value for ib_uint64_t */
 #define IB_ULONGLONG_MAX	((ib_uint64_t) (~0ULL))
+#define IB_UINT64_MAX		IB_ULONGLONG_MAX
 
 /** The generic InnoDB system object identifier data type */
 typedef ib_uint64_t	ib_id_t;
 
-/* The 'undefined' value for a ullint */
+/** The 'undefined' value for a ullint */
 #define ULLINT_UNDEFINED        ((ullint)(-1))
 
-/* This 'ibool' type is used within Innobase. Remember that different included
+/** This 'ibool' type is used within Innobase. Remember that different included
 headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
 #define ibool			ulint
 
@@ -410,7 +489,7 @@ headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
 
 #endif
 
-/* The following number as the length of a logical field means that the field
+/** The following number as the length of a logical field means that the field
 has the SQL NULL as its value. NOTE that because we assume that the length
 of a field is a 32-bit integer when we store it, for example, to an undo log
 on disk, we must have also this number fit in 32 bits, also in 64-bit
@@ -418,15 +497,23 @@ computers! */
 
 #define UNIV_SQL_NULL ULINT32_UNDEFINED
 
-/* Lengths which are not UNIV_SQL_NULL, but bigger than the following
+/** Lengths which are not UNIV_SQL_NULL, but bigger than the following
 number indicate that a field contains a reference to an externally
 stored part of the field in the tablespace. The length field then
 contains the sum of the following flag and the locally stored len. */
 
-#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE)
+#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_MAX)
 
-/* Some macros to improve branch prediction and reduce cache misses */
 #if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER)
+#define HAVE_GCC_GT_2
+/* Tell the compiler that variable/function is unused. */
+# define UNIV_UNUSED    __attribute__ ((unused))
+#else
+# define UNIV_UNUSED
+#endif /* CHECK FOR GCC VER_GT_2 */
+
+/* Some macros to improve branch prediction and reduce cache misses */
+#if defined(INNODB_COMPILER_HINTS) && defined(HAVE_GCC_GT_2)
 /* Tell the compiler that 'expr' probably evaluates to 'constant'. */
 # define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant)
 /* Tell the compiler that a pointer is likely to be NULL */
@@ -437,19 +524,30 @@ it is read. */
 /* Minimize cache-miss latency by moving data at addr into a cache before
 it is read or written. */
 # define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3)
+
 /* Sun Studio includes sun_prefetch.h as of version 5.9 */
 #elif (defined(__SUNPRO_C) && __SUNPRO_C >= 0x590) \
        || (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x590)
+
 # include <sun_prefetch.h>
+
 #if __SUNPRO_C >= 0x550
 # undef UNIV_INTERN
 # define UNIV_INTERN __hidden
 #endif /* __SUNPRO_C >= 0x550 */
-/* Use sun_prefetch when compile with Sun Studio */
+
 # define UNIV_EXPECT(expr,value) (expr)
 # define UNIV_LIKELY_NULL(expr) (expr)
-# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr)
-# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
+
+# if defined(INNODB_COMPILER_HINTS)
+//# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr)
+#  define UNIV_PREFETCH_R(addr) ((void) 0)
+#  define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
+# else
+#  define UNIV_PREFETCH_R(addr) ((void) 0)
+#  define UNIV_PREFETCH_RW(addr) ((void) 0)
+# endif /* INNODB_COMPILER_HINTS */
+
 #else
 /* Dummy versions of the macros */
 # define UNIV_EXPECT(expr,value) (expr)
@@ -457,6 +555,7 @@ it is read or written. */
 # define UNIV_PREFETCH_R(addr) ((void) 0)
 # define UNIV_PREFETCH_RW(addr) ((void) 0)
 #endif
+
 /* Tell the compiler that cond is likely to hold */
 #define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE)
 /* Tell the compiler that cond is unlikely to hold */
@@ -487,7 +586,7 @@ typedef void* os_thread_ret_t;
 # define UNIV_MEM_INVALID(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
 # define UNIV_MEM_FREE(addr, size) VALGRIND_MAKE_MEM_NOACCESS(addr, size)
 # define UNIV_MEM_ALLOC(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
-# define UNIV_MEM_DESC(addr, size, b) VALGRIND_CREATE_BLOCK(addr, size, b)
+# define UNIV_MEM_DESC(addr, size) VALGRIND_CREATE_BLOCK(addr, size, #addr)
 # define UNIV_MEM_UNDESC(b) VALGRIND_DISCARD(b)
 # define UNIV_MEM_ASSERT_RW(addr, size) do {				\
 	const void* _p = (const void*) (ulint)				\
@@ -512,7 +611,7 @@ typedef void* os_thread_ret_t;
 # define UNIV_MEM_INVALID(addr, size) do {} while(0)
 # define UNIV_MEM_FREE(addr, size) do {} while(0)
 # define UNIV_MEM_ALLOC(addr, size) do {} while(0)
-# define UNIV_MEM_DESC(addr, size, b) do {} while(0)
+# define UNIV_MEM_DESC(addr, size) do {} while(0)
 # define UNIV_MEM_UNDESC(b) do {} while(0)
 # define UNIV_MEM_ASSERT_RW(addr, size) do {} while(0)
 # define UNIV_MEM_ASSERT_W(addr, size) do {} while(0)
@@ -526,4 +625,7 @@ typedef void* os_thread_ret_t;
 	UNIV_MEM_ALLOC(addr, size);			\
 } while (0)
 
+extern ulong	srv_page_size_shift;
+extern ulong	srv_page_size;
+
 #endif
diff --git a/storage/innobase/include/usr0sess.h b/storage/innobase/include/usr0sess.h
index 2c288f7d455..4a0710c5060 100644
--- a/storage/innobase/include/usr0sess.h
+++ b/storage/innobase/include/usr0sess.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -51,7 +51,8 @@ sess_close(
 /*=======*/
 	sess_t*		sess);		/* in, own: session object */
 
-/* The session handle. All fields are protected by the kernel mutex */
+/* The session handle. This data structure is only used by purge and is
+not really necessary. We should get rid of it. */
 struct sess_struct{
 	ulint		state;		/*!< state of the session */
 	trx_t*		trx;		/*!< transaction object permanently
diff --git a/storage/innobase/include/usr0sess.ic b/storage/innobase/include/usr0sess.ic
index 35a75d75acc..284e59537fe 100644
--- a/storage/innobase/include/usr0sess.ic
+++ b/storage/innobase/include/usr0sess.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/usr0types.h b/storage/innobase/include/usr0types.h
index 6cc6f015613..403ad0223a8 100644
--- a/storage/innobase/include/usr0types.h
+++ b/storage/innobase/include/usr0types.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/ut0bh.h b/storage/innobase/include/ut0bh.h
index 1b211390283..4c029e256a9 100644
--- a/storage/innobase/include/ut0bh.h
+++ b/storage/innobase/include/ut0bh.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/ut0bh.ic b/storage/innobase/include/ut0bh.ic
index afbe58e7e3b..a604237665d 100644
--- a/storage/innobase/include/ut0bh.ic
+++ b/storage/innobase/include/ut0bh.ic
@@ -1,5 +1,6 @@
 /***************************************************************************//**
-Copyright (c) 2011, Oracle Corpn. All Rights Reserved.
+
+Copyright (c) 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -10,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -122,4 +123,3 @@ ib_bh_last(
 		: ib_bh_get(ib_bh, ib_bh_size(ib_bh) - 1));
 }
 
-
diff --git a/storage/innobase/include/ut0byte.h b/storage/innobase/include/ut0byte.h
index b99d7175b94..5bdd553ca80 100644
--- a/storage/innobase/include/ut0byte.h
+++ b/storage/innobase/include/ut0byte.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -32,7 +32,7 @@ Created 1/20/1994 Heikki Tuuri
 
 /*******************************************************//**
 Creates a 64-bit integer out of two 32-bit integers.
-@return	created dulint */
+@return	created integer */
 UNIV_INLINE
 ib_uint64_t
 ut_ull_create(
diff --git a/storage/innobase/include/ut0byte.ic b/storage/innobase/include/ut0byte.ic
index e7908efa41a..873d98c727e 100644
--- a/storage/innobase/include/ut0byte.ic
+++ b/storage/innobase/include/ut0byte.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -25,7 +25,7 @@ Created 5/30/1994 Heikki Tuuri
 
 /*******************************************************//**
 Creates a 64-bit integer out of two 32-bit integers.
-@return	created dulint */
+@return	created integer */
 UNIV_INLINE
 ib_uint64_t
 ut_ull_create(
@@ -90,7 +90,7 @@ ut_align(
 
 	ut_ad(sizeof(void*) == sizeof(ulint));
 
-	return((void*)((((ulint)ptr) + align_no - 1) & ~(align_no - 1)));
+	return((void*)((((ulint) ptr) + align_no - 1) & ~(align_no - 1)));
 }
 
 /*********************************************************//**
@@ -110,7 +110,7 @@ ut_align_down(
 
 	ut_ad(sizeof(void*) == sizeof(ulint));
 
-	return((void*)((((ulint)ptr)) & ~(align_no - 1)));
+	return((void*)((((ulint) ptr)) & ~(align_no - 1)));
 }
 
 /*********************************************************//**
@@ -130,7 +130,7 @@ ut_align_offset(
 
 	ut_ad(sizeof(void*) == sizeof(ulint));
 
-	return(((ulint)ptr) & (align_no - 1));
+	return(((ulint) ptr) & (align_no - 1));
 }
 
 /*****************************************************************//**
diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h
new file mode 100644
index 00000000000..456648001aa
--- /dev/null
+++ b/storage/innobase/include/ut0crc32.h
@@ -0,0 +1,48 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0crc32.h
+CRC32 implementation
+
+Created Aug 10, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef ut0crc32_h
+#define ut0crc32_h
+
+#include "univ.i"
+
+/********************************************************************//**
+Initializes the data structures used by ut_crc32(). Does not do any
+allocations, would not hurt if called twice, but would be pointless. */
+UNIV_INTERN
+void
+ut_crc32_init();
+/*===========*/
+
+/********************************************************************//**
+Calculates CRC32.
+@param ptr	- data over which to calculate CRC32.
+@param len	- data length in bytes.
+@return CRC32 (CRC-32C, using the GF(2) primitive polynomial 0x11EDC6F41,
+or 0x1EDC6F41 without the high-order bit) */
+typedef ib_uint32_t (*ib_ut_crc32_t)(const byte* ptr, ulint len);
+
+extern ib_ut_crc32_t	ut_crc32;
+#endif /* ut0crc32_h */
diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h
index ce6dcb63049..e9ad62fb81b 100644
--- a/storage/innobase/include/ut0dbg.h
+++ b/storage/innobase/include/ut0dbg.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -26,6 +26,12 @@ Created 1/30/1994 Heikki Tuuri
 #ifndef ut0dbg_h
 #define ut0dbg_h
 
+#ifdef UNIV_INNOCHECKSUM
+#define ut_a		assert
+#define ut_ad		assert
+#define ut_error	assert(0)
+#else /* !UNIV_INNOCHECKSUM */
+
 #include "univ.i"
 #include <stdlib.h>
 #include "os0thread.h"
@@ -163,4 +169,6 @@ speedo_show(
 
 #endif /* UNIV_COMPILE_TEST_FUNCS */
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 #endif
diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h
index ec67f4e2a0f..57d6bdc33a6 100644
--- a/storage/innobase/include/ut0list.h
+++ b/storage/innobase/include/ut0list.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -142,6 +142,15 @@ ib_list_get_last(
 /*=============*/
 	ib_list_t*	list);	/*!< in: list */
 
+/********************************************************************
+Check if list is empty. */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+					/* out: TRUE if empty else  */
+	const ib_list_t*	list);	/* in: list */
+
 /* List. */
 struct ib_list_struct {
 	ib_list_node_t*		first;		/*!< first node */
diff --git a/storage/innobase/include/ut0list.ic b/storage/innobase/include/ut0list.ic
index eb5c62796e8..d9dcb2eac99 100644
--- a/storage/innobase/include/ut0list.ic
+++ b/storage/innobase/include/ut0list.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -46,3 +46,15 @@ ib_list_get_last(
 {
 	return(list->last);
 }
+
+/********************************************************************
+Check if list is empty. */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+					/* out: TRUE if empty else FALSE */
+	const ib_list_t*	list)	/* in: list */
+{
+	return(!(list->first || list->last));
+}
diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h
index bb295ea1b22..51c89f15a77 100644
--- a/storage/innobase/include/ut0lst.h
+++ b/storage/innobase/include/ut0lst.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -28,10 +28,17 @@ Created 9/10/1995 Heikki Tuuri
 
 #include "univ.i"
 
+/*******************************************************************//**
+Return offset of F in POD T.
+@param T	- POD pointer
+@param F	- Field in T */
+#define IB_OFFSETOF(T, F)						\
+	(reinterpret_cast<byte*>(&(T)->F) - reinterpret_cast<byte*>(T))
+
 /* This module implements the two-way linear list which should be used
 if a list is used in the database. Note that a single struct may belong
 to two or more lists, provided that the list are given different names.
-An example of the usage of the lists can be found in fil0fil.c. */
+An example of the usage of the lists can be found in fil0fil.cc. */
 
 /*******************************************************************//**
 This macro expands to the unnamed type definition of a struct which acts
@@ -39,12 +46,16 @@ as the two-way list base node. The base node contains pointers
 to both ends of the list and a count of nodes in the list (excluding
 the base node from the count).
 @param TYPE	the name of the list node data type */
-#define UT_LIST_BASE_NODE_T(TYPE)\
-struct {\
-	ulint	count;	/*!< count of nodes in list */\
-	TYPE *	start;	/*!< pointer to list start, NULL if empty */\
-	TYPE *	end;	/*!< pointer to list end, NULL if empty */\
-}\
+template <typename TYPE>
+struct ut_list_base {
+	typedef TYPE elem_type;
+
+	ulint	count;	/*!< count of nodes in list */
+	TYPE*	start;	/*!< pointer to list start, NULL if empty */
+	TYPE*	end;	/*!< pointer to list end, NULL if empty */
+};
+
+#define UT_LIST_BASE_NODE_T(TYPE)	ut_list_base<TYPE>
 
 /*******************************************************************//**
 This macro expands to the unnamed type definition of a struct which
@@ -62,12 +73,29 @@ struct LRU_node_struct {
 The example implements an LRU list of name LRU_list. Its nodes are of type
 LRU_node_t. */
 
-#define UT_LIST_NODE_T(TYPE)\
-struct {\
-	TYPE *	prev;	/*!< pointer to the previous node,\
-			NULL if start of list */\
-	TYPE *	next;	/*!< pointer to next node, NULL if end of list */\
-}\
+template <typename TYPE>
+struct ut_list_node {
+	TYPE* 	prev;	/*!< pointer to the previous node,
+			NULL if start of list */
+	TYPE* 	next;	/*!< pointer to next node, NULL if end of list */
+};
+
+#define UT_LIST_NODE_T(TYPE)	ut_list_node<TYPE>
+
+/*******************************************************************//**
+Get the list node at offset.
+@param elem	- list element
+@param offset	- offset within element.
+@return reference to list node. */
+template <typename Type>
+ut_list_node<Type>&
+ut_elem_get_node(Type&	elem, size_t offset)
+{
+	ut_a(offset < sizeof(elem));
+
+	return(*reinterpret_cast<ut_list_node<Type>*>(
+		reinterpret_cast<byte*>(&elem) + offset));
+}
 
 /*******************************************************************//**
 Initializes the base node of a two-way list.
@@ -82,108 +110,197 @@ Initializes the base node of a two-way list.
 
 /*******************************************************************//**
 Adds the node as the first element in a two-way linked list.
+@param list	the base node (not a pointer to it)
+@param elem	the element to add
+@param offset	offset of list node in elem. */
+template <typename List, typename Type>
+void
+ut_list_prepend(
+	List&		list,
+	Type&		elem,
+	size_t		offset)
+{
+	ut_list_node<Type>&	elem_node = ut_elem_get_node(elem, offset);
+
+ 	elem_node.prev = 0;
+ 	elem_node.next = list.start;
+
+	if (list.start != 0) {
+		ut_list_node<Type>&	base_node =
+			ut_elem_get_node(*list.start, offset);
+
+		ut_ad(list.start != &elem);
+
+		base_node.prev = &elem;
+	}
+
+	list.start = &elem;
+
+	if (list.end == 0) {
+		list.end = &elem;
+	}
+
+	++list.count;
+}
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
 @param NAME	list name
-@param BASE	the base node (not a pointer to it)
-@param N	pointer to the node to be added to the list.
-*/
-#define UT_LIST_ADD_FIRST(NAME, BASE, N)\
-{\
-	ut_ad(N);\
-	((BASE).count)++;\
-	((N)->NAME).next = (BASE).start;\
-	((N)->NAME).prev = NULL;\
-	if (UNIV_LIKELY((BASE).start != NULL)) {\
-		ut_ad((BASE).start != (N));\
-		(((BASE).start)->NAME).prev = (N);\
-	}\
-	(BASE).start = (N);\
-	if (UNIV_UNLIKELY((BASE).end == NULL)) {\
-		(BASE).end = (N);\
-	}\
-}\
+@param LIST	the base node (not a pointer to it)
+@param ELEM	the element to add */
+#define UT_LIST_ADD_FIRST(NAME, LIST, ELEM)	\
+	ut_list_prepend(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME))
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param list	list
+@param elem	the element to add
+@param offset	offset of list node in elem */
+template <typename List, typename Type>
+void
+ut_list_append(
+	List&		list,
+	Type&		elem,
+	size_t		offset)
+{
+	ut_list_node<Type>&	elem_node = ut_elem_get_node(elem, offset);
+
+	elem_node.next = 0;
+	elem_node.prev = list.end;
+
+	if (list.end != 0) {
+		ut_list_node<Type>&	base_node =
+			ut_elem_get_node(*list.end, offset);
+
+		ut_ad(list.end != &elem);
+
+		base_node.next = &elem;
+	}
+
+	list.end = &elem;
+
+	if (list.start == 0) {
+		list.start = &elem;
+	}
+
+	++list.count;
+}
 
 /*******************************************************************//**
 Adds the node as the last element in a two-way linked list.
 @param NAME	list name
-@param BASE	the base node (not a pointer to it)
-@param N	pointer to the node to be added to the list
-*/
-#define UT_LIST_ADD_LAST(NAME, BASE, N)\
-{\
-	ut_ad(N != NULL);\
-	((BASE).count)++;\
-	((N)->NAME).prev = (BASE).end;\
-	((N)->NAME).next = NULL;\
-	if ((BASE).end != NULL) {\
-		ut_ad((BASE).end != (N));\
-		(((BASE).end)->NAME).next = (N);\
-	}\
-	(BASE).end = (N);\
-	if ((BASE).start == NULL) {\
-		(BASE).start = (N);\
-	}\
-}\
+@param LIST	list
+@param ELEM	the element to add */
+#define UT_LIST_ADD_LAST(NAME, LIST, ELEM)\
+	ut_list_append(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME))
 
 /*******************************************************************//**
-Inserts a NODE2 after NODE1 in a list.
+Inserts a ELEM2 after ELEM1 in a list.
+@param list	the base node
+@param elem1	node after which ELEM2 is inserted
+@param elem2	node being inserted after NODE1
+@param offset	offset of list node in elem1 and elem2 */
+template <typename List, typename Type>
+void
+ut_list_insert(
+	List&		list,
+	Type&		elem1,
+	Type&		elem2,
+	size_t		offset)
+{
+	ut_ad(&elem1 != &elem2);
+
+	ut_list_node<Type>&	elem1_node = ut_elem_get_node(elem1, offset);
+	ut_list_node<Type>&	elem2_node = ut_elem_get_node(elem2, offset);
+
+	elem2_node.prev = &elem1;
+	elem2_node.next = elem1_node.next;
+
+	if (elem1_node.next != NULL) {
+		ut_list_node<Type>&	next_node =
+			ut_elem_get_node(*elem1_node.next, offset);
+
+		next_node.prev = &elem2;
+	}
+
+	elem1_node.next = &elem2;
+
+	if (list.end == &elem1) {
+		list.end = &elem2;
+	}
+
+	++list.count;
+}
+
+/*******************************************************************//**
+Inserts a ELEM2 after ELEM1 in a list.
 @param NAME	list name
-@param BASE	the base node (not a pointer to it)
-@param NODE1	pointer to node after which NODE2 is inserted
-@param NODE2	pointer to node being inserted after NODE1
-*/
-#define UT_LIST_INSERT_AFTER(NAME, BASE, NODE1, NODE2)\
-{\
-	ut_ad(NODE1);\
-	ut_ad(NODE2);\
-	ut_ad((NODE1) != (NODE2));\
-	((BASE).count)++;\
-	((NODE2)->NAME).prev = (NODE1);\
-	((NODE2)->NAME).next = ((NODE1)->NAME).next;\
-	if (((NODE1)->NAME).next != NULL) {\
-		((((NODE1)->NAME).next)->NAME).prev = (NODE2);\
-	}\
-	((NODE1)->NAME).next = (NODE2);\
-	if ((BASE).end == (NODE1)) {\
-		(BASE).end = (NODE2);\
-	}\
-}\
+@param LIST	the base node
+@param ELEM1	node after which ELEM2 is inserted
+@param ELEM2	node being inserted after ELEM1 */
+#define UT_LIST_INSERT_AFTER(NAME, LIST, ELEM1, ELEM2)\
+	ut_list_insert(LIST, *ELEM1, *ELEM2, IB_OFFSETOF(ELEM1, NAME))
 
 #ifdef UNIV_LIST_DEBUG
 /** Invalidate the pointers in a list node.
 @param NAME	list name
 @param N	pointer to the node that was removed */
-# define UT_LIST_REMOVE_CLEAR(NAME, N)		\
-((N)->NAME.prev = (N)->NAME.next = (void*) -1)
+# define UT_LIST_REMOVE_CLEAR(N)					\
+	(N).next = (Type*) -1;						\
+	(N).prev = (N).next
 #else
 /** Invalidate the pointers in a list node.
 @param NAME	list name
 @param N	pointer to the node that was removed */
-# define UT_LIST_REMOVE_CLEAR(NAME, N) while (0)
-#endif
+# define UT_LIST_REMOVE_CLEAR(N)
+#endif /* UNIV_LIST_DEBUG */
 
 /*******************************************************************//**
 Removes a node from a two-way linked list.
-@param NAME	list name
-@param BASE	the base node (not a pointer to it)
-@param N	pointer to the node to be removed from the list
-*/
-#define UT_LIST_REMOVE(NAME, BASE, N)					\
-do {									\
-	ut_ad(N);							\
-	ut_a((BASE).count > 0);						\
-	((BASE).count)--;						\
-	if (((N)->NAME).next != NULL) {					\
-		((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev;	\
-	} else {							\
-		(BASE).end = ((N)->NAME).prev;				\
-	}								\
-	if (((N)->NAME).prev != NULL) {					\
-		((((N)->NAME).prev)->NAME).next = ((N)->NAME).next;	\
-	} else {							\
-		(BASE).start = ((N)->NAME).next;			\
-	}								\
-	UT_LIST_REMOVE_CLEAR(NAME, N);					\
-} while (0)
+@param list	the base node (not a pointer to it)
+@param elem	node to be removed from the list
+@param offset	offset of list node within elem */
+template <typename List, typename Type>
+void
+ut_list_remove(
+	List&		list,
+ 	Type&		elem,
+	size_t		offset)
+{
+	ut_list_node<Type>&	elem_node = ut_elem_get_node(elem, offset);
+
+	ut_a(list.count > 0);
+
+	if (elem_node.next != NULL) {
+		ut_list_node<Type>&	next_node =
+			ut_elem_get_node(*elem_node.next, offset);
+
+		next_node.prev = elem_node.prev;
+	} else {
+		list.end = elem_node.prev;
+	}
+
+	if (elem_node.prev != NULL) {
+		ut_list_node<Type>&	prev_node =
+			ut_elem_get_node(*elem_node.prev, offset);
+
+		prev_node.next = elem_node.next;
+	} else {
+		list.start = elem_node.next;
+	}
+
+	UT_LIST_REMOVE_CLEAR(elem_node);
+
+	--list.count;
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+  aram NAME	list name
+@param LIST	the base node (not a pointer to it)
+@param ELEM	node to be removed from the list */
+#define UT_LIST_REMOVE(NAME, LIST, ELEM)				\
+	ut_list_remove(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME))
 
 /********************************************************************//**
 Gets the next node in a two-way list.
@@ -223,39 +340,70 @@ Gets the last node in a two-way list.
 #define UT_LIST_GET_LAST(BASE)\
 	(BASE).end
 
+struct	NullValidate { void operator()(const void* elem) { } };
+
+/********************************************************************//**
+Iterate over all the elements and call the functor for each element.
+@param list	base node (not a pointer to it)
+@param functor	Functor that is called for each element in the list
+@parm  node	pointer to member node within list element */
+template <typename List, class Functor>
+void
+ut_list_map(
+	List&		list,
+	ut_list_node<typename List::elem_type>
+			List::elem_type::*node,
+	Functor		functor)
+{
+	ulint		count = 0;
+
+	for (typename List::elem_type* elem = list.start;
+	     elem != 0;
+	     elem = (elem->*node).next, ++count) {
+
+		functor(elem);
+	}
+
+	ut_a(count == list.count);
+}
+
+/********************************************************************//**
+Checks the consistency of a two-way list.
+@param list	base node (not a pointer to it)
+@param functor	Functor that is called for each element in the list
+@parm  node	pointer to member node within list element */
+template <typename List, class Functor>
+void
+ut_list_validate(
+	List&		list,
+	ut_list_node<typename List::elem_type>
+			List::elem_type::*node,
+	Functor		functor = NullValidate())
+{
+	ut_list_map(list, node, functor);
+
+	ulint		count = 0;
+
+	for (typename List::elem_type* elem = list.end;
+	     elem != 0;
+	     elem = (elem->*node).prev, ++count) {
+
+		functor(elem);
+	}
+
+	ut_a(count == list.count);
+}
+
 /********************************************************************//**
 Checks the consistency of a two-way list.
 @param NAME		the name of the list
 @param TYPE		node type
-@param BASE		base node (not a pointer to it)
-@param ASSERTION	a condition on ut_list_node_313 */
-#define UT_LIST_VALIDATE(NAME, TYPE, BASE, ASSERTION)			\
-do {									\
-	ulint	ut_list_i_313;						\
-	TYPE*	ut_list_node_313;					\
-									\
-	ut_list_node_313 = (BASE).start;				\
-									\
-	for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) {		\
-		ut_a(ut_list_node_313);					\
-		ASSERTION;						\
-		ut_ad((ut_list_node_313->NAME).next || !ut_list_i_313);	\
-		ut_list_node_313 = (ut_list_node_313->NAME).next;	\
-	}								\
-									\
-	ut_a(ut_list_node_313 == NULL);					\
-									\
-	ut_list_node_313 = (BASE).end;					\
-									\
-	for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) {		\
-		ut_a(ut_list_node_313);					\
-		ASSERTION;						\
-		ut_ad((ut_list_node_313->NAME).prev || !ut_list_i_313);	\
-		ut_list_node_313 = (ut_list_node_313->NAME).prev;	\
-	}								\
-									\
-	ut_a(ut_list_node_313 == NULL);					\
-} while (0)
-
-#endif
+@param LIST		base node (not a pointer to it)
+@param FUNCTOR		called for each list element */
+#define UT_LIST_VALIDATE(NAME, TYPE, LIST, FUNCTOR)			\
+	ut_list_validate(LIST, &TYPE::NAME, FUNCTOR)
+
+#define UT_LIST_CHECK(NAME, TYPE, LIST)					\
+	ut_list_validate(LIST, &TYPE::NAME, NullValidate())
 
+#endif /* ut0lst.h */
diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h
index 39f5f20dc6d..af7eb4e9b1d 100644
--- a/storage/innobase/include/ut0mem.h
+++ b/storage/innobase/include/ut0mem.h
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -101,7 +101,7 @@ ut_free(
 	void* ptr);  /*!< in, own: memory block, can be NULL */
 #ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
-Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not
+Implements realloc. This is needed by /pars/lexyy.cc. Otherwise, you should not
 use this function because the allocation functions in mem0mem.h are the
 recommended ones in InnoDB.
 
@@ -211,6 +211,18 @@ ut_strreplace(
 	const char*	s1,	/*!< in: string to replace */
 	const char*	s2);	/*!< in: string to replace s1 with */
 
+/********************************************************************
+Concatenate 3 strings.*/
+
+char*
+ut_str3cat(
+/*=======*/
+				/* out, own: concatenated string, must be
+				freed with mem_free() */
+	const char*	s1,	/* in: string 1 */
+	const char*	s2,	/* in: string 2 */
+	const char*	s3);	/* in: string 3 */
+
 /**********************************************************************//**
 Converts a raw binary data to a NUL-terminated hex string. The output is
 truncated if there is not enough space in "hex", make sure "hex_size" is at
diff --git a/storage/innobase/include/ut0mem.ic b/storage/innobase/include/ut0mem.ic
index c06e2b3ae81..5c9071d52cc 100644
--- a/storage/innobase/include/ut0mem.ic
+++ b/storage/innobase/include/ut0mem.ic
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -280,7 +280,7 @@ ut_str_sql_format(
 		switch (ch) {
 		case '\0':
 
-			if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+			if (buf_size - buf_i < 4) {
 
 				goto func_exit;
 			}
@@ -292,7 +292,7 @@ ut_str_sql_format(
 		case '\'':
 		case '\\':
 
-			if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+			if (buf_size - buf_i < 4) {
 
 				goto func_exit;
 			}
diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h
index e26b637ae13..e8a4430e76b 100644
--- a/storage/innobase/include/ut0rbt.h
+++ b/storage/innobase/include/ut0rbt.h
@@ -1,12 +1,6 @@
 /***************************************************************************//**
 
-Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved.
-
-Portions of this file contain modifications contributed and copyrighted by
-Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
-are described briefly in the InnoDB documentation. The contributions by
-Sun Microsystems are incorporated with their permission, and subject to the
-conditions contained in the file COPYING.Sun_Microsystems.
+Copyright (c) 2007, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 /******************************************************************//**
@@ -57,6 +51,7 @@ typedef struct ib_rbt_node_struct ib_rbt_node_t;
 typedef struct ib_rbt_bound_struct ib_rbt_bound_t;
 typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
 typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2);
 
 /** Red black tree color types */
 enum ib_rbt_color_enum {
@@ -90,7 +85,11 @@ struct	ib_rbt_struct {
 	ulint		n_nodes;		/* Total number of data nodes */
 
 	ib_rbt_compare	compare;		/* Fn. to use for comparison */
+	ib_rbt_arg_compare
+			compare_with_arg;	/* Fn. to use for comparison
+						with argument */
 	ulint		sizeof_value;		/* Sizeof the item in bytes */
+	const void*	cmp_arg;		/* Compare func argument */
 };
 
 /** The result of searching for a key in the tree, this is useful for
@@ -133,6 +132,18 @@ rbt_create(
 	size_t		sizeof_value,		/*!< in: size in bytes */
 	ib_rbt_compare	compare);		/*!< in: comparator */
 /**********************************************************************//**
+Create an instance of a red black tree, whose comparison function takes
+an argument
+@return	rb tree instance */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create_arg_cmp(
+/*===============*/
+	size_t		sizeof_value,		/*!< in: size in bytes */
+	ib_rbt_arg_compare
+			compare,		/*!< in: comparator */
+	const void*	cmp_arg);		/*!< in: compare fn arg */
+/**********************************************************************//**
 Delete a node from the red black tree, identified by key */
 UNIV_INTERN
 ibool
@@ -265,7 +276,10 @@ rbt_search_cmp(
 	const ib_rbt_t*	tree,			/*!< in: rb tree */
 	ib_rbt_bound_t*	parent,			/*!< in: search bounds */
 	const void*	key,			/*!< in: key to search */
-	ib_rbt_compare	compare);		/*!< in: comparator */
+	ib_rbt_compare	compare,		/*!< in: comparator */
+	ib_rbt_arg_compare
+			arg_compare);		/*!< in: fn to compare items
+						with argument */
 /**********************************************************************//**
 Clear the tree, deletes (and free's) all the nodes. */
 UNIV_INTERN
diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h
index 946b1117af7..53b769849a5 100644
--- a/storage/innobase/include/ut0rnd.h
+++ b/storage/innobase/include/ut0rnd.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -28,6 +28,8 @@ Created 1/20/1994 Heikki Tuuri
 
 #include "univ.i"
 
+#ifndef UNIV_INNOCHECKSUM
+
 #include "ut0byte.h"
 
 /** The 'character code' for end of field or string (used
@@ -87,16 +89,6 @@ ut_hash_ulint(
 	ulint	 key,		/*!< in: value to be hashed */
 	ulint	 table_size);	/*!< in: hash table size */
 /*************************************************************//**
-Folds a pair of ulints.
-@return	folded value */
-UNIV_INLINE
-ulint
-ut_fold_ulint_pair(
-/*===============*/
-	ulint	n1,	/*!< in: ulint */
-	ulint	n2)	/*!< in: ulint */
-	__attribute__((const));
-/*************************************************************//**
 Folds a 64-bit integer.
 @return	folded value */
 UNIV_INLINE
@@ -114,16 +106,6 @@ ut_fold_string(
 /*===========*/
 	const char*	str)	/*!< in: null-terminated string */
 	__attribute__((pure));
-/*************************************************************//**
-Folds a binary string.
-@return	folded value */
-UNIV_INLINE
-ulint
-ut_fold_binary(
-/*===========*/
-	const byte*	str,	/*!< in: string of bytes */
-	ulint		len)	/*!< in: length */
-	__attribute__((pure));
 /***********************************************************//**
 Looks for a prime number slightly greater than the given argument.
 The prime is chosen so that it is not near any power of 2.
@@ -135,6 +117,29 @@ ut_find_prime(
 	ulint	n)	/*!< in: positive number > 100 */
 	__attribute__((const));
 
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+	ulint	n1,	/*!< in: ulint */
+	ulint	n2)	/*!< in: ulint */
+	__attribute__((const));
+/*************************************************************//**
+Folds a binary string.
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+	const byte*	str,	/*!< in: string of bytes */
+	ulint		len)	/*!< in: length */
+	__attribute__((pure));
+
 
 #ifndef UNIV_NONINL
 #include "ut0rnd.ic"
diff --git a/storage/innobase/include/ut0rnd.ic b/storage/innobase/include/ut0rnd.ic
index 795b8ab7a85..024c59e553b 100644
--- a/storage/innobase/include/ut0rnd.ic
+++ b/storage/innobase/include/ut0rnd.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -25,6 +25,9 @@ Created 5/30/1994 Heikki Tuuri
 
 #define UT_HASH_RANDOM_MASK	1463735687
 #define UT_HASH_RANDOM_MASK2	1653893711
+
+#ifndef UNIV_INNOCHECKSUM
+
 #define UT_RND1			151117737
 #define UT_RND2			119785373
 #define UT_RND3			 85689495
@@ -156,20 +159,6 @@ ut_hash_ulint(
 }
 
 /*************************************************************//**
-Folds a pair of ulints.
-@return	folded value */
-UNIV_INLINE
-ulint
-ut_fold_ulint_pair(
-/*===============*/
-	ulint	n1,	/*!< in: ulint */
-	ulint	n2)	/*!< in: ulint */
-{
-	return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
-		^ UT_HASH_RANDOM_MASK) + n2);
-}
-
-/*************************************************************//**
 Folds a 64-bit integer.
 @return	folded value */
 UNIV_INLINE
@@ -203,6 +192,22 @@ ut_fold_string(
 	return(fold);
 }
 
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+	ulint	n1,	/*!< in: ulint */
+	ulint	n2)	/*!< in: ulint */
+{
+	return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
+		^ UT_HASH_RANDOM_MASK) + n2);
+}
+
 /*************************************************************//**
 Folds a binary string.
 @return	folded value */
@@ -213,15 +218,37 @@ ut_fold_binary(
 	const byte*	str,	/*!< in: string of bytes */
 	ulint		len)	/*!< in: length */
 {
-	const byte*	str_end	= str + len;
 	ulint		fold = 0;
+	const byte*	str_end	= str + (len & 0xFFFFFFF8);
 
 	ut_ad(str || !len);
 
 	while (str < str_end) {
-		fold = ut_fold_ulint_pair(fold, (ulint)(*str));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	}
 
-		str++;
+	switch (len & 0x7) {
+	case 7:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 6:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 5:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 4:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 3:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 2:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+	case 1:
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
 	}
 
 	return(fold);
diff --git a/storage/innobase/include/ut0sort.h b/storage/innobase/include/ut0sort.h
index 5c6647dda9e..75648b5c317 100644
--- a/storage/innobase/include/ut0sort.h
+++ b/storage/innobase/include/ut0sort.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
index 47ab6eb9b74..35b8a580e68 100644
--- a/storage/innobase/include/ut0ut.h
+++ b/storage/innobase/include/ut0ut.h
@@ -1,13 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
-Copyright (c) 2009, Sun Microsystems, Inc.
-
-Portions of this file contain modifications contributed and copyrighted by
-Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
-are described briefly in the InnoDB documentation. The contributions by
-Sun Microsystems are incorporated with their permission, and subject to the
-conditions contained in the file COPYING.Sun_Microsystems.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -18,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -35,6 +28,8 @@ Created 1/20/1994 Heikki Tuuri
 
 #include "univ.i"
 
+#ifndef UNIV_INNOCHECKSUM
+
 #include "db0err.h"
 
 #ifndef UNIV_HOTBACKUP
@@ -46,6 +41,8 @@ Created 1/20/1994 Heikki Tuuri
 #include <ctype.h>
 #endif
 
+#include <stdarg.h> /* for va_list */
+
 /** Index name prefix in fast index creation */
 #define	TEMP_INDEX_PREFIX	'\377'
 /** Index name prefix in fast index creation, as a string constant */
@@ -55,27 +52,32 @@ Created 1/20/1994 Heikki Tuuri
 typedef time_t	ib_time_t;
 
 #ifndef UNIV_HOTBACKUP
-#if defined(HAVE_PAUSE_INSTRUCTION)
+# if defined(HAVE_PAUSE_INSTRUCTION)
    /* According to the gcc info page, asm volatile means that the
    instruction has important side-effects and must not be removed.
    Also asm volatile may trigger a memory barrier (spilling all registers
    to memory). */
-#  define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
-#elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
+#  ifdef __SUNPRO_CC
+#   define UT_RELAX_CPU() asm ("pause" )
+#  else
+#   define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
+#  endif /* __SUNPRO_CC */
+
+# elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
 #  define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop")
-#elif defined(HAVE_WINDOWS_ATOMICS)
-   /* In the Win32 API, the x86 PAUSE instruction is executed by calling
-   the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
-   independent way by using YieldProcessor. */
-#  define UT_RELAX_CPU() YieldProcessor()
-#elif defined(HAVE_ATOMIC_BUILTINS)
+# elif defined(HAVE_ATOMIC_BUILTINS)
 #  define UT_RELAX_CPU() do { \
      volatile lint	volatile_var; \
      os_compare_and_swap_lint(&volatile_var, 0, 1); \
    } while (0)
-#else
+# elif defined(HAVE_WINDOWS_ATOMICS)
+   /* In the Win32 API, the x86 PAUSE instruction is executed by calling
+   the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
+   independent way by using YieldProcessor. */
+#  define UT_RELAX_CPU() YieldProcessor()
+# else
 #  define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */
-#endif
+# endif
 
 /*********************************************************************//**
 Delays execution for at most max_wait_us microseconds or returns earlier
@@ -94,16 +96,9 @@ do {								\
 } while (0)
 #endif /* !UNIV_HOTBACKUP */
 
-/********************************************************//**
-Gets the high 32 bits in a ulint. That is makes a shift >> 32,
-but since there seem to be compiler bugs in both gcc and Visual C++,
-we do this by a special conversion.
-@return	a >> 32 */
-UNIV_INTERN
-ulint
-ut_get_high32(
-/*==========*/
-	ulint	a);	/*!< in: ulint */
+template <class T> T ut_min(T a, T b) { return(a < b ? a : b); }
+template <class T> T ut_max(T a, T b) { return(a > b ? a : b); }
+
 /******************************************************//**
 Calculates the minimum of two ulints.
 @return	minimum */
@@ -261,6 +256,16 @@ ut_time_ms(void);
 #endif /* !UNIV_HOTBACKUP */
 
 /**********************************************************//**
+Returns the number of milliseconds since some epoch.  The
+value may wrap around.  It should only be used for heuristic
+purposes.
+@return ms since epoch */
+UNIV_INTERN
+ulint
+ut_time_ms(void);
+/*============*/
+
+/**********************************************************//**
 Returns the difference of two times in seconds.
 @return	time2 - time1 expressed in seconds */
 UNIV_INTERN
@@ -269,6 +274,9 @@ ut_difftime(
 /*========*/
 	ib_time_t	time2,	/*!< in: time */
 	ib_time_t	time1);	/*!< in: time */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
 /**********************************************************//**
 Prints a timestamp to a file. */
 UNIV_INTERN
@@ -277,6 +285,9 @@ ut_print_timestamp(
 /*===============*/
 	FILE*	file)	/*!< in: file where to print */
 	UNIV_COLD __attribute__((nonnull));
+
+#ifndef UNIV_INNOCHECKSUM
+
 /**********************************************************//**
 Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
 UNIV_INTERN
@@ -379,6 +390,22 @@ ut_copy_file(
 
 #ifdef __WIN__
 /**********************************************************************//**
+A substitute for vsnprintf(3), formatted output conversion into
+a limited buffer. Note: this function DOES NOT return the number of
+characters that would have been printed if the buffer was unlimited because
+VC's _vsnprintf() returns -1 in this case and we would need to call
+_vscprintf() in addition to estimate that but we would need another copy
+of "ap" for that and VC does not provide va_copy(). */
+UNIV_INTERN
+void
+ut_vsnprintf(
+/*=========*/
+	char*		str,	/*!< out: string */
+	size_t		size,	/*!< in: str size */
+	const char*	fmt,	/*!< in: format */
+	va_list		ap);	/*!< in: format values */
+
+/**********************************************************************//**
 A substitute for snprintf(3), formatted output conversion into
 a limited buffer.
 @return number of characters that would have been printed if the size
@@ -393,6 +420,15 @@ ut_snprintf(
 	...);			/*!< in: format values */
 #else
 /**********************************************************************//**
+A wrapper for vsnprintf(3), formatted output conversion into
+a limited buffer. Note: this function DOES NOT return the number of
+characters that would have been printed if the buffer was unlimited because
+VC's _vsnprintf() returns -1 in this case and we would need to call
+_vscprintf() in addition to estimate that but we would need another copy
+of "ap" for that and VC does not provide va_copy(). */
+# define ut_vsnprintf(buf, size, fmt, ap)	\
+	((void) vsnprintf(buf, size, fmt, ap))
+/**********************************************************************//**
 A wrapper for snprintf(3), formatted output conversion into
 a limited buffer. */
 # define ut_snprintf	snprintf
@@ -408,9 +444,23 @@ ut_strerr(
 /*======*/
 	enum db_err	num);	/*!< in: error number */
 
+/****************************************************************
+Sort function for ulint arrays. */
+UNIV_INTERN
+void
+ut_ulint_sort(
+/*==========*/
+	ulint*	arr,		/*!< in/out: array to sort */
+	ulint*	aux_arr,	/*!< in/out: aux array to use in sort */
+	ulint	low,		/*!< in: lower bound */
+	ulint	high)		/*!< in: upper bound */
+	__attribute__((nonnull));
+
 #ifndef UNIV_NONINL
 #include "ut0ut.ic"
 #endif
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 #endif
 
diff --git a/storage/innobase/include/ut0ut.ic b/storage/innobase/include/ut0ut.ic
index 6f55c7e410e..4e0f76e1957 100644
--- a/storage/innobase/include/ut0ut.ic
+++ b/storage/innobase/include/ut0ut.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h
index 0f8b955b098..f2a5aba8116 100644
--- a/storage/innobase/include/ut0vec.h
+++ b/storage/innobase/include/ut0vec.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -29,59 +29,116 @@ Created 4/6/2006 Osku Salerma
 #include "univ.i"
 #include "mem0mem.h"
 
-/** An automatically resizing vector data type. */
+typedef struct ib_alloc_struct ib_alloc_t;
 typedef struct ib_vector_struct ib_vector_t;
 
-/* An automatically resizing vector datatype with the following properties:
+typedef void* (*ib_mem_alloc_t)(
+					/* out: Pointer to allocated memory */
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator instance */
+	ulint		size);		/* in: Number of bytes to allocate */
+
+typedef void (*ib_mem_free_t)(
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator instance */
+	void*		ptr);		/* in: Memory to free */
 
- -Contains void* items.
+typedef void* (*ib_mem_resize_t)(
+					/* out: Pointer to resized memory */
+	ib_alloc_t*	allocator,	/* in: Pointer to allocator */
+	void*		ptr,		/* in: Memory to resize */
+	ulint		old_size,	/* in: Old memory size in bytes */
+	ulint		new_size);	/* in: New size in bytes */
 
- -The items are owned by the caller.
+typedef int (*ib_compare_t)(const void*, const void*);
 
- -All memory allocation is done through a heap owned by the caller, who is
- responsible for freeing it when done with the vector.
+/* An automatically resizing vector datatype with the following properties:
 
- -When the vector is resized, the old memory area is left allocated since it
- uses the same heap as the new memory area, so this is best used for
- relatively small or short-lived uses.
+ -All memory allocation is done through an allocator, which is  responsible for
+freeing it when done with the vector.
 */
 
-/****************************************************************//**
-Create a new vector with the given initial size.
-@return	vector */
-UNIV_INTERN
+/* This is useful shorthand for elements of type void* */
+#define	ib_vector_getp(v, n)	(*(void**) ib_vector_get(v, n))
+#define	ib_vector_getp_const(v, n)	(*(void**) ib_vector_get_const(v, n))
+
+#define ib_vector_allocator(v)	(v->allocator)
+
+/********************************************************************
+Create a new vector with the given initial size. */
+
 ib_vector_t*
 ib_vector_create(
 /*=============*/
-	mem_heap_t*	heap,	/*!< in: heap */
-	ulint		size);	/*!< in: initial size */
+					/* out: vector */
+	ib_alloc_t*	alloc,		/* in: Allocator */
+					/* in: size of the data item */
+	ulint		sizeof_value,
+	ulint		size);		/* in: initial size */
 
-/****************************************************************//**
-Push a new element to the vector, increasing its size if necessary. */
-UNIV_INTERN
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the the heap allocator. */
+UNIV_INLINE
 void
+ib_vector_free(
+/*===========*/
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/********************************************************************
+Push a new element to the vector, increasing its size if necessary,
+if elem is not NULL then elem is copied to the vector.*/
+UNIV_INLINE
+void*
 ib_vector_push(
 /*===========*/
+					/* out: pointer the "new" element */
+	ib_vector_t*	vec,		/* in/out: vector */
+	const void*	elem);		/* in: data element */
+
+/********************************************************************
+Pop the last element from the vector.*/
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+					/* out: pointer to the "new" element */
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/*******************************************************************//**
+Remove an element to the vector
+@return pointer to the "removed" element */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
 	ib_vector_t*	vec,	/*!< in: vector */
-	void*		elem);	/*!< in: data element */
+	const void*	elem);	/*!< in: value to remove */
 
-/****************************************************************//**
-Get the number of elements in the vector.
-@return	number of elements in vector */
+/********************************************************************
+Get the number of elements in the vector. */
 UNIV_INLINE
 ulint
 ib_vector_size(
 /*===========*/
-	const ib_vector_t*	vec);	/*!< in: vector */
+					/* out: number of elements in vector */
+	const ib_vector_t*	vec);	/* in: vector */
 
-/****************************************************************//**
+/********************************************************************
+Increase the size of the vector. */
+
+void
+ib_vector_resize(
+/*=============*/
+					/* out: number of elements in vector */
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/********************************************************************
 Test whether a vector is empty or not.
-@return	TRUE if empty */
+@return TRUE if empty */
 UNIV_INLINE
 ibool
 ib_vector_is_empty(
 /*===============*/
-	const ib_vector_t*	vec);	/*!< in: vector */
+	const ib_vector_t*	vec);    /*!< in: vector */
 
 /****************************************************************//**
 Get the n'th element.
@@ -93,6 +150,15 @@ ib_vector_get(
 	ib_vector_t*	vec,	/*!< in: vector */
 	ulint		n);	/*!< in: element index to get */
 
+/********************************************************************
+Const version of the get n'th element.
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+	const ib_vector_t*	vec,	/* in: vector */
+	ulint			n);	/* in: element index to get */
 /****************************************************************//**
 Get last element. The vector must not be empty.
 @return	last element */
@@ -101,7 +167,6 @@ void*
 ib_vector_get_last(
 /*===============*/
 	ib_vector_t*	vec);	/*!< in: vector */
-
 /****************************************************************//**
 Set the n'th element. */
 UNIV_INLINE
@@ -112,33 +177,161 @@ ib_vector_set(
 	ulint		n,	/*!< in: element index to set */
 	void*		elem);	/*!< in: data element */
 
-/****************************************************************//**
-Remove the last element from the vector. */
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+	ib_vector_t*	vec);		/* in/out: vector */
+
+/********************************************************************
+Get the last element of the vector. */
 UNIV_INLINE
 void*
-ib_vector_pop(
-/*==========*/
-	ib_vector_t*	vec);	/*!< in: vector */
+ib_vector_last(
+/*===========*/
+					/* out: pointer to last element */
+	ib_vector_t*	vec);		/* in/out: vector */
 
-/****************************************************************//**
-Free the underlying heap of the vector. Note that vec is invalid
-after this call. */
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+					/* out: pointer to last element */
+	const ib_vector_t*	vec);	/* in: vector */
+
+/********************************************************************
+Sort the vector elements. */
 UNIV_INLINE
 void
-ib_vector_free(
+ib_vector_sort(
+/*===========*/
+	ib_vector_t*	vec,		/* in/out: vector */
+	ib_compare_t	compare);	/* in: the comparator to use for sort */
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		ptr);		/* in: size in bytes */
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+					/* out: pointer to allocated memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	ulint		size);		/* in: size in bytes */
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
 /*===========*/
-	ib_vector_t*	vec);	/*!< in,own: vector */
+					/* out: pointer to reallocated
+					memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size,	/* in: old size in bytes */
+	ulint		new_size);	/* in: new size in bytes */
 
-/** An automatically resizing vector data type. */
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+					/* out: heap allocator instance */
+	mem_heap_t*	heap);		/* in: heap to use */
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+	ib_alloc_t*	ib_ut_alloc);	/* in: alloc instace to free */
+
+/********************************************************************
+Wrapper for ut_free(). */
+UNIV_INLINE
+void
+ib_ut_free(
+/*=======*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		ptr);		/* in: size in bytes */
+
+/********************************************************************
+Wrapper for ut_malloc(). */
+UNIV_INLINE
+void*
+ib_ut_malloc(
+/*=========*/
+					/* out: pointer to allocated memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	ulint		size);		/* in: size in bytes */
+
+/********************************************************************
+Wrapper for ut_realloc(). */
+UNIV_INLINE
+void*
+ib_ut_resize(
+/*=========*/
+					/* out: pointer to reallocated
+					memory */
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size,	/* in: old size in bytes */
+	ulint		new_size);	/* in: new size in bytes */
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_ut_allocator_create(void);
+/*=========================*/
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+void
+ib_ut_allocator_free(
+/*=================*/
+	ib_alloc_t*	ib_ut_alloc);	/* in: alloc instace to free */
+
+/* Allocator used by ib_vector_t. */
+struct ib_alloc_struct {
+	ib_mem_alloc_t	mem_malloc;	/* For allocating memory */
+	ib_mem_free_t	mem_release;	/* For freeing memory */
+	ib_mem_resize_t	mem_resize;	/* For resizing memory */
+	void*		arg;		/* Currently if not NULL then it
+					points to the heap instance */
+};
+
+/* See comment at beginning of file. */
 struct ib_vector_struct {
-	mem_heap_t*	heap;	/*!< heap */
-	void**		data;	/*!< data elements */
-	ulint		used;	/*!< number of elements currently used */
-	ulint		total;	/*!< number of elements allocated */
+	ib_alloc_t*	allocator;	/* Allocator, because one size
+					doesn't fit all */
+	void*		data;		/* data elements */
+	ulint		used;		/* number of elements currently used */
+	ulint		total;		/* number of elements allocated */
+					/* Size of a data item */
+	ulint		sizeof_value;
 };
 
 #ifndef UNIV_NONINL
 #include "ut0vec.ic"
 #endif
 
-#endif
+#endif /* IB_VECTOR_H */
diff --git a/storage/innobase/include/ut0vec.ic b/storage/innobase/include/ut0vec.ic
index 34c858868ce..1255caee2d9 100644
--- a/storage/innobase/include/ut0vec.ic
+++ b/storage/innobase/include/ut0vec.ic
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -23,21 +23,169 @@ A vector of pointers to data items
 Created 4/6/2006 Osku Salerma
 ************************************************************************/
 
-/****************************************************************//**
-Get number of elements in vector.
-@return	number of elements in vector */
+#define	IB_VEC_OFFSET(v, i)	(vec->sizeof_value * i)
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	ulint		size)		/* in: size in bytes */
+{
+	mem_heap_t*	heap = (mem_heap_t*) allocator->arg;
+
+	return(mem_heap_alloc(heap, size));
+}
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	void*		ptr UNIV_UNUSED)	/* in: size in bytes */
+{
+	/* We can't free individual elements. */
+}
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+	ib_alloc_t*	allocator,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size,	/* in: old size in bytes */
+	ulint		new_size)	/* in: new size in bytes */
+{
+	void*		new_ptr;
+	mem_heap_t*	heap = (mem_heap_t*) allocator->arg;
+
+	new_ptr = mem_heap_alloc(heap, new_size);
+	memcpy(new_ptr, old_ptr, old_size);
+
+	return(new_ptr);
+}
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+	mem_heap_t*	heap)		/* in: heap to use */
+{
+	ib_alloc_t*	heap_alloc;
+
+	heap_alloc = (ib_alloc_t*) mem_heap_alloc(heap, sizeof(*heap_alloc));
+
+	heap_alloc->arg = heap;
+	heap_alloc->mem_release = ib_heap_free;
+	heap_alloc->mem_malloc = ib_heap_malloc;
+	heap_alloc->mem_resize = ib_heap_resize;
+
+	return(heap_alloc);
+}
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+	ib_alloc_t*	ib_ut_alloc)	/* in: alloc instace to free */
+{
+	mem_heap_free((mem_heap_t*) ib_ut_alloc->arg);
+}
+
+/********************************************************************
+Wrapper around ut_malloc(). */
+UNIV_INLINE
+void*
+ib_ut_malloc(
+/*=========*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	ulint		size)			/* in: size in bytes */
+{
+	return(ut_malloc(size));
+}
+
+/********************************************************************
+Wrapper around ut_free(). */
+UNIV_INLINE
+void
+ib_ut_free(
+/*=======*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	void*		ptr)			/* in: size in bytes */
+{
+	ut_free(ptr);
+}
+
+/********************************************************************
+Wrapper aroung ut_realloc(). */
+UNIV_INLINE
+void*
+ib_ut_resize(
+/*=========*/
+	ib_alloc_t*	allocator UNIV_UNUSED,	/* in: allocator */
+	void*		old_ptr,	/* in: pointer to memory */
+	ulint		old_size UNIV_UNUSED,/* in: old size in bytes */
+	ulint		new_size)	/* in: new size in bytes */
+{
+	return(ut_realloc(old_ptr, new_size));
+}
+
+/********************************************************************
+Create a ut allocator. */
+UNIV_INLINE
+ib_alloc_t*
+ib_ut_allocator_create(void)
+/*========================*/
+{
+	ib_alloc_t*	ib_ut_alloc;
+
+	ib_ut_alloc = (ib_alloc_t*) ut_malloc(sizeof(*ib_ut_alloc));
+
+	ib_ut_alloc->arg = NULL;
+	ib_ut_alloc->mem_release = ib_ut_free;
+	ib_ut_alloc->mem_malloc = ib_ut_malloc;
+	ib_ut_alloc->mem_resize = ib_ut_resize;
+
+	return(ib_ut_alloc);
+}
+
+/********************************************************************
+Free a ut allocator. */
+UNIV_INLINE
+void
+ib_ut_allocator_free(
+/*=================*/
+	ib_alloc_t*	ib_ut_alloc)	/* in: alloc instace to free */
+{
+	ut_free(ib_ut_alloc);
+}
+
+/********************************************************************
+Get number of elements in vector. */
 UNIV_INLINE
 ulint
 ib_vector_size(
 /*===========*/
-	const ib_vector_t*	vec)	/*!< in: vector */
+					/* out: number of elements in vector*/
+	const ib_vector_t*	vec)	/* in: vector */
 {
 	return(vec->used);
 }
 
 /****************************************************************//**
-Get n'th element.
-@return	n'th element */
+Get n'th element. */
 UNIV_INLINE
 void*
 ib_vector_get(
@@ -47,9 +195,23 @@ ib_vector_get(
 {
 	ut_a(n < vec->used);
 
-	return(vec->data[n]);
+	return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
 }
 
+/********************************************************************
+Const version of the get n'th element.
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+	const ib_vector_t*	vec,	/* in: vector */
+	ulint			n)	/* in: element index to get */
+{
+	ut_a(n < vec->used);
+
+	return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+}
 /****************************************************************//**
 Get last element. The vector must not be empty.
 @return	last element */
@@ -61,7 +223,7 @@ ib_vector_get_last(
 {
 	ut_a(vec->used > 0);
 
-	return(vec->data[vec->used - 1]);
+	return((byte*) ib_vector_get(vec, vec->used - 1));
 }
 
 /****************************************************************//**
@@ -74,9 +236,52 @@ ib_vector_set(
 	ulint		n,	/*!< in: element index to set */
 	void*		elem)	/*!< in: data element */
 {
+	void*		slot;
+
 	ut_a(n < vec->used);
 
-	vec->data[n] = elem;
+	slot = ((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+	memcpy(slot, elem, vec->sizeof_value);
+}
+
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+					/* out: void */
+	ib_vector_t*	vec)		/* in: vector */
+{
+	vec->used = 0;
+}
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+					/* out: void */
+	ib_vector_t*	vec)		/* in: vector */
+{
+	ut_a(ib_vector_size(vec) > 0);
+
+	return(ib_vector_get(vec, ib_vector_size(vec) - 1));
+}
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+					/* out: void */
+	const ib_vector_t*	vec)	/* in: vector */
+{
+	ut_a(ib_vector_size(vec) > 0);
+
+	return(ib_vector_get_const(vec, ib_vector_size(vec) - 1));
 }
 
 /****************************************************************//**
@@ -86,35 +291,129 @@ UNIV_INLINE
 void*
 ib_vector_pop(
 /*==========*/
-	ib_vector_t*    vec)    /*!< in/out: vector */
+				/* out: pointer to element */
+	ib_vector_t*	vec)	/* in: vector */
 {
-	void*           elem;
+	void*		elem;
 
 	ut_a(vec->used > 0);
-	--vec->used;
-	elem = vec->data[vec->used];
 
-	ut_d(vec->data[vec->used] = NULL);
-	UNIV_MEM_INVALID(&vec->data[vec->used], sizeof(*vec->data));
+	elem = ib_vector_last(vec);
+	--vec->used;
 
 	return(elem);
 }
 
-/****************************************************************//**
-Free the underlying heap of the vector. Note that vec is invalid
-after this call. */
+/********************************************************************
+Append an element to the vector, if elem != NULL then copy the data
+from elem.*/
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+				/* out: pointer to the "new" element */
+	ib_vector_t*	vec,	/* in: vector */
+	const void*	elem)	/* in: element to add (can be NULL) */
+{
+	void*		last;
+
+	if (vec->used >= vec->total) {
+		ib_vector_resize(vec);
+	}
+
+	last = (byte*) vec->data + IB_VEC_OFFSET(vec, vec->used);
+
+#ifdef UNIV_DEBUG
+	memset(last, 0, vec->sizeof_value);
+#endif
+
+	if (elem) {
+		memcpy(last, elem, vec->sizeof_value);
+	}
+
+	++vec->used;
+
+	return(last);
+}
+
+/*******************************************************************//**
+Remove an element to the vector
+@return pointer to the "removed" element */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	const void*	elem)	/*!< in: value to remove */
+{
+	void*		current;
+	void*		next;
+	ulint		i;
+
+	for (i = 0; i < vec->used; i++) {
+		current = ib_vector_get(vec, i);
+
+		if (*(void**) current == elem) {
+			if (i == vec->used - 1) {
+				return(ib_vector_pop(vec));
+			}
+
+			next = ib_vector_get(vec, i + 1);
+			memcpy(current, next, vec->sizeof_value
+			       * (vec->used - i - 1));
+		}
+	}
+
+	--vec->used;
+
+	return(current);
+}
+
+/********************************************************************
+Sort the vector elements. */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+				/* out: void */
+	ib_vector_t*	vec,	/* in: vector */
+	ib_compare_t	compare)/* in: the comparator to use for sort */
+{
+	qsort(vec->data, vec->used, vec->sizeof_value, compare);
+}
+
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the the heap allocator. */
 UNIV_INLINE
 void
 ib_vector_free(
 /*===========*/
-	ib_vector_t*    vec)    /*!< in, own: vector */
+	ib_vector_t*	vec)		/* in, own: vector */
 {
-	mem_heap_free(vec->heap);
+	/* Currently we only support two types of allocators, heap
+	and ut_malloc(), when the heap is freed all the elements are
+	freed too. With ut allocator, we need to free the elements,
+	the vector instance and the allocator separately. */
+
+	/* Only the heap allocator uses the arg field. */
+	if (vec->allocator->arg) {
+		mem_heap_free((mem_heap_t*) vec->allocator->arg);
+	} else {
+		ib_alloc_t*	allocator;
+
+		allocator = vec->allocator;
+
+		allocator->mem_release(allocator, vec->data);
+		allocator->mem_release(allocator, vec);
+
+		ib_ut_allocator_free(allocator);
+	}
 }
 
-/****************************************************************//**
+/********************************************************************
 Test whether a vector is empty or not.
-@return	TRUE if empty */
+@return TRUE if empty */
 UNIV_INLINE
 ibool
 ib_vector_is_empty(
diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h
index 2ec0f16ab05..ed4e65e4dc6 100644
--- a/storage/innobase/include/ut0wqueue.h
+++ b/storage/innobase/include/ut0wqueue.h
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -66,6 +66,16 @@ ib_wqueue_add(
 	mem_heap_t*	heap);	/*!< in: memory heap to use for allocating the
 				list node */
 
+/********************************************************************
+Check if queue is empty. */
+
+ibool
+ib_wqueue_is_empty(
+/*===============*/
+					/* out: TRUE if queue empty
+					else FALSE */
+	const ib_wqueue_t*      wq);    /* in: work queue */
+
 /****************************************************************//**
 Wait for a work item to appear in the queue.
 @return	work item */
@@ -75,6 +85,16 @@ ib_wqueue_wait(
 /*===========*/
 	ib_wqueue_t*	wq);	/*!< in: work queue */
 
+/********************************************************************
+Wait for a work item to appear in the queue for specified time. */
+
+void*
+ib_wqueue_timedwait(
+/*================*/
+					/* out: work item or NULL on timeout*/
+	ib_wqueue_t*	wq,		/* in: work queue */
+	ib_time_t	wait_in_usecs); /* in: wait time in micro seconds */
+
 /* Work queue. */
 struct ib_wqueue_struct {
 	mutex_t		mutex;	/*!< mutex protecting everything */
diff --git a/storage/innobase/lock/lock0iter.c b/storage/innobase/lock/lock0iter.cc
index 51d1802ccde..b424d2fc757 100644
--- a/storage/innobase/lock/lock0iter.c
+++ b/storage/innobase/lock/lock0iter.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file lock/lock0iter.c
+@file lock/lock0iter.cc
 Lock queue iterator. Can iterate over table and record
 lock queues.
 
@@ -32,9 +32,6 @@ Created July 16, 2007 Vasil Dimov
 #include "lock0priv.h"
 #include "ut0dbg.h"
 #include "ut0lst.h"
-#ifdef UNIV_DEBUG
-# include "srv0srv.h" /* kernel_mutex */
-#endif /* UNIV_DEBUG */
 
 /*******************************************************************//**
 Initialize lock queue iterator so that it starts to iterate from
@@ -54,7 +51,7 @@ lock_queue_iterator_reset(
 	ulint			bit_no)	/*!< in: record number in the
 					heap */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	iter->current_lock = lock;
 
@@ -90,7 +87,7 @@ lock_queue_iterator_get_prev(
 {
 	const lock_t*	prev_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	switch (lock_get_type_low(iter->current_lock)) {
 	case LOCK_REC:
diff --git a/storage/innobase/lock/lock0lock.c b/storage/innobase/lock/lock0lock.cc
index 66ec18aeee7..18fbfbee91b 100644
--- a/storage/innobase/lock/lock0lock.c
+++ b/storage/innobase/lock/lock0lock.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file lock/lock0lock.c
+@file lock/lock0lock.cc
 The transaction lock system
 
 Created 5/7/1996 Heikki Tuuri
@@ -38,20 +38,27 @@ Created 5/7/1996 Heikki Tuuri
 #include "trx0purge.h"
 #include "dict0mem.h"
 #include "trx0sys.h"
+#include "pars0pars.h" /* pars_complete_graph_for_exec() */
+#include "que0que.h" /* que_node_get_parent() */
+#include "row0mysql.h" /* row_mysql_handle_errors() */
+#include "row0sel.h" /* sel_node_create(), sel_node_struct */
+#include "row0types.h" /* sel_node_t */
+#include "srv0mon.h"
+#include "ut0vec.h"
 #include "btr0btr.h"
 
 /* Restricts the length of search we will do in the waits-for
 graph of transactions */
 #define LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK 1000000
 
-/* Restricts the recursion depth of the search we will do in the waits-for
-graph of transactions */
+/* Restricts the search depth we will do in the waits-for graph of
+transactions */
 #define LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK 200
 
 /* When releasing transaction locks, this specifies how often we release
-the kernel mutex for a moment to give also others access to it */
+the lock mutex for a moment to give also others access to it */
 
-#define LOCK_RELEASE_KERNEL_INTERVAL	1000
+#define LOCK_RELEASE_INTERVAL		1000
 
 /* Safety margin when creating a new record lock: this many extra records
 can be inserted to the page without need to create a lock with a bigger
@@ -293,6 +300,8 @@ waiting, in its lock queue. Solution: We can copy the locks as gap type
 locks, so that also the waiting locks are transformed to granted gap type
 locks on the inserted record. */
 
+#define LOCK_STACK_SIZE		OS_THREAD_MAX_N
+
 /* LOCK COMPATIBILITY MATRIX
  *    IS IX S  X  AI
  * IS +	 +  +  -  +
@@ -308,18 +317,14 @@ locks on the inserted record. */
  * statement-level MySQL binlog.
  * See also lock_mode_compatible().
  */
-#define LK(a,b) (1 << ((a) * LOCK_NUM + (b)))
-#define LKS(a,b) LK(a,b) | LK(b,a)
-
-/* Define the lock compatibility matrix in a ulint.  The first line below
-defines the diagonal entries.  The following lines define the compatibility
-for LOCK_IX, LOCK_S, and LOCK_AUTO_INC using LKS(), since the matrix
-is symmetric. */
-#define LOCK_MODE_COMPATIBILITY 0					\
- | LK(LOCK_IS, LOCK_IS) | LK(LOCK_IX, LOCK_IX) | LK(LOCK_S, LOCK_S)	\
- | LKS(LOCK_IX, LOCK_IS) | LKS(LOCK_IS, LOCK_AUTO_INC)			\
- | LKS(LOCK_S, LOCK_IS)							\
- | LKS(LOCK_AUTO_INC, LOCK_IS) | LKS(LOCK_AUTO_INC, LOCK_IX)
+static const byte lock_compatibility_matrix[5][5] = {
+ /**         IS     IX       S     X       AI */
+ /* IS */ {  TRUE,  TRUE,  TRUE,  FALSE,  TRUE},
+ /* IX */ {  TRUE,  TRUE,  FALSE, FALSE,  TRUE},
+ /* S  */ {  TRUE,  FALSE, TRUE,  FALSE,  FALSE},
+ /* X  */ {  FALSE, FALSE, FALSE, FALSE,  FALSE},
+ /* AI */ {  TRUE,  TRUE,  FALSE, FALSE,  FALSE}
+};
 
 /* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column)
  *    IS IX S  X  AI
@@ -330,17 +335,60 @@ is symmetric. */
  * AI -  -  -  -  +
  * See lock_mode_stronger_or_eq().
  */
+static const byte lock_strength_matrix[5][5] = {
+ /**         IS     IX       S     X       AI */
+ /* IS */ {  TRUE,  FALSE, FALSE,  FALSE, FALSE},
+ /* IX */ {  TRUE,  TRUE,  FALSE, FALSE,  FALSE},
+ /* S  */ {  TRUE,  FALSE, TRUE,  FALSE,  FALSE},
+ /* X  */ {  TRUE,  TRUE,  TRUE,  TRUE,   TRUE},
+ /* AI */ {  FALSE, FALSE, FALSE, FALSE,  TRUE}
+};
+
+/** Deadlock check context. */
+typedef struct lock_deadlock_ctx_struct lock_deadlock_ctx_t;
+
+/** Deadlock check context. */
+struct lock_deadlock_ctx_struct {
+	const trx_t*	start;		/*!< Joining transaction that is
+					requesting a lock in an incompatible
+					mode */
+
+	const lock_t*	wait_lock;	/*!< Lock that trx wants */
+
+	ib_uint64_t	mark_start;	/*!<  Value of lock_mark_count at
+					the start of the deadlock check. */
+
+	ulint		depth;		/*!< Stack depth */
+
+	ulint		cost;		/*!< Calculation steps thus far */
+
+	ibool		too_deep;	/*!< TRUE if search was too deep and
+					was aborted */
+};
+
+typedef struct lock_stack_struct lock_stack_t;
+
+/** DFS visited node information used during deadlock checking. */
+struct lock_stack_struct {
+	const lock_t*	lock;			/*!< Current lock */
+	const lock_t*	wait_lock;		/*!< Waiting for lock */
+	unsigned	heap_no:16;		/*!< heap number if rec lock */
+};
+
+/** Stack to use during DFS search. Currently only a single stack is required
+because there is no parallel deadlock check. This stack is protected by
+the lock_sys_t::mutex. */
+static lock_stack_t*	lock_stack;
+
+/** The count of the types of locks. */
+static const ulint	lock_types = UT_ARR_SIZE(lock_compatibility_matrix);
 
-/* Define the stronger-or-equal lock relation in a ulint.  This relation
-contains all pairs LK(mode1, mode2) where mode1 is stronger than or
-equal to mode2. */
-#define LOCK_MODE_STRONGER_OR_EQ 0					\
- | LK(LOCK_IS, LOCK_IS)							\
- | LK(LOCK_IX, LOCK_IS) | LK(LOCK_IX, LOCK_IX)				\
- | LK(LOCK_S, LOCK_IS) | LK(LOCK_S, LOCK_S)				\
- | LK(LOCK_AUTO_INC, LOCK_AUTO_INC)					\
- | LK(LOCK_X, LOCK_IS) | LK(LOCK_X, LOCK_IX) | LK(LOCK_X, LOCK_S)	\
- | LK(LOCK_X, LOCK_AUTO_INC) | LK(LOCK_X, LOCK_X)
+#ifdef UNIV_PFS_MUTEX
+/* Key to register mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	lock_sys_mutex_key;
+/* Key to register mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	lock_sys_wait_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
 
 #ifdef UNIV_DEBUG
 UNIV_INTERN ibool	lock_print_waits	= FALSE;
@@ -349,9 +397,9 @@ UNIV_INTERN ibool	lock_print_waits	= FALSE;
 Validates the lock system.
 @return	TRUE if ok */
 static
-ibool
-lock_validate(void);
-/*===============*/
+bool
+lock_validate();
+/*============*/
 
 /*********************************************************************//**
 Validates the record lock queues on a page.
@@ -370,45 +418,22 @@ UNIV_INTERN lock_sys_t*	lock_sys	= NULL;
 /* We store info on the latest deadlock error to this buffer. InnoDB
 Monitor will then fetch it and print */
 UNIV_INTERN ibool	lock_deadlock_found = FALSE;
-UNIV_INTERN FILE*	lock_latest_err_file;
-
-/* Flags for recursive deadlock search */
-#define LOCK_VICTIM_IS_START	1
-#define LOCK_VICTIM_IS_OTHER	2
-#define LOCK_EXCEED_MAX_DEPTH	3
+static FILE*		lock_latest_err_file;
 
 /********************************************************************//**
-Checks if a lock request results in a deadlock.
-@return TRUE if a deadlock was detected and we chose trx as a victim;
-FALSE if no deadlock, or there was a deadlock, but we chose other
-transaction(s) as victim(s) */
-static
-ibool
-lock_deadlock_occurs(
-/*=================*/
-	lock_t*	lock,	/*!< in: lock the transaction is requesting */
-	trx_t*	trx);	/*!< in: transaction */
-/********************************************************************//**
-Looks recursively for a deadlock.
-@return 0 if no deadlock found, LOCK_VICTIM_IS_START if there was a
-deadlock and we chose 'start' as the victim, LOCK_VICTIM_IS_OTHER if a
-deadlock was found and we chose some other trx as a victim: we must do
-the search again in this last case because there may be another
-deadlock!
-LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
+Checks if a joining lock request results in a deadlock. If a deadlock is
+found this function will resolve the dadlock by choosing a victim transaction
+and rolling it back. It will attempt to resolve all deadlocks. The returned
+transaction id will be the joining transaction id or 0 if some other
+transaction was chosen as a victim and rolled back or no deadlock found.
+
+@return id of transaction chosen as victim or 0 */
 static
-ulint
-lock_deadlock_recursive(
-/*====================*/
-	trx_t*	start,		/*!< in: recursion starting point */
-	trx_t*	trx,		/*!< in: a transaction waiting for a lock */
-	lock_t*	wait_lock,	/*!< in:  lock that is waiting to be granted */
-	ulint*	cost,		/*!< in/out: number of calculation steps thus
-				far: if this exceeds LOCK_MAX_N_STEPS_...
-				we return LOCK_EXCEED_MAX_DEPTH */
-	ulint	depth);		/*!< in: recursion depth: if this exceeds
-				LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
-				return LOCK_EXCEED_MAX_DEPTH */
+trx_id_t
+lock_deadlock_check_and_resolve(
+/*===========================*/
+	const lock_t*	lock,	/*!< in: lock the transaction is requesting */
+	const trx_t*	trx);	/*!< in: transaction */
 
 /*********************************************************************//**
 Gets the nth bit of a record lock.
@@ -420,8 +445,7 @@ lock_rec_get_nth_bit(
 	const lock_t*	lock,	/*!< in: record lock */
 	ulint		i)	/*!< in: index of the bit */
 {
-	ulint	byte_index;
-	ulint	bit_index;
+	const byte*	b;
 
 	ut_ad(lock);
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
@@ -431,62 +455,60 @@ lock_rec_get_nth_bit(
 		return(FALSE);
 	}
 
-	byte_index = i / 8;
-	bit_index = i % 8;
+	b = ((const byte*) &lock[1]) + (i / 8);
 
-	return(1 & ((const byte*) &lock[1])[byte_index] >> bit_index);
+	return(1 & *b >> (i % 8));
 }
 
-/*************************************************************************/
-
-#define lock_mutex_enter_kernel()	mutex_enter(&kernel_mutex)
-#define lock_mutex_exit_kernel()	mutex_exit(&kernel_mutex)
+/*********************************************************************//**
+Reports that a transaction id is insensible, i.e., in the future. */
+UNIV_INTERN
+void
+lock_report_trx_id_insanity(
+/*========================*/
+	trx_id_t	trx_id,		/*!< in: trx id */
+	const rec_t*	rec,		/*!< in: user record */
+	dict_index_t*	index,		/*!< in: index */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
+	trx_id_t	max_trx_id)	/*!< in: trx_sys_get_max_trx_id() */
+{
+	ut_print_timestamp(stderr);
+	fputs("  InnoDB: Error: transaction id associated with record\n",
+	      stderr);
+	rec_print_new(stderr, rec, offsets);
+	fputs("InnoDB: in ", stderr);
+	dict_index_name_print(stderr, NULL, index);
+	fprintf(stderr, "\n"
+		"InnoDB: is " TRX_ID_FMT " which is higher than the"
+		" global trx id counter " TRX_ID_FMT "!\n"
+		"InnoDB: The table is corrupt. You have to do"
+		" dump + drop + reimport.\n",
+		trx_id, max_trx_id);
+}
 
 /*********************************************************************//**
 Checks that a transaction id is sensible, i.e., not in the future.
 @return	TRUE if ok */
-UNIV_INTERN
+static
 ibool
 lock_check_trx_id_sanity(
 /*=====================*/
 	trx_id_t	trx_id,		/*!< in: trx id */
 	const rec_t*	rec,		/*!< in: user record */
 	dict_index_t*	index,		/*!< in: index */
-	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
-	ibool		has_kernel_mutex)/*!< in: TRUE if the caller owns the
-					kernel mutex */
+	const ulint*	offsets)	/*!< in: rec_get_offsets(rec, index) */
 {
-	ibool	is_ok		= TRUE;
+	ibool		is_ok;
+	trx_id_t	max_trx_id;
 
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
-	if (!has_kernel_mutex) {
-		mutex_enter(&kernel_mutex);
-	}
-
-	/* A sanity check: the trx_id in rec must be smaller than the global
-	trx id counter */
-
-	if (UNIV_UNLIKELY(trx_id >= trx_sys->max_trx_id)) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: transaction id associated"
-		      " with record\n",
-		      stderr);
-		rec_print_new(stderr, rec, offsets);
-		fputs("InnoDB: in ", stderr);
-		dict_index_name_print(stderr, NULL, index);
-		fprintf(stderr, "\n"
-			"InnoDB: is " TRX_ID_FMT " which is higher than the"
-			" global trx id counter " TRX_ID_FMT "!\n"
-			"InnoDB: The table is corrupt. You have to do"
-			" dump + drop + reimport.\n",
-			(ullint) trx_id, (ullint) trx_sys->max_trx_id);
-
-		is_ok = FALSE;
-	}
+	max_trx_id = trx_sys_get_max_trx_id();
+	is_ok = trx_id < max_trx_id;
 
-	if (!has_kernel_mutex) {
-		mutex_exit(&kernel_mutex);
+	if (UNIV_UNLIKELY(!is_ok)) {
+		lock_report_trx_id_insanity(trx_id,
+					    rec, index, offsets, max_trx_id);
 	}
 
 	return(is_ok);
@@ -513,8 +535,7 @@ lock_clust_rec_cons_read_sees(
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
 	/* NOTE that we call this function while holding the search
-	system latch. To obey the latching order we must NOT reserve the
-	kernel mutex here! */
+	system latch. */
 
 	trx_id = row_get_rec_trx_id(rec, index, offsets);
 
@@ -545,8 +566,7 @@ lock_sec_rec_cons_read_sees(
 	ut_ad(page_rec_is_user_rec(rec));
 
 	/* NOTE that we might call this function while holding the search
-	system latch. To obey the latching order we must NOT reserve the
-	kernel mutex here! */
+	system latch. */
 
 	if (recv_recovery_is_on()) {
 
@@ -567,14 +587,39 @@ lock_sys_create(
 /*============*/
 	ulint	n_cells)	/*!< in: number of slots in lock hash table */
 {
-	lock_sys = mem_alloc(sizeof(lock_sys_t));
+	ulint	lock_sys_sz;
 
-	lock_sys->rec_hash = hash_create(n_cells);
+	srv_n_lock_wait_count = 0;
+	srv_n_lock_wait_time = 0;
+	srv_n_lock_max_wait_time = 0;
+	srv_lock_timeout_active = FALSE;
+	srv_n_lock_wait_current_count = 0;
+
+	lock_sys_sz = sizeof(*lock_sys)
+		+ OS_THREAD_MAX_N * sizeof(srv_slot_t);
+
+	lock_sys = static_cast<lock_sys_t*>(mem_zalloc(lock_sys_sz));
+
+	lock_stack = static_cast<lock_stack_t*>(
+		mem_zalloc(sizeof(*lock_stack) * LOCK_STACK_SIZE));
+
+	void*	ptr = &lock_sys[1];
+
+	lock_sys->waiting_threads = static_cast<srv_slot_t*>(ptr);
+
+	lock_sys->last_slot = lock_sys->waiting_threads;
+
+	mutex_create(lock_sys_mutex_key, &lock_sys->mutex, SYNC_LOCK_SYS);
 
-	/* hash_create_mutexes(lock_sys->rec_hash, 2, SYNC_REC_LOCK); */
+	mutex_create(lock_sys_wait_mutex_key,
+		     &lock_sys->wait_mutex, SYNC_LOCK_WAIT_SYS);
+
+	lock_sys->rec_hash = hash_create(n_cells);
 
 	lock_latest_err_file = os_file_create_tmpfile();
 	ut_a(lock_latest_err_file);
+
+	srv_timeout_event = os_event_create(NULL);
 }
 
 /*********************************************************************//**
@@ -590,8 +635,15 @@ lock_sys_close(void)
 	}
 
 	hash_table_free(lock_sys->rec_hash);
+
+	mutex_free(&lock_sys->mutex);
+	mutex_free(&lock_sys->wait_mutex);
+
+	mem_free(lock_stack);
 	mem_free(lock_sys);
+
 	lock_sys = NULL;
+	lock_stack = NULL;
 }
 
 /*********************************************************************//**
@@ -602,7 +654,7 @@ ulint
 lock_get_size(void)
 /*===============*/
 {
-	return((ulint)sizeof(lock_t));
+	return((ulint) sizeof(lock_t));
 }
 
 /*********************************************************************//**
@@ -616,26 +668,21 @@ lock_get_mode(
 {
 	ut_ad(lock);
 
-	return(lock->type_mode & LOCK_MODE_MASK);
+	return(static_cast<enum lock_mode>(lock->type_mode & LOCK_MODE_MASK));
 }
 
 /*********************************************************************//**
 Gets the wait flag of a lock.
-@return	TRUE if waiting */
+@return	LOCK_WAIT if waiting, 0 if not */
 UNIV_INLINE
-ibool
+ulint
 lock_get_wait(
 /*==========*/
 	const lock_t*	lock)	/*!< in: lock */
 {
 	ut_ad(lock);
 
-	if (UNIV_UNLIKELY(lock->type_mode & LOCK_WAIT)) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(lock->type_mode & LOCK_WAIT);
 }
 
 /*********************************************************************//**
@@ -656,11 +703,21 @@ lock_get_src_table(
 	dict_table_t*	src;
 	lock_t*		lock;
 
+	ut_ad(!lock_mutex_own());
+
 	src = NULL;
 	*mode = LOCK_NONE;
 
-	for (lock = UT_LIST_GET_FIRST(trx->trx_locks);
-	     lock;
+	/* The trx mutex protects the trx_locks for our purposes.
+	Other transactions could want to convert one of our implicit
+	record locks to an explicit one. For that, they would need our
+	trx mutex. Waiting locks can be removed while only holding
+	lock_sys->mutex, but this is a running transaction and cannot
+	thus be holding any waiting locks. */
+	trx_mutex_enter(trx);
+
+	for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+	     lock != NULL;
 	     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
 		lock_table_t*	tab_lock;
 		enum lock_mode	lock_mode;
@@ -679,12 +736,14 @@ lock_get_src_table(
 			    || UT_LIST_GET_FIRST(src->locks) != lock) {
 				/* We only support the case when
 				there is only one lock on this table. */
-				return(NULL);
+				src = NULL;
+				goto func_exit;
 			}
 		} else if (src != tab_lock->table) {
 			/* The transaction is locking more than
 			two tables (src and dest): abort */
-			return(NULL);
+			src = NULL;
+			goto func_exit;
 		}
 
 		/* Check that the source table is locked by
@@ -693,7 +752,8 @@ lock_get_src_table(
 		if (lock_mode == LOCK_IX || lock_mode == LOCK_IS) {
 			if (*mode != LOCK_NONE && *mode != lock_mode) {
 				/* There are multiple locks on src. */
-				return(NULL);
+				src = NULL;
+				goto func_exit;
 			}
 			*mode = lock_mode;
 		}
@@ -704,6 +764,8 @@ lock_get_src_table(
 		src = dest;
 	}
 
+func_exit:
+	trx_mutex_exit(trx);
 	return(src);
 }
 
@@ -717,8 +779,8 @@ UNIV_INTERN
 ibool
 lock_is_table_exclusive(
 /*====================*/
-	dict_table_t*	table,	/*!< in: table */
-	trx_t*		trx)	/*!< in: transaction */
+	const dict_table_t*	table,	/*!< in: table */
+	const trx_t*		trx)	/*!< in: transaction */
 {
 	const lock_t*	lock;
 	ibool		ok	= FALSE;
@@ -726,10 +788,10 @@ lock_is_table_exclusive(
 	ut_ad(table);
 	ut_ad(trx);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	for (lock = UT_LIST_GET_FIRST(table->locks);
-	     lock;
+	     lock != NULL;
 	     lock = UT_LIST_GET_NEXT(locks, &lock->un_member.tab_lock)) {
 		if (lock->trx != trx) {
 			/* A lock on the table is held
@@ -759,7 +821,7 @@ not_ok:
 	}
 
 func_exit:
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 
 	return(ok);
 }
@@ -771,12 +833,15 @@ void
 lock_set_lock_and_trx_wait(
 /*=======================*/
 	lock_t*	lock,	/*!< in: lock */
-	trx_t*	trx)	/*!< in: trx */
+	trx_t*	trx)	/*!< in/out: trx */
 {
 	ut_ad(lock);
-	ut_ad(trx->wait_lock == NULL);
+	ut_ad(lock->trx == trx);
+	ut_ad(trx->lock.wait_lock == NULL);
+	ut_ad(lock_mutex_own());
+	ut_ad(trx_mutex_own(trx));
 
-	trx->wait_lock = lock;
+	trx->lock.wait_lock = lock;
 	lock->type_mode |= LOCK_WAIT;
 }
 
@@ -787,22 +852,23 @@ UNIV_INLINE
 void
 lock_reset_lock_and_trx_wait(
 /*=========================*/
-	lock_t*	lock)	/*!< in: record lock */
+	lock_t*	lock)	/*!< in/out: record lock */
 {
-	ut_ad((lock->trx)->wait_lock == lock);
+	ut_ad(lock->trx->lock.wait_lock == lock);
 	ut_ad(lock_get_wait(lock));
+	ut_ad(lock_mutex_own());
 
 	/* Reset the back pointer in trx to this waiting lock request */
 
-	(lock->trx)->wait_lock = NULL;
+	lock->trx->lock.wait_lock = NULL;
 	lock->type_mode &= ~LOCK_WAIT;
 }
 
 /*********************************************************************//**
 Gets the gap flag of a record lock.
-@return	TRUE if gap flag set */
+@return	LOCK_GAP or 0 */
 UNIV_INLINE
-ibool
+ulint
 lock_rec_get_gap(
 /*=============*/
 	const lock_t*	lock)	/*!< in: record lock */
@@ -810,19 +876,14 @@ lock_rec_get_gap(
 	ut_ad(lock);
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
 
-	if (lock->type_mode & LOCK_GAP) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(lock->type_mode & LOCK_GAP);
 }
 
 /*********************************************************************//**
 Gets the LOCK_REC_NOT_GAP flag of a record lock.
-@return	TRUE if LOCK_REC_NOT_GAP flag set */
+@return	LOCK_REC_NOT_GAP or 0 */
 UNIV_INLINE
-ibool
+ulint
 lock_rec_get_rec_not_gap(
 /*=====================*/
 	const lock_t*	lock)	/*!< in: record lock */
@@ -830,19 +891,14 @@ lock_rec_get_rec_not_gap(
 	ut_ad(lock);
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
 
-	if (lock->type_mode & LOCK_REC_NOT_GAP) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(lock->type_mode & LOCK_REC_NOT_GAP);
 }
 
 /*********************************************************************//**
 Gets the waiting insert flag of a record lock.
-@return	TRUE if gap flag set */
+@return	LOCK_INSERT_INTENTION or 0 */
 UNIV_INLINE
-ibool
+ulint
 lock_rec_get_insert_intention(
 /*==========================*/
 	const lock_t*	lock)	/*!< in: record lock */
@@ -850,12 +906,7 @@ lock_rec_get_insert_intention(
 	ut_ad(lock);
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
 
-	if (lock->type_mode & LOCK_INSERT_INTENTION) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(lock->type_mode & LOCK_INSERT_INTENTION);
 }
 
 /*********************************************************************//**
@@ -868,12 +919,10 @@ lock_mode_stronger_or_eq(
 	enum lock_mode	mode1,	/*!< in: lock mode */
 	enum lock_mode	mode2)	/*!< in: lock mode */
 {
-	ut_ad(mode1 == LOCK_X || mode1 == LOCK_S || mode1 == LOCK_IX
-	      || mode1 == LOCK_IS || mode1 == LOCK_AUTO_INC);
-	ut_ad(mode2 == LOCK_X || mode2 == LOCK_S || mode2 == LOCK_IX
-	      || mode2 == LOCK_IS || mode2 == LOCK_AUTO_INC);
+	ut_ad((ulint) mode1 < lock_types);
+	ut_ad((ulint) mode2 < lock_types);
 
-	return((LOCK_MODE_STRONGER_OR_EQ) & LK(mode1, mode2));
+	return(lock_strength_matrix[mode1][mode2]);
 }
 
 /*********************************************************************//**
@@ -886,12 +935,10 @@ lock_mode_compatible(
 	enum lock_mode	mode1,	/*!< in: lock mode */
 	enum lock_mode	mode2)	/*!< in: lock mode */
 {
-	ut_ad(mode1 == LOCK_X || mode1 == LOCK_S || mode1 == LOCK_IX
-	      || mode1 == LOCK_IS || mode1 == LOCK_AUTO_INC);
-	ut_ad(mode2 == LOCK_X || mode2 == LOCK_S || mode2 == LOCK_IX
-	      || mode2 == LOCK_IS || mode2 == LOCK_AUTO_INC);
+	ut_ad((ulint) mode1 < lock_types);
+	ut_ad((ulint) mode2 < lock_types);
 
-	return((LOCK_MODE_COMPATIBILITY) & LK(mode1, mode2));
+	return(lock_compatibility_matrix[mode1][mode2]);
 }
 
 /*********************************************************************//**
@@ -919,7 +966,8 @@ lock_rec_has_to_wait(
 	ut_ad(lock_get_type_low(lock2) == LOCK_REC);
 
 	if (trx != lock2->trx
-	    && !lock_mode_compatible(LOCK_MODE_MASK & type_mode,
+	    && !lock_mode_compatible(static_cast<enum lock_mode>(
+			             LOCK_MODE_MASK & type_mode),
 				     lock_get_mode(lock2))) {
 
 		/* We have somewhat complex rules when gap type record locks
@@ -1107,14 +1155,14 @@ lock_rec_get_next_on_page_const(
 	ulint	space;
 	ulint	page_no;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
 
 	space = lock->un_member.rec_lock.space;
 	page_no = lock->un_member.rec_lock.page_no;
 
 	for (;;) {
-		lock = HASH_GET_NEXT(hash, lock);
+		lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock));
 
 		if (!lock) {
 
@@ -1156,46 +1204,41 @@ lock_rec_get_first_on_page_addr(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
-	lock = HASH_GET_FIRST(lock_sys->rec_hash,
-			      lock_rec_hash(space, page_no));
-	while (lock) {
-		if ((lock->un_member.rec_lock.space == space)
-		    && (lock->un_member.rec_lock.page_no == page_no)) {
+	for (lock = static_cast<lock_t*>(
+			HASH_GET_FIRST(lock_sys->rec_hash,
+				       lock_rec_hash(space, page_no)));
+	      lock != NULL;
+	      lock = static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) {
+
+		if (lock->un_member.rec_lock.space == space
+		    && lock->un_member.rec_lock.page_no == page_no) {
 
 			break;
 		}
-
-		lock = HASH_GET_NEXT(hash, lock);
 	}
 
 	return(lock);
 }
 
 /*********************************************************************//**
-Returns TRUE if there are explicit record locks on a page.
-@return	TRUE if there are explicit record locks on the page */
+Determines if there are explicit record locks on a page.
+@return	an explicit record lock on the page, or NULL if there are none */
 UNIV_INTERN
-ibool
+lock_t*
 lock_rec_expl_exist_on_page(
 /*========================*/
 	ulint	space,	/*!< in: space id */
 	ulint	page_no)/*!< in: page number */
 {
-	ibool	ret;
-
-	mutex_enter(&kernel_mutex);
-
-	if (lock_rec_get_first_on_page_addr(space, page_no)) {
-		ret = TRUE;
-	} else {
-		ret = FALSE;
-	}
+	lock_t*	lock;
 
-	mutex_exit(&kernel_mutex);
+	lock_mutex_enter();
+	lock = lock_rec_get_first_on_page_addr(space, page_no);
+	lock_mutex_exit();
 
-	return(ret);
+	return(lock);
 }
 
 /*********************************************************************//**
@@ -1213,20 +1256,20 @@ lock_rec_get_first_on_page(
 	ulint	space	= buf_block_get_space(block);
 	ulint	page_no	= buf_block_get_page_no(block);
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	hash = buf_block_get_lock_hash_val(block);
 
-	lock = HASH_GET_FIRST(lock_sys->rec_hash, hash);
+	for (lock = static_cast<lock_t*>(
+			HASH_GET_FIRST( lock_sys->rec_hash, hash));
+	     lock != NULL;
+	     lock = static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) {
 
-	while (lock) {
 		if ((lock->un_member.rec_lock.space == space)
 		    && (lock->un_member.rec_lock.page_no == page_no)) {
 
 			break;
 		}
-
-		lock = HASH_GET_NEXT(hash, lock);
 	}
 
 	return(lock);
@@ -1242,7 +1285,7 @@ lock_rec_get_next(
 	ulint	heap_no,/*!< in: heap number of the record */
 	lock_t*	lock)	/*!< in: lock */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	do {
 		ut_ad(lock_get_type_low(lock) == LOCK_REC);
@@ -1253,6 +1296,19 @@ lock_rec_get_next(
 }
 
 /*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return	next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+	ulint		heap_no,/*!< in: heap number of the record */
+	const lock_t*	lock)	/*!< in: lock */
+{
+	return(lock_rec_get_next(heap_no, (lock_t*) lock));
+}
+
+/*********************************************************************//**
 Gets the first explicit lock request on a record.
 @return	first lock, NULL if none exists */
 UNIV_INLINE
@@ -1264,7 +1320,7 @@ lock_rec_get_first(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	for (lock = lock_rec_get_first_on_page(block); lock;
 	     lock = lock_rec_get_next_on_page(lock)) {
@@ -1316,7 +1372,7 @@ lock_rec_copy(
 
 	size = sizeof(lock_t) + lock_rec_get_n_bits(lock) / 8;
 
-	return(mem_heap_dup(heap, lock, size));
+	return(static_cast<lock_t*>(mem_heap_dup(heap, lock, size)));
 }
 
 /*********************************************************************//**
@@ -1334,15 +1390,16 @@ lock_rec_get_prev(
 	ulint	page_no;
 	lock_t*	found_lock	= NULL;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
 
 	space = in_lock->un_member.rec_lock.space;
 	page_no = in_lock->un_member.rec_lock.page_no;
 
-	lock = lock_rec_get_first_on_page_addr(space, page_no);
+	for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+	     /* No op */;
+	     lock = lock_rec_get_next_on_page(lock)) {
 
-	for (;;) {
 		ut_ad(lock);
 
 		if (lock == in_lock) {
@@ -1354,46 +1411,55 @@ lock_rec_get_prev(
 
 			found_lock = lock;
 		}
-
-		lock = lock_rec_get_next_on_page(lock);
 	}
 }
 
 /*============= FUNCTIONS FOR ANALYZING TABLE LOCK QUEUE ================*/
 
 /*********************************************************************//**
-Checks if a transaction has the specified table lock, or stronger.
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
 @return	lock or NULL */
 UNIV_INLINE
-lock_t*
+const lock_t*
 lock_table_has(
 /*===========*/
-	trx_t*		trx,	/*!< in: transaction */
-	dict_table_t*	table,	/*!< in: table */
-	enum lock_mode	mode)	/*!< in: lock mode */
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_table_t*	table,	/*!< in: table */
+	enum lock_mode		mode)	/*!< in: lock mode */
 {
-	lock_t*	lock;
+	lint			i;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	if (ib_vector_is_empty(trx->lock.table_locks)) {
+		return(NULL);
+	}
 
 	/* Look for stronger locks the same trx already has on the table */
 
-	lock = UT_LIST_GET_LAST(table->locks);
+	for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+		const lock_t*	lock;
+		enum lock_mode	lock_mode;
 
-	while (lock != NULL) {
+		lock = *static_cast<const lock_t**>(
+			ib_vector_get(trx->lock.table_locks, i));
 
-		if (lock->trx == trx
-		    && lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) {
+		if (lock == NULL) {
+			continue;
+		}
+
+		lock_mode = lock_get_mode(lock);
+
+		ut_ad(trx == lock->trx);
+		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+		ut_ad(lock->un_member.tab_lock.table != NULL);
 
-			/* The same trx already has locked the table in
-			a mode stronger or equal to the mode given */
+		if (table == lock->un_member.tab_lock.table
+		    && lock_mode_stronger_or_eq(lock_mode, mode)) {
 
 			ut_ad(!lock_get_wait(lock));
 
 			return(lock);
 		}
-
-		lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
 	}
 
 	return(NULL);
@@ -1406,7 +1472,7 @@ Checks if a transaction has a GRANTED explicit lock on rec stronger or equal
 to precise_mode.
 @return	lock or NULL */
 UNIV_INLINE
-lock_t*
+const lock_t*
 lock_rec_has_expl(
 /*==============*/
 	ulint			precise_mode,/*!< in: LOCK_S or LOCK_X
@@ -1417,21 +1483,24 @@ lock_rec_has_expl(
 	const buf_block_t*	block,	/*!< in: buffer block containing
 					the record */
 	ulint			heap_no,/*!< in: heap number of the record */
-	trx_t*			trx)	/*!< in: transaction */
+	const trx_t*		trx)	/*!< in: transaction */
 {
-	lock_t*	lock;
+	const lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
 	      || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
 	ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
 
-	lock = lock_rec_get_first(block, heap_no);
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next_const(heap_no, lock)) {
 
-	while (lock) {
 		if (lock->trx == trx
-		    && lock_mode_stronger_or_eq(lock_get_mode(lock),
-						precise_mode & LOCK_MODE_MASK)
+		    && lock_mode_stronger_or_eq(
+			    lock_get_mode(lock),
+			    static_cast<enum lock_mode>(
+				    precise_mode & LOCK_MODE_MASK))
 		    && !lock_get_wait(lock)
 		    && (!lock_rec_get_rec_not_gap(lock)
 			|| (precise_mode & LOCK_REC_NOT_GAP)
@@ -1443,8 +1512,6 @@ lock_rec_has_expl(
 
 			return(lock);
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
 
 	return(NULL);
@@ -1455,7 +1522,7 @@ lock_rec_has_expl(
 Checks if some other transaction has a lock request in the queue.
 @return	lock or NULL */
 static
-lock_t*
+const lock_t*
 lock_rec_other_has_expl_req(
 /*========================*/
 	enum lock_mode		mode,	/*!< in: LOCK_S or LOCK_X */
@@ -1472,16 +1539,17 @@ lock_rec_other_has_expl_req(
 					requests by all transactions
 					are taken into account */
 {
-	lock_t*	lock;
+	const lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(mode == LOCK_X || mode == LOCK_S);
 	ut_ad(gap == 0 || gap == LOCK_GAP);
 	ut_ad(wait == 0 || wait == LOCK_WAIT);
 
-	lock = lock_rec_get_first(block, heap_no);
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next_const(heap_no, lock)) {
 
-	while (lock) {
 		if (lock->trx != trx
 		    && (gap
 			|| !(lock_rec_get_gap(lock)
@@ -1491,8 +1559,6 @@ lock_rec_other_has_expl_req(
 
 			return(lock);
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
 
 	return(NULL);
@@ -1504,7 +1570,7 @@ Checks if some other transaction has a conflicting explicit lock request
 in the queue, so that we have to wait.
 @return	lock or NULL */
 static
-lock_t*
+const lock_t*
 lock_rec_other_has_conflicting(
 /*===========================*/
 	enum lock_mode		mode,	/*!< in: LOCK_S or LOCK_X,
@@ -1514,35 +1580,21 @@ lock_rec_other_has_conflicting(
 	const buf_block_t*	block,	/*!< in: buffer block containing
 					the record */
 	ulint			heap_no,/*!< in: heap number of the record */
-	trx_t*			trx)	/*!< in: our transaction */
+	const trx_t*		trx)	/*!< in: our transaction */
 {
-	lock_t*	lock;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	lock = lock_rec_get_first(block, heap_no);
-
-	if (UNIV_LIKELY_NULL(lock)) {
-		if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+	const lock_t*		lock;
+	ibool			is_supremum;
 
-			do {
-				if (lock_rec_has_to_wait(trx, mode, lock,
-							 TRUE)) {
-					return(lock);
-				}
+	ut_ad(lock_mutex_own());
 
-				lock = lock_rec_get_next(heap_no, lock);
-			} while (lock);
-		} else {
+	is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM);
 
-			do {
-				if (lock_rec_has_to_wait(trx, mode, lock,
-							 FALSE)) {
-					return(lock);
-				}
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next_const(heap_no, lock)) {
 
-				lock = lock_rec_get_next(heap_no, lock);
-			} while (lock);
+		if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) {
+			return(lock);
 		}
 	}
 
@@ -1563,17 +1615,18 @@ lock_rec_find_similar_on_page(
 	lock_t*		lock,		/*!< in: lock_rec_get_first_on_page() */
 	const trx_t*	trx)		/*!< in: transaction */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+
+	for (/* No op */;
+	     lock != NULL;
+	     lock = lock_rec_get_next_on_page(lock)) {
 
-	while (lock != NULL) {
 		if (lock->trx == trx
 		    && lock->type_mode == type_mode
 		    && lock_rec_get_n_bits(lock) > heap_no) {
 
 			return(lock);
 		}
-
-		lock = lock_rec_get_next_on_page(lock);
 	}
 
 	return(NULL);
@@ -1582,69 +1635,80 @@ lock_rec_find_similar_on_page(
 /*********************************************************************//**
 Checks if some transaction has an implicit x-lock on a record in a secondary
 index.
-@return	transaction which has the x-lock, or NULL */
+@return	transaction id of the transaction which has the x-lock, or 0;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active(). */
 static
-trx_t*
-lock_sec_rec_some_has_impl_off_kernel(
-/*==================================*/
+trx_id_t
+lock_sec_rec_some_has_impl(
+/*=======================*/
 	const rec_t*	rec,	/*!< in: user record */
 	dict_index_t*	index,	/*!< in: secondary index */
 	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
 {
+	trx_id_t	trx_id;
+	trx_id_t	max_trx_id;
 	const page_t*	page = page_align(rec);
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(!lock_mutex_own());
+	ut_ad(!mutex_own(&trx_sys->mutex));
 	ut_ad(!dict_index_is_clust(index));
 	ut_ad(page_rec_is_user_rec(rec));
 	ut_ad(rec_offs_validate(rec, index, offsets));
 
+	max_trx_id = page_get_max_trx_id(page);
+
 	/* Some transaction may have an implicit x-lock on the record only
 	if the max trx id for the page >= min trx id for the trx list, or
 	database recovery is running. We do not write the changes of a page
 	max trx id to the log, and therefore during recovery, this value
 	for a page may be incorrect. */
 
-	if (page_get_max_trx_id(page) < trx_list_get_min_trx_id()
-	    && !recv_recovery_is_on()) {
+	if (max_trx_id < trx_rw_min_trx_id() && !recv_recovery_is_on()) {
 
-		return(NULL);
-	}
+		trx_id = 0;
 
-	/* Ok, in this case it is possible that some transaction has an
-	implicit x-lock. We have to look in the clustered index. */
+	} else if (!lock_check_trx_id_sanity(max_trx_id, rec, index, offsets)) {
 
-	if (!lock_check_trx_id_sanity(page_get_max_trx_id(page),
-				      rec, index, offsets, TRUE)) {
 		buf_page_print(page, 0, 0);
 
-		/* The page is corrupt: try to avoid a crash by returning
-		NULL */
-		return(NULL);
+		/* The page is corrupt: try to avoid a crash by returning 0 */
+		trx_id = 0;
+
+	/* In this case it is possible that some transaction has an implicit
+	x-lock. We have to look in the clustered index. */
+
+	} else {
+		trx_id = row_vers_impl_x_locked(rec, index, offsets);
 	}
 
-	return(row_vers_impl_x_locked_off_kernel(rec, index, offsets));
+	return(trx_id);
 }
 
 /*********************************************************************//**
 Return approximate number or record locks (bits set in the bitmap) for
 this transaction. Since delete-marked records may be removed, the
-record count will not be precise. */
+record count will not be precise.
+The caller must be holding lock_sys->mutex. */
 UNIV_INTERN
 ulint
 lock_number_of_rows_locked(
 /*=======================*/
-	const trx_t*	trx)	/*!< in: transaction */
+	const trx_lock_t*	trx_lock)	/*!< in: transaction locks */
 {
-	lock_t*	lock;
-	ulint   n_records = 0;
-	ulint	n_bits;
-	ulint	n_bit;
+	const lock_t*	lock;
+	ulint		n_records = 0;
+
+	ut_ad(lock_mutex_own());
 
-	lock = UT_LIST_GET_FIRST(trx->trx_locks);
+	for (lock = UT_LIST_GET_FIRST(trx_lock->trx_locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
 
-	while (lock) {
 		if (lock_get_type_low(lock) == LOCK_REC) {
-			n_bits = lock_rec_get_n_bits(lock);
+			ulint	n_bit;
+			ulint	n_bits = lock_rec_get_n_bits(lock);
 
 			for (n_bit = 0; n_bit < n_bits; n_bit++) {
 				if (lock_rec_get_nth_bit(lock, n_bit)) {
@@ -1652,11 +1716,9 @@ lock_number_of_rows_locked(
 				}
 			}
 		}
-
-		lock = UT_LIST_GET_NEXT(trx_locks, lock);
 	}
 
-	return (n_records);
+	return(n_records);
 }
 
 /*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/
@@ -1676,7 +1738,10 @@ lock_rec_create(
 					the record */
 	ulint			heap_no,/*!< in: heap number of the record */
 	dict_index_t*		index,	/*!< in: index of record */
-	trx_t*			trx)	/*!< in: transaction */
+	trx_t*			trx,	/*!< in/out: transaction */
+	ibool			caller_owns_trx_mutex)
+					/*!< in: TRUE if caller owns
+					trx mutex */
 {
 	lock_t*		lock;
 	ulint		page_no;
@@ -1685,7 +1750,12 @@ lock_rec_create(
 	ulint		n_bytes;
 	const page_t*	page;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx));
+
+	/* Non-locking autocommit read-only transactions should not set
+	any locks. */
+	assert_trx_in_list(trx);
 
 	space = buf_block_get_space(block);
 	page_no	= buf_block_get_page_no(block);
@@ -1707,9 +1777,8 @@ lock_rec_create(
 	n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN;
 	n_bytes = 1 + n_bits / 8;
 
-	lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t) + n_bytes);
-
-	UT_LIST_ADD_LAST(trx_locks, trx->trx_locks, lock);
+	lock = static_cast<lock_t*>(
+		mem_heap_alloc(trx->lock.lock_heap, sizeof(lock_t) + n_bytes));
 
 	lock->trx = trx;
 
@@ -1728,13 +1797,32 @@ lock_rec_create(
 	/* Set the bit corresponding to rec */
 	lock_rec_set_nth_bit(lock, heap_no);
 
+	index->table->n_rec_locks++;
+
+	ut_ad(index->table->n_ref_count > 0 || !index->table->can_be_evicted);
+
 	HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
 		    lock_rec_fold(space, page_no), lock);
+
+	if (!caller_owns_trx_mutex) {
+		trx_mutex_enter(trx);
+	}
+	ut_ad(trx_mutex_own(trx));
+
 	if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
 
 		lock_set_lock_and_trx_wait(lock, trx);
 	}
 
+	UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock);
+
+	if (!caller_owns_trx_mutex) {
+		trx_mutex_exit(trx);
+	}
+
+	MONITOR_INC(MONITOR_RECLOCK_CREATED);
+	MONITOR_INC(MONITOR_NUM_RECLOCK);
+
 	return(lock);
 }
 
@@ -1764,24 +1852,26 @@ lock_rec_enqueue_waiting(
 	dict_index_t*		index,	/*!< in: index of record */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	lock_t*	lock;
-	trx_t*	trx;
+	trx_t*			trx;
+	lock_t*			lock;
+	trx_id_t		victim_trx_id;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+
+	trx = thr_get_trx(thr);
+
+	ut_ad(trx_mutex_own(trx));
 
 	/* Test if there already is some other reason to suspend thread:
 	we do not enqueue a lock request if the query thread should be
 	stopped anyway */
 
-	if (UNIV_UNLIKELY(que_thr_stop(thr))) {
-
+	if (que_thr_stop(thr)) {
 		ut_error;
 
 		return(DB_QUE_THR_SUSPENDED);
 	}
 
-	trx = thr_get_trx(thr);
-
 	switch (trx_get_dict_operation(trx)) {
 	case TRX_DICT_OP_NONE:
 		break;
@@ -1799,43 +1889,60 @@ lock_rec_enqueue_waiting(
 		ut_ad(0);
 	}
 
-	/* Enqueue the lock request that will wait to be granted */
-	lock = lock_rec_create(type_mode | LOCK_WAIT,
-			       block, heap_no, index, trx);
+	/* Enqueue the lock request that will wait to be granted, note that
+	we already own the trx mutex. */
+	lock = lock_rec_create(
+		type_mode | LOCK_WAIT, block, heap_no, index, trx, TRUE);
+
+	/* Release the mutex to obey the latching order.
+	This is safe, because lock_deadlock_check_and_resolve()
+	is invoked when a lock wait is enqueued for the currently
+	running transaction. Because trx is a running transaction
+	(it is not currently suspended because of a lock wait),
+	its state can only be changed by this thread, which is
+	currently associated with the transaction. */
+
+	trx_mutex_exit(trx);
+
+	victim_trx_id = lock_deadlock_check_and_resolve(lock, trx);
+
+	trx_mutex_enter(trx);
 
-	/* Check if a deadlock occurs: if yes, remove the lock request and
-	return an error code */
+	if (victim_trx_id != 0) {
 
-	if (UNIV_UNLIKELY(lock_deadlock_occurs(lock, trx))) {
+		ut_ad(victim_trx_id == trx->id);
 
 		lock_reset_lock_and_trx_wait(lock);
 		lock_rec_reset_nth_bit(lock, heap_no);
 
 		return(DB_DEADLOCK);
-	}
 
-	/* If there was a deadlock but we chose another transaction as a
-	victim, it is possible that we already have the lock now granted! */
+	} else if (trx->lock.wait_lock == NULL) {
 
-	if (trx->wait_lock == NULL) {
+		/* If there was a deadlock but we chose another
+		transaction as a victim, it is possible that we
+		already have the lock now granted! */
 
 		return(DB_SUCCESS_LOCKED_REC);
 	}
 
-	trx->que_state = TRX_QUE_LOCK_WAIT;
-	trx->was_chosen_as_deadlock_victim = FALSE;
-	trx->wait_started = time(NULL);
+	trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+
+	trx->lock.was_chosen_as_deadlock_victim = FALSE;
+	trx->lock.wait_started = ut_time();
 
 	ut_a(que_thr_stop(thr));
 
 #ifdef UNIV_DEBUG
 	if (lock_print_waits) {
 		fprintf(stderr, "Lock wait for trx " TRX_ID_FMT " in index ",
-			(ullint) trx->id);
+			trx->id);
 		ut_print_name(stderr, trx, FALSE, index->name);
 	}
 #endif /* UNIV_DEBUG */
 
+	MONITOR_INC(MONITOR_LOCKREC_WAIT);
+
 	return(DB_LOCK_WAIT);
 }
 
@@ -1858,11 +1965,16 @@ lock_rec_add_to_queue(
 					the record */
 	ulint			heap_no,/*!< in: heap number of the record */
 	dict_index_t*		index,	/*!< in: index of record */
-	trx_t*			trx)	/*!< in: transaction */
+	trx_t*			trx,	/*!< in/out: transaction */
+	ibool			caller_owns_trx_mutex)
+					/*!< in: TRUE if caller owns the
+					transaction mutex */
 {
 	lock_t*	lock;
+	lock_t*	first_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx));
 #ifdef UNIV_DEBUG
 	switch (type_mode & LOCK_MODE_MASK) {
 	case LOCK_X:
@@ -1876,7 +1988,7 @@ lock_rec_add_to_queue(
 		enum lock_mode	mode = (type_mode & LOCK_MODE_MASK) == LOCK_S
 			? LOCK_X
 			: LOCK_S;
-		lock_t*		other_lock
+		const lock_t*	other_lock
 			= lock_rec_other_has_expl_req(mode, 0, LOCK_WAIT,
 						      block, heap_no, trx);
 		ut_a(!other_lock);
@@ -1901,16 +2013,15 @@ lock_rec_add_to_queue(
 
 	/* Look for a waiting lock request on the same record or on a gap */
 
-	lock = lock_rec_get_first_on_page(block);
+	for (first_lock = lock = lock_rec_get_first_on_page(block);
+	     lock != NULL;
+	     lock = lock_rec_get_next_on_page(lock)) {
 
-	while (lock != NULL) {
 		if (lock_get_wait(lock)
-		    && (lock_rec_get_nth_bit(lock, heap_no))) {
+		    && lock_rec_get_nth_bit(lock, heap_no)) {
 
 			goto somebody_waits;
 		}
-
-		lock = lock_rec_get_next_on_page(lock);
 	}
 
 	if (UNIV_LIKELY(!(type_mode & LOCK_WAIT))) {
@@ -1920,8 +2031,7 @@ lock_rec_add_to_queue(
 		we can just set the bit */
 
 		lock = lock_rec_find_similar_on_page(
-			type_mode, heap_no,
-			lock_rec_get_first_on_page(block), trx);
+			type_mode, heap_no, first_lock, trx);
 
 		if (lock) {
 
@@ -1932,7 +2042,9 @@ lock_rec_add_to_queue(
 	}
 
 somebody_waits:
-	return(lock_rec_create(type_mode, block, heap_no, index, trx));
+	return(lock_rec_create(
+			type_mode, block, heap_no, index, trx,
+			caller_owns_trx_mutex));
 }
 
 /** Record locking request status */
@@ -1970,10 +2082,11 @@ lock_rec_lock_fast(
 	dict_index_t*		index,	/*!< in: index of record */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	lock_t*	lock;
-	trx_t*	trx;
+	lock_t*			lock;
+	trx_t*			trx;
+	enum lock_rec_req_status status = LOCK_REC_SUCCESS;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
 	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
@@ -1990,35 +2103,35 @@ lock_rec_lock_fast(
 
 	if (lock == NULL) {
 		if (!impl) {
-			lock_rec_create(mode, block, heap_no, index, trx);
-		}
-
-		return(LOCK_REC_SUCCESS_CREATED);
-	}
-
-	if (lock_rec_get_next_on_page(lock)) {
-
-		return(LOCK_REC_FAIL);
-	}
-
-	if (lock->trx != trx
-	    || lock->type_mode != (mode | LOCK_REC)
-	    || lock_rec_get_n_bits(lock) <= heap_no) {
-
-		return(LOCK_REC_FAIL);
-	}
+			/* Note that we don't own the trx mutex. */
+			lock = lock_rec_create(
+				mode, block, heap_no, index, trx, FALSE);
 
-	if (!impl) {
-		/* If the nth bit of the record lock is already set then we
-		do not set a new lock bit, otherwise we do set */
-
-		if (!lock_rec_get_nth_bit(lock, heap_no)) {
-			lock_rec_set_nth_bit(lock, heap_no);
-			return(LOCK_REC_SUCCESS_CREATED);
 		}
+		status = LOCK_REC_SUCCESS_CREATED;
+	} else {
+		trx_mutex_enter(trx);
+
+		if (lock_rec_get_next_on_page(lock)
+		     || lock->trx != trx
+		     || lock->type_mode != (mode | LOCK_REC)
+		     || lock_rec_get_n_bits(lock) <= heap_no) {
+
+			status = LOCK_REC_FAIL;
+		} else if (!impl) {
+			/* If the nth bit of the record lock is already set
+			then we do not set a new lock bit, otherwise we do
+			set */
+			if (!lock_rec_get_nth_bit(lock, heap_no)) {
+				lock_rec_set_nth_bit(lock, heap_no);
+				status = LOCK_REC_SUCCESS_CREATED;
+			}
+		}
+
+		trx_mutex_exit(trx);
 	}
 
-	return(LOCK_REC_SUCCESS);
+	return(status);
 }
 
 /*********************************************************************//**
@@ -2045,9 +2158,10 @@ lock_rec_lock_slow(
 	dict_index_t*		index,	/*!< in: index of record */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	trx_t*	trx;
+	trx_t*			trx;
+	enum db_err		err = DB_SUCCESS;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
 	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
@@ -2060,27 +2174,37 @@ lock_rec_lock_slow(
 
 	trx = thr_get_trx(thr);
 
+	trx_mutex_enter(trx);
+
 	if (lock_rec_has_expl(mode, block, heap_no, trx)) {
 		/* The trx already has a strong enough lock on rec: do
 		nothing */
 
-	} else if (lock_rec_other_has_conflicting(mode, block, heap_no, trx)) {
+	} else if (lock_rec_other_has_conflicting(
+			static_cast<enum lock_mode>(mode),
+			block, heap_no, trx)) {
 
-		/* If another transaction has a non-gap conflicting request in
-		the queue, as this transaction does not have a lock strong
-		enough already granted on the record, we have to wait. */
+		/* If another transaction has a non-gap conflicting
+		request in the queue, as this transaction does not
+		have a lock strong enough already granted on the
+		record, we have to wait. */
+
+		err = lock_rec_enqueue_waiting(
+			mode, block, heap_no, index, thr);
 
-		return(lock_rec_enqueue_waiting(mode, block, heap_no,
-						index, thr));
 	} else if (!impl) {
-		/* Set the requested lock on the record */
+		/* Set the requested lock on the record, note that
+		we already own the transaction mutex. */
 
-		lock_rec_add_to_queue(LOCK_REC | mode, block,
-				      heap_no, index, trx);
-		return(DB_SUCCESS_LOCKED_REC);
+		lock_rec_add_to_queue(
+			LOCK_REC | mode, block, heap_no, index, trx, TRUE);
+
+		err = DB_SUCCESS_LOCKED_REC;
 	}
 
-	return(DB_SUCCESS);
+	trx_mutex_exit(trx);
+
+	return(err);
 }
 
 /*********************************************************************//**
@@ -2108,7 +2232,7 @@ lock_rec_lock(
 	dict_index_t*		index,	/*!< in: index of record */
 	que_thr_t*		thr)	/*!< in: query thread */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
 	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
 	ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
@@ -2137,19 +2261,21 @@ lock_rec_lock(
 
 /*********************************************************************//**
 Checks if a waiting record lock request still has to wait in a queue.
-@return	TRUE if still has to wait */
+@return	lock that is causing the wait */
 static
-ibool
+const lock_t*
 lock_rec_has_to_wait_in_queue(
 /*==========================*/
-	lock_t*	wait_lock)	/*!< in: waiting record lock */
+	const lock_t*	wait_lock)	/*!< in: waiting record lock */
 {
-	lock_t*	lock;
-	ulint	space;
-	ulint	page_no;
-	ulint	heap_no;
+	const lock_t*	lock;
+	ulint		space;
+	ulint		page_no;
+	ulint		heap_no;
+	ulint		bit_mask;
+	ulint		bit_offset;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_wait(wait_lock));
 	ut_ad(lock_get_type_low(wait_lock) == LOCK_REC);
 
@@ -2157,54 +2283,59 @@ lock_rec_has_to_wait_in_queue(
 	page_no = wait_lock->un_member.rec_lock.page_no;
 	heap_no = lock_rec_find_set_bit(wait_lock);
 
-	lock = lock_rec_get_first_on_page_addr(space, page_no);
+	bit_offset = heap_no / 8;
+	bit_mask = 1 << (heap_no % 8);
+
+	for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+	     lock != wait_lock;
+	     lock = lock_rec_get_next_on_page_const(lock)) {
 
-	while (lock != wait_lock) {
+		const byte*	p = (const byte*) &lock[1];
 
-		if (lock_rec_get_nth_bit(lock, heap_no)
+		if (heap_no < lock_rec_get_n_bits(lock)
+		    && (p[bit_offset] & bit_mask)
 		    && lock_has_to_wait(wait_lock, lock)) {
 
-			return(TRUE);
+			return(lock);
 		}
-
-		lock = lock_rec_get_next_on_page(lock);
 	}
 
-	return(FALSE);
+	return(NULL);
 }
 
 /*************************************************************//**
-Grants a lock to a waiting lock request and releases the waiting
-transaction. */
+Grants a lock to a waiting lock request and releases the waiting transaction.
+The caller must hold lock_sys->mutex but not lock->trx->mutex. */
 static
 void
 lock_grant(
 /*=======*/
 	lock_t*	lock)	/*!< in/out: waiting lock request */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	lock_reset_lock_and_trx_wait(lock);
 
+	trx_mutex_enter(lock->trx);
+
 	if (lock_get_mode(lock) == LOCK_AUTO_INC) {
-		trx_t*		trx = lock->trx;
 		dict_table_t*	table = lock->un_member.tab_lock.table;
 
-		if (table->autoinc_trx == trx) {
+		if (UNIV_UNLIKELY(table->autoinc_trx == lock->trx)) {
 			fprintf(stderr,
 				"InnoDB: Error: trx already had"
 				" an AUTO-INC lock!\n");
 		} else {
-			table->autoinc_trx = trx;
+			table->autoinc_trx = lock->trx;
 
-			ib_vector_push(trx->autoinc_locks, lock);
+			ib_vector_push(lock->trx->autoinc_locks, &lock);
 		}
 	}
 
 #ifdef UNIV_DEBUG
 	if (lock_print_waits) {
 		fprintf(stderr, "Lock wait for trx " TRX_ID_FMT " ends\n",
-			(ullint) lock->trx->id);
+			lock->trx->id);
 	}
 #endif /* UNIV_DEBUG */
 
@@ -2213,9 +2344,17 @@ lock_grant(
 	TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait
 	for it */
 
-	if (lock->trx->que_state == TRX_QUE_LOCK_WAIT) {
-		trx_end_lock_wait(lock->trx);
+	if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+		que_thr_t*	thr;
+
+		thr = que_thr_end_lock_wait(lock->trx);
+
+		if (thr != NULL) {
+			lock_wait_release_thread_if_suspended(thr);
+		}
 	}
+
+	trx_mutex_exit(lock->trx);
 }
 
 /*************************************************************//**
@@ -2228,7 +2367,9 @@ lock_rec_cancel(
 /*============*/
 	lock_t*	lock)	/*!< in: waiting record lock request */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	que_thr_t*	thr;
+
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(lock) == LOCK_REC);
 
 	/* Reset the bit (there can be only one set bit) in the lock bitmap */
@@ -2240,7 +2381,15 @@ lock_rec_cancel(
 
 	/* The following function releases the trx from lock wait */
 
-	trx_end_lock_wait(lock->trx);
+	trx_mutex_enter(lock->trx);
+
+	thr = que_thr_end_lock_wait(lock->trx);
+
+	if (thr != NULL) {
+		lock_wait_release_thread_if_suspended(thr);
+	}
+
+	trx_mutex_exit(lock->trx);
 }
 
 /*************************************************************//**
@@ -2251,43 +2400,52 @@ static
 void
 lock_rec_dequeue_from_page(
 /*=======================*/
-	lock_t*	in_lock)/*!< in: record lock object: all record locks which
-			are contained in this lock object are removed;
-			transactions waiting behind will get their lock
-			requests granted, if they are now qualified to it */
+	lock_t*		in_lock)	/*!< in: record lock object: all
+					record locks which are contained in
+					this lock object are removed;
+					transactions waiting behind will
+					get their lock requests granted,
+					if they are now qualified to it */
 {
-	ulint	space;
-	ulint	page_no;
-	lock_t*	lock;
-	trx_t*	trx;
+	ulint		space;
+	ulint		page_no;
+	lock_t*		lock;
+	trx_lock_t*	trx_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+	/* We may or may not be holding in_lock->trx->mutex here. */
 
-	trx = in_lock->trx;
+	trx_lock = &in_lock->trx->lock;
 
 	space = in_lock->un_member.rec_lock.space;
 	page_no = in_lock->un_member.rec_lock.page_no;
 
+	in_lock->index->table->n_rec_locks--;
+
 	HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
 		    lock_rec_fold(space, page_no), in_lock);
 
-	UT_LIST_REMOVE(trx_locks, trx->trx_locks, in_lock);
+	UT_LIST_REMOVE(trx_locks, trx_lock->trx_locks, in_lock);
+
+	MONITOR_INC(MONITOR_RECLOCK_REMOVED);
+	MONITOR_DEC(MONITOR_NUM_RECLOCK);
 
 	/* Check if waiting locks in the queue can now be granted: grant
-	locks if there are no conflicting locks ahead. */
+	locks if there are no conflicting locks ahead. Stop at the first
+	X lock that is waiting or has been granted. */
 
-	lock = lock_rec_get_first_on_page_addr(space, page_no);
+	for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next_on_page(lock)) {
 
-	while (lock != NULL) {
 		if (lock_get_wait(lock)
 		    && !lock_rec_has_to_wait_in_queue(lock)) {
 
 			/* Grant the lock */
+			ut_ad(lock->trx != in_lock->trx);
 			lock_grant(lock);
 		}
-
-		lock = lock_rec_get_next_on_page(lock);
 	}
 }
 
@@ -2297,25 +2455,31 @@ static
 void
 lock_rec_discard(
 /*=============*/
-	lock_t*	in_lock)/*!< in: record lock object: all record locks which
-			are contained in this lock object are removed */
+	lock_t*		in_lock)	/*!< in: record lock object: all
+					record locks which are contained
+					in this lock object are removed */
 {
-	ulint	space;
-	ulint	page_no;
-	trx_t*	trx;
+	ulint		space;
+	ulint		page_no;
+	trx_lock_t*	trx_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
 
-	trx = in_lock->trx;
+	trx_lock = &in_lock->trx->lock;
 
 	space = in_lock->un_member.rec_lock.space;
 	page_no = in_lock->un_member.rec_lock.page_no;
 
+	in_lock->index->table->n_rec_locks--;
+
 	HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
 		    lock_rec_fold(space, page_no), in_lock);
 
-	UT_LIST_REMOVE(trx_locks, trx->trx_locks, in_lock);
+	UT_LIST_REMOVE(trx_locks, trx_lock->trx_locks, in_lock);
+
+	MONITOR_INC(MONITOR_RECLOCK_REMOVED);
+	MONITOR_DEC(MONITOR_NUM_RECLOCK);
 }
 
 /*************************************************************//**
@@ -2333,7 +2497,7 @@ lock_rec_free_all_from_discard_page(
 	lock_t*	lock;
 	lock_t*	next_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	space = buf_block_get_space(block);
 	page_no = buf_block_get_page_no(block);
@@ -2367,18 +2531,17 @@ lock_rec_reset_and_release_wait(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
-	lock = lock_rec_get_first(block, heap_no);
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
 
-	while (lock != NULL) {
 		if (lock_get_wait(lock)) {
 			lock_rec_cancel(lock);
 		} else {
 			lock_rec_reset_nth_bit(lock, heap_no);
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
 }
 
@@ -2404,9 +2567,7 @@ lock_rec_inherit_to_gap(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
-
-	lock = lock_rec_get_first(block, heap_no);
+	ut_ad(lock_mutex_own());
 
 	/* If srv_locks_unsafe_for_binlog is TRUE or session is using
 	READ COMMITTED isolation level, we do not want locks set
@@ -2414,20 +2575,21 @@ lock_rec_inherit_to_gap(
 	DO want S-locks set by a consistency constraint to be inherited also
 	then. */
 
-	while (lock != NULL) {
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
+
 		if (!lock_rec_get_insert_intention(lock)
 		    && !((srv_locks_unsafe_for_binlog
 			  || lock->trx->isolation_level
 			  <= TRX_ISO_READ_COMMITTED)
 			 && lock_get_mode(lock) == LOCK_X)) {
 
-			lock_rec_add_to_queue(LOCK_REC | LOCK_GAP
-					      | lock_get_mode(lock),
-					      heir_block, heir_heap_no,
-					      lock->index, lock->trx);
+			lock_rec_add_to_queue(
+				LOCK_REC | LOCK_GAP | lock_get_mode(lock),
+				heir_block, heir_heap_no, lock->index,
+				lock->trx, FALSE);
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
 }
 
@@ -2449,23 +2611,24 @@ lock_rec_inherit_to_gap_if_gap_lock(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	lock_mutex_enter();
 
-	lock = lock_rec_get_first(block, heap_no);
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
 
-	while (lock != NULL) {
 		if (!lock_rec_get_insert_intention(lock)
 		    && (heap_no == PAGE_HEAP_NO_SUPREMUM
 			|| !lock_rec_get_rec_not_gap(lock))) {
 
-			lock_rec_add_to_queue(LOCK_REC | LOCK_GAP
-					      | lock_get_mode(lock),
-					      block, heir_heap_no,
-					      lock->index, lock->trx);
+			lock_rec_add_to_queue(
+				LOCK_REC | LOCK_GAP | lock_get_mode(lock),
+				block, heir_heap_no, lock->index,
+				lock->trx, FALSE);
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
+
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -2488,13 +2651,14 @@ lock_rec_move(
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
-
-	lock = lock_rec_get_first(donator, donator_heap_no);
+	ut_ad(lock_mutex_own());
 
 	ut_ad(lock_rec_get_first(receiver, receiver_heap_no) == NULL);
 
-	while (lock != NULL) {
+	for (lock = lock_rec_get_first(donator, donator_heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(donator_heap_no, lock)) {
+
 		const ulint	type_mode = lock->type_mode;
 
 		lock_rec_reset_nth_bit(lock, donator_heap_no);
@@ -2506,9 +2670,9 @@ lock_rec_move(
 		/* Note that we FIRST reset the bit, and then set the lock:
 		the function works also if donator == receiver */
 
-		lock_rec_add_to_queue(type_mode, receiver, receiver_heap_no,
-				      lock->index, lock->trx);
-		lock = lock_rec_get_next(donator_heap_no, lock);
+		lock_rec_add_to_queue(
+			type_mode, receiver, receiver_heap_no,
+			lock->index, lock->trx, FALSE);
 	}
 
 	ut_ad(lock_rec_get_first(donator, donator_heap_no) == NULL);
@@ -2533,12 +2697,12 @@ lock_move_reorganize_page(
 	mem_heap_t*	heap		= NULL;
 	ulint		comp;
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	lock = lock_rec_get_first_on_page(block);
 
 	if (lock == NULL) {
-		lock_mutex_exit_kernel();
+		lock_mutex_exit();
 
 		return;
 	}
@@ -2561,6 +2725,7 @@ lock_move_reorganize_page(
 		lock_rec_bitmap_reset(lock);
 
 		if (lock_get_wait(lock)) {
+
 			lock_reset_lock_and_trx_wait(lock);
 		}
 
@@ -2613,9 +2778,9 @@ lock_move_reorganize_page(
 				/* NOTE that the old lock bitmap could be too
 				small for the new heap number! */
 
-				lock_rec_add_to_queue(lock->type_mode, block,
-						      new_heap_no,
-						      lock->index, lock->trx);
+				lock_rec_add_to_queue(
+					lock->type_mode, block, new_heap_no,
+					lock->index, lock->trx, FALSE);
 
 				/* if (new_heap_no == PAGE_HEAP_NO_SUPREMUM
 				&& lock_get_wait(lock)) {
@@ -2652,7 +2817,7 @@ lock_move_reorganize_page(
 #endif /* UNIV_DEBUG */
 	}
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 
 	mem_heap_free(heap);
 
@@ -2676,7 +2841,7 @@ lock_move_rec_list_end(
 	lock_t*		lock;
 	const ulint	comp	= page_rec_is_comp(rec);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Note: when we move locks from record to record, waiting locks
 	and possible granted gap type locks behind them are enqueued in
@@ -2732,9 +2897,9 @@ lock_move_rec_list_end(
 						page_cur_get_rec(&cur2));
 				}
 
-				lock_rec_add_to_queue(type_mode,
-						      new_block, heap_no,
-						      lock->index, lock->trx);
+				lock_rec_add_to_queue(
+					type_mode, new_block, heap_no,
+					lock->index, lock->trx, FALSE);
 			}
 
 			page_cur_move_to_next(&cur1);
@@ -2742,7 +2907,7 @@ lock_move_rec_list_end(
 		}
 	}
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 
 #ifdef UNIV_DEBUG_LOCK_VALIDATE
 	ut_ad(lock_rec_validate_page(block));
@@ -2757,7 +2922,8 @@ UNIV_INTERN
 void
 lock_move_rec_list_start(
 /*=====================*/
-	const buf_block_t*	new_block,	/*!< in: index page to move to */
+	const buf_block_t*	new_block,	/*!< in: index page to
+						move to */
 	const buf_block_t*	block,		/*!< in: index page */
 	const rec_t*		rec,		/*!< in: record on page:
 						this is the first
@@ -2774,7 +2940,7 @@ lock_move_rec_list_start(
 	ut_ad(block->frame == page_align(rec));
 	ut_ad(new_block->frame == page_align(old_end));
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	for (lock = lock_rec_get_first_on_page(block); lock;
 	     lock = lock_rec_get_next_on_page(lock)) {
@@ -2822,9 +2988,9 @@ lock_move_rec_list_start(
 						page_cur_get_rec(&cur2));
 				}
 
-				lock_rec_add_to_queue(type_mode,
-						      new_block, heap_no,
-						      lock->index, lock->trx);
+				lock_rec_add_to_queue(
+					type_mode, new_block, heap_no,
+					lock->index, lock->trx, FALSE);
 			}
 
 			page_cur_move_to_next(&cur1);
@@ -2851,7 +3017,7 @@ lock_move_rec_list_start(
 #endif /* UNIV_DEBUG */
 	}
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 
 #ifdef UNIV_DEBUG_LOCK_VALIDATE
 	ut_ad(lock_rec_validate_page(block));
@@ -2869,7 +3035,7 @@ lock_update_split_right(
 {
 	ulint	heap_no = lock_get_min_heap_no(right_block);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Move the locks on the supremum of the left page to the supremum
 	of the right page */
@@ -2883,7 +3049,7 @@ lock_update_split_right(
 	lock_rec_inherit_to_gap(left_block, right_block,
 				PAGE_HEAP_NO_SUPREMUM, heap_no);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -2902,7 +3068,7 @@ lock_update_merge_right(
 						page which will be
 						discarded */
 {
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Inherit the locks from the supremum of the left page to the
 	original successor of infimum on the right page, to which the left
@@ -2920,7 +3086,7 @@ lock_update_merge_right(
 
 	lock_rec_free_all_from_discard_page(left_block);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -2937,14 +3103,14 @@ lock_update_root_raise(
 	const buf_block_t*	block,	/*!< in: index page to which copied */
 	const buf_block_t*	root)	/*!< in: root page */
 {
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Move the locks on the supremum of the root to the supremum
 	of block */
 
 	lock_rec_move(block, root,
 		      PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -2959,7 +3125,7 @@ lock_update_copy_and_discard(
 	const buf_block_t*	block)		/*!< in: index page;
 						NOT the root! */
 {
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Move the locks on the supremum of the old page to the supremum
 	of new_page */
@@ -2968,7 +3134,7 @@ lock_update_copy_and_discard(
 		      PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
 	lock_rec_free_all_from_discard_page(block);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -2982,7 +3148,7 @@ lock_update_split_left(
 {
 	ulint	heap_no = lock_get_min_heap_no(right_block);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Inherit the locks to the supremum of the left page from the
 	successor of the infimum on the right page */
@@ -2990,7 +3156,7 @@ lock_update_split_left(
 	lock_rec_inherit_to_gap(left_block, right_block,
 				PAGE_HEAP_NO_SUPREMUM, heap_no);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -3011,7 +3177,7 @@ lock_update_merge_left(
 
 	ut_ad(left_block->frame == page_align(orig_pred));
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	left_next_rec = page_rec_get_next_const(orig_pred);
 
@@ -3039,7 +3205,7 @@ lock_update_merge_left(
 
 	lock_rec_free_all_from_discard_page(right_block);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -3060,13 +3226,13 @@ lock_rec_reset_and_inherit_gap_locks(
 	ulint			heap_no)	/*!< in: heap_no of the
 						donating record */
 {
-	mutex_enter(&kernel_mutex);
+	lock_mutex_enter();
 
 	lock_rec_reset_and_release_wait(heir_block, heir_heap_no);
 
 	lock_rec_inherit_to_gap(heir_block, block, heir_heap_no, heap_no);
 
-	mutex_exit(&kernel_mutex);
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -3086,12 +3252,12 @@ lock_update_discard(
 	const rec_t*	rec;
 	ulint		heap_no;
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	if (!lock_rec_get_first_on_page(block)) {
 		/* No locks exist on page, nothing to do */
 
-		lock_mutex_exit_kernel();
+		lock_mutex_exit();
 
 		return;
 	}
@@ -3129,7 +3295,7 @@ lock_update_discard(
 
 	lock_rec_free_all_from_discard_page(block);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*************************************************************//**
@@ -3159,10 +3325,8 @@ lock_update_insert(
 			page_rec_get_next_low(rec, FALSE));
 	}
 
-	lock_mutex_enter_kernel();
-	lock_rec_inherit_to_gap_if_gap_lock(block,
-					    receiver_heap_no, donator_heap_no);
-	lock_mutex_exit_kernel();
+	lock_rec_inherit_to_gap_if_gap_lock(
+		block, receiver_heap_no, donator_heap_no);
 }
 
 /*************************************************************//**
@@ -3192,7 +3356,7 @@ lock_update_delete(
 								       FALSE));
 	}
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	/* Let the next record inherit the locks from rec, in gap mode */
 
@@ -3202,7 +3366,7 @@ lock_update_delete(
 
 	lock_rec_reset_and_release_wait(block, heap_no);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*********************************************************************//**
@@ -3227,11 +3391,11 @@ lock_rec_store_on_page_infimum(
 
 	ut_ad(block->frame == page_align(rec));
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	lock_rec_move(block, block, PAGE_HEAP_NO_INFIMUM, heap_no);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*********************************************************************//**
@@ -3252,322 +3416,580 @@ lock_rec_restore_from_page_infimum(
 {
 	ulint	heap_no = page_rec_get_heap_no(rec);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	lock_rec_move(block, donator, heap_no, PAGE_HEAP_NO_INFIMUM);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 }
 
 /*=========== DEADLOCK CHECKING ======================================*/
 
-/********************************************************************//**
-Checks if a lock request results in a deadlock.
-@return TRUE if a deadlock was detected and we chose trx as a victim;
-FALSE if no deadlock, or there was a deadlock, but we chose other
-transaction(s) as victim(s) */
-static
-ibool
-lock_deadlock_occurs(
-/*=================*/
-	lock_t*	lock,	/*!< in: lock the transaction is requesting */
-	trx_t*	trx)	/*!< in: transaction */
+/*********************************************************************//**
+rewind(3) the file used for storing the latest detected deadlock and
+print a heading message to stderr if printing of all deadlocks to stderr
+is enabled. */
+UNIV_INLINE
+void
+lock_deadlock_start_print()
+/*=======================*/
 {
-	trx_t*		mark_trx;
-	ulint		ret;
-	ulint		cost	= 0;
+	ut_ad(lock_mutex_own());
 
-	ut_ad(trx);
-	ut_ad(lock);
-	ut_ad(mutex_own(&kernel_mutex));
-retry:
-	/* We check that adding this trx to the waits-for graph
-	does not produce a cycle. First mark all active transactions
-	with 0: */
+	rewind(lock_latest_err_file);
+	ut_print_timestamp(lock_latest_err_file);
+
+	if (srv_print_all_deadlocks) {
+		fprintf(stderr, "InnoDB: transactions deadlock detected, "
+			"dumping detailed information.\n");
+		ut_print_timestamp(stderr);
+	}
+}
 
-	mark_trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+/*********************************************************************//**
+Print a message to the deadlock file and possibly to stderr. */
+UNIV_INLINE
+void
+lock_deadlock_fputs(
+/*================*/
+	const char*	msg)	/*!< in: message to print */
+{
+	fputs(msg, lock_latest_err_file);
+
+	if (srv_print_all_deadlocks) {
+		fputs(msg, stderr);
+	}
+}
+
+/*********************************************************************//**
+Print transaction data to the deadlock file and possibly to stderr. */
+UNIV_INLINE
+void
+lock_deadlock_trx_print(
+/*====================*/
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len)	/*!< in: max query length to print,
+					or 0 to use the default max length */
+{
+	ulint	n_lock_rec;
+	ulint	n_lock_struct;
+	ulint	heap_size;
+
+	ut_ad(lock_mutex_own());
+
+	n_lock_rec = lock_number_of_rows_locked(&trx->lock);
+	n_lock_struct = UT_LIST_GET_LEN(trx->lock.trx_locks);
+	heap_size = mem_heap_get_size(trx->lock.lock_heap);
 
-	while (mark_trx) {
-		mark_trx->deadlock_mark = 0;
-		mark_trx = UT_LIST_GET_NEXT(trx_list, mark_trx);
+	mutex_enter(&trx_sys->mutex);
+
+	trx_print_low(lock_latest_err_file, trx, max_query_len,
+		      n_lock_rec, n_lock_struct, heap_size);
+
+	if (srv_print_all_deadlocks) {
+		trx_print_low(stderr, trx, max_query_len,
+			      n_lock_rec, n_lock_struct, heap_size);
 	}
 
-	ret = lock_deadlock_recursive(trx, trx, lock, &cost, 0);
+	mutex_exit(&trx_sys->mutex);
+}
+
+/*********************************************************************//**
+Print lock data to the deadlock file and possibly to stderr. */
+UNIV_INLINE
+void
+lock_deadlock_lock_print(
+/*=====================*/
+	const lock_t*	lock)	/*!< in: record or table type lock */
+{
+	ut_ad(lock_mutex_own());
+
+	if (lock_get_type_low(lock) == LOCK_REC) {
+		lock_rec_print(lock_latest_err_file, lock);
 
-	switch (ret) {
-	case LOCK_VICTIM_IS_OTHER:
-		/* We chose some other trx as a victim: retry if there still
-		is a deadlock */
-		goto retry;
+		if (srv_print_all_deadlocks) {
+			lock_rec_print(stderr, lock);
+		}
+	} else {
+		lock_table_print(lock_latest_err_file, lock);
 
-	case LOCK_EXCEED_MAX_DEPTH:
-		/* If the lock search exceeds the max step
-		or the max depth, the current trx will be
-		the victim. Print its information. */
-		rewind(lock_latest_err_file);
-		ut_print_timestamp(lock_latest_err_file);
+		if (srv_print_all_deadlocks) {
+			lock_table_print(stderr, lock);
+		}
+	}
+}
 
-		fputs("TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
-		      " WAITS-FOR GRAPH, WE WILL ROLL BACK"
-		      " FOLLOWING TRANSACTION \n",
-		      lock_latest_err_file);
+/** Used in deadlock tracking. Protected by lock_sys->mutex. */
+static ib_uint64_t	lock_mark_counter = 0;
 
-		fputs("\n*** TRANSACTION:\n", lock_latest_err_file);
-		      trx_print(lock_latest_err_file, trx, 3000);
+/** Check if the search is too deep. */
+#define lock_deadlock_too_deep(c)				\
+	(c->depth > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK		\
+	 || c->cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK)
 
-		fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n",
-		      lock_latest_err_file);
+/********************************************************************//**
+Get the next lock in the queue that is owned by a transaction whose
+sub-tree has not already been searched.
+@return next lock or NULL if at end of queue */
+static
+const lock_t*
+lock_get_next_lock(
+/*===============*/
+	const lock_deadlock_ctx_t*
+				ctx,	/*!< in: deadlock context */
+	const lock_t*		lock,	/*!< in: lock in the queue */
+	ulint			heap_no)/*!< in: heap no if rec lock else
+					ULINT_UNDEFINED */
+{
+	ut_ad(lock_mutex_own());
 
-		if (lock_get_type(lock) == LOCK_REC) {
-			lock_rec_print(lock_latest_err_file, lock);
+	do {
+		if (lock_get_type_low(lock) == LOCK_REC) {
+			ut_ad(heap_no != ULINT_UNDEFINED);
+			lock = lock_rec_get_next_const(heap_no, lock);
 		} else {
-			lock_table_print(lock_latest_err_file, lock);
+			ut_ad(heap_no == ULINT_UNDEFINED);
+			ut_ad(lock_get_type_low(lock) == LOCK_TABLE);
+			lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
 		}
-		break;
 
-	case LOCK_VICTIM_IS_START:
-		fputs("*** WE ROLL BACK TRANSACTION (2)\n",
-		      lock_latest_err_file);
-		break;
+		if (lock == NULL) {
+			return(NULL);
+		}
 
-	default:
-		/* No deadlock detected*/
-		return(FALSE);
+	} while (lock->trx->lock.deadlock_mark > ctx->mark_start);
+
+	ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));
+
+	return(lock);
+}
+
+/********************************************************************//**
+Get the first lock to search. The search starts from the current
+wait_lock. What we are really interested in is an edge from the
+current wait_lock's owning transaction to another transaction that has
+a lock ahead in the queue. We skip locks where the owning transaction's
+sub-tree has already been searched.
+@return first lock or NULL */
+static
+const lock_t*
+lock_get_first_lock(
+/*================*/
+	const lock_deadlock_ctx_t*
+				ctx,	/*!< in: deadlock context */
+	ulint*			heap_no)/*!< out: heap no if rec lock,
+					else ULINT_UNDEFINED */
+{
+	const lock_t*		lock;
+
+	ut_ad(lock_mutex_own());
+
+	lock = ctx->wait_lock;
+
+	if (lock_get_type_low(lock) == LOCK_REC) {
+
+		*heap_no = lock_rec_find_set_bit(lock);
+		ut_ad(*heap_no != ULINT_UNDEFINED);
+
+		lock = lock_rec_get_first_on_page_addr(
+			lock->un_member.rec_lock.space,
+			lock->un_member.rec_lock.page_no);
+	} else {
+		*heap_no = ULINT_UNDEFINED;
+		ut_ad(lock_get_type_low(lock) == LOCK_TABLE);
+		lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
 	}
 
-	lock_deadlock_found = TRUE;
+	ut_ad(lock != NULL);
 
-	return(TRUE);
+	/* Skip sub-trees that have already been searched. */
+
+	if (lock->trx->lock.deadlock_mark > ctx->mark_start) {
+		return(lock_get_next_lock(ctx, lock, *heap_no));
+	}
+
+	ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));
+
+	return(lock);
 }
 
 /********************************************************************//**
-Looks recursively for a deadlock.
-@return 0 if no deadlock found, LOCK_VICTIM_IS_START if there was a
-deadlock and we chose 'start' as the victim, LOCK_VICTIM_IS_OTHER if a
-deadlock was found and we chose some other trx as a victim: we must do
-the search again in this last case because there may be another
-deadlock!
-LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
+Notify that a deadlock has been detected and print the conflicting
+transaction info. */
 static
-ulint
-lock_deadlock_recursive(
-/*====================*/
-	trx_t*	start,		/*!< in: recursion starting point */
-	trx_t*	trx,		/*!< in: a transaction waiting for a lock */
-	lock_t*	wait_lock,	/*!< in: lock that is waiting to be granted */
-	ulint*	cost,		/*!< in/out: number of calculation steps thus
-				far: if this exceeds LOCK_MAX_N_STEPS_...
-				we return LOCK_EXCEED_MAX_DEPTH */
-	ulint	depth)		/*!< in: recursion depth: if this exceeds
-				LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
-				return LOCK_EXCEED_MAX_DEPTH */
-{
-	ulint	ret;
-	lock_t*	lock;
-	trx_t*	lock_trx;
-	ulint	heap_no		= ULINT_UNDEFINED;
+void
+lock_deadlock_notify(
+/*=================*/
+	const lock_deadlock_ctx_t*	ctx,	/*!< in: deadlock context */
+	const lock_t*			lock)	/*!< in: lock causing
+						deadlock */
+{
+	ut_ad(lock_mutex_own());
+
+	lock_deadlock_start_print();
+
+	lock_deadlock_fputs("\n*** (1) TRANSACTION:\n");
 
-	ut_a(trx);
-	ut_a(start);
-	ut_a(wait_lock);
-	ut_ad(mutex_own(&kernel_mutex));
+	lock_deadlock_trx_print(ctx->wait_lock->trx, 3000);
 
-	if (trx->deadlock_mark == 1) {
-		/* We have already exhaustively searched the subtree starting
-		from this trx */
+	lock_deadlock_fputs("*** (1) WAITING FOR THIS LOCK TO BE GRANTED:\n");
 
-		return(0);
+	lock_deadlock_lock_print(ctx->wait_lock);
+
+	lock_deadlock_fputs("*** (2) TRANSACTION:\n");
+
+	lock_deadlock_trx_print(lock->trx, 3000);
+
+	lock_deadlock_fputs("*** (2) HOLDS THE LOCK(S):\n");
+
+	lock_deadlock_lock_print(lock);
+
+	lock_deadlock_fputs("*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+	lock_deadlock_lock_print(ctx->start->lock.wait_lock);
+
+#ifdef UNIV_DEBUG
+	if (lock_print_waits) {
+		fputs("Deadlock detected\n", stderr);
 	}
+#endif /* UNIV_DEBUG */
+}
 
-	*cost = *cost + 1;
+/********************************************************************//**
+Select the victim transaction that should be rolledback.
+@return victim transaction */
+static
+const trx_t*
+lock_deadlock_select_victim(
+/*========================*/
+	const lock_deadlock_ctx_t*	ctx)	/*!< in: deadlock context */
+{
+	ut_ad(lock_mutex_own());
+	ut_ad(ctx->wait_lock->trx != ctx->start);
 
-	if (lock_get_type_low(wait_lock) == LOCK_REC) {
-		ulint		space;
-		ulint		page_no;
+	if (trx_weight_ge(ctx->wait_lock->trx, ctx->start)) {
+		/* The joining  transaction is 'smaller',
+		choose it as the victim and roll it back. */
 
-		heap_no = lock_rec_find_set_bit(wait_lock);
-		ut_a(heap_no != ULINT_UNDEFINED);
+		return(ctx->start);
+	}
 
-		space = wait_lock->un_member.rec_lock.space;
-		page_no = wait_lock->un_member.rec_lock.page_no;
+	return(ctx->wait_lock->trx);
+}
 
-		lock = lock_rec_get_first_on_page_addr(space, page_no);
+/********************************************************************//**
+Check whether the current waiting lock in the context has to wait for
+the given lock that is ahead in the queue.
+@return lock instance that could cause potential deadlock. */
+static
+const lock_t*
+lock_deadlock_check(
+/*================*/
+	const lock_deadlock_ctx_t*	ctx,	/*!< in: deadlock context */
+	const lock_t*			lock)	/*!< in: lock to check */
+{
+	ut_ad(lock_mutex_own());
 
-		/* Position the iterator on the first matching record lock. */
-		while (lock != NULL
-		       && lock != wait_lock
-		       && !lock_rec_get_nth_bit(lock, heap_no)) {
+	/* If it is the joining transaction wait lock. */
+	if (lock == ctx->start->lock.wait_lock) {
+		; /* Skip */
+	} else if (lock == ctx->wait_lock) {
 
-			lock = lock_rec_get_next_on_page(lock);
-		}
+		/* We can mark this subtree as searched */
+		ut_ad(lock->trx->lock.deadlock_mark <= ctx->mark_start);
+		lock->trx->lock.deadlock_mark = ++lock_mark_counter;
 
-		if (lock == wait_lock) {
-			lock = NULL;
-		}
+		/* We are not prepared for an overflow. This 64-bit
+		counter should never wrap around. At 10^9 increments
+		per second, it would take 10^3 years of uptime. */
 
-		ut_ad(lock == NULL || lock_rec_get_nth_bit(lock, heap_no));
+		ut_ad(lock_mark_counter > 0);
 
-	} else {
-		lock = wait_lock;
+	} else if (lock_has_to_wait(ctx->wait_lock, lock)) {
+
+		return(lock);
 	}
 
-	/* Look at the locks ahead of wait_lock in the lock queue */
+	return(NULL);
+}
 
-	for (;;) {
-		/* Get previous table lock. */
-		if (heap_no == ULINT_UNDEFINED) {
+/********************************************************************//**
+Pop the deadlock search state from the stack.
+@return stack slot instance that was on top of the stack. */
+static
+const lock_stack_t*
+lock_deadlock_pop(
+/*==============*/
+	lock_deadlock_ctx_t*	ctx)		/*!< in/out: context */
+{
+	const lock_stack_t*	stack;
+	const trx_lock_t*	trx_lock;
 
-			lock = UT_LIST_GET_PREV(
-				un_member.tab_lock.locks, lock);
-		}
+	ut_ad(lock_mutex_own());
 
-		if (lock == NULL) {
-			/* We can mark this subtree as searched */
-			trx->deadlock_mark = 1;
+	ut_ad(ctx->depth > 0);
 
-			return(FALSE);
-		}
+	do {
+		/* Restore search state. */
 
-		if (lock_has_to_wait(wait_lock, lock)) {
+		stack = &lock_stack[--ctx->depth];
+		trx_lock = &stack->lock->trx->lock;
+
+		/* Skip sub-trees that have already been searched. */
+	} while (ctx->depth > 0 && trx_lock->deadlock_mark > ctx->mark_start);
+
+	return(ctx->depth == 0) ? NULL : stack;
+}
+
+/********************************************************************//**
+Push the deadlock search state onto the stack.
+@return slot that was used in the stack */
+static
+lock_stack_t*
+lock_deadlock_push(
+/*===============*/
+	lock_deadlock_ctx_t*	ctx,		/*!< in/out: context */
+	const lock_t*		lock,		/*!< in: current lock */
+	ulint			heap_no)	/*!< in: heap number */
+{
+	ut_ad(lock_mutex_own());
 
-			ibool	too_far
-				= depth > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK
-				|| *cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK;
+	/* Save current search state. */
 
-			lock_trx = lock->trx;
+	if (LOCK_STACK_SIZE > ctx->depth) {
+		lock_stack_t*	stack;
 
-			if (lock_trx == start) {
+		stack = &lock_stack[ctx->depth++];
 
-				/* We came back to the recursion starting
-				point: a deadlock detected; or we have
-				searched the waits-for graph too long */
+		stack->lock = lock;
+		stack->heap_no = heap_no;
+		stack->wait_lock = ctx->wait_lock;
 
-				FILE*	ef = lock_latest_err_file;
+		return(stack);
+	}
 
-				rewind(ef);
-				ut_print_timestamp(ef);
+	return(NULL);
+}
 
-				fputs("\n*** (1) TRANSACTION:\n", ef);
+/********************************************************************//**
+Looks iteratively for a deadlock.
+@return 0 if no deadlock else the victim transaction id.*/
+static
+trx_id_t
+lock_deadlock_search(
+/*=================*/
+	lock_deadlock_ctx_t*	ctx)	/*!< in/out: deadlock context */
+{
+	const lock_t*	lock;
+	ulint		heap_no;
 
-				trx_print(ef, wait_lock->trx, 3000);
+	ut_ad(lock_mutex_own());
+	ut_ad(!trx_mutex_own(ctx->start));
 
-				fputs("*** (1) WAITING FOR THIS LOCK"
-				      " TO BE GRANTED:\n", ef);
+	ut_ad(ctx->start != NULL);
+	ut_ad(ctx->wait_lock != NULL);
+	assert_trx_in_list(ctx->wait_lock->trx);
+	ut_ad(ctx->mark_start <= lock_mark_counter);
 
-				if (lock_get_type_low(wait_lock) == LOCK_REC) {
-					lock_rec_print(ef, wait_lock);
-				} else {
-					lock_table_print(ef, wait_lock);
-				}
+	/* Look at the locks ahead of wait_lock in the lock queue. */
+	lock = lock_get_first_lock(ctx, &heap_no);
+	do {
+		/* We should never visit the same sub-tree more than once. */
+		ut_ad(lock->trx->lock.deadlock_mark <= ctx->mark_start);
 
-				fputs("*** (2) TRANSACTION:\n", ef);
+		++ctx->cost;
 
-				trx_print(ef, lock->trx, 3000);
+		if (lock_deadlock_check(ctx, lock) == NULL) {
 
-				fputs("*** (2) HOLDS THE LOCK(S):\n", ef);
+			/* No conflict found, skip this lock. */
 
-				if (lock_get_type_low(lock) == LOCK_REC) {
-					lock_rec_print(ef, lock);
-				} else {
-					lock_table_print(ef, lock);
-				}
+		} else if (lock->trx == ctx->start) {
 
-				fputs("*** (2) WAITING FOR THIS LOCK"
-				      " TO BE GRANTED:\n", ef);
+			/* Found a cycle. */
 
-				if (lock_get_type_low(start->wait_lock)
-				    == LOCK_REC) {
-					lock_rec_print(ef, start->wait_lock);
-				} else {
-					lock_table_print(ef, start->wait_lock);
-				}
-#ifdef UNIV_DEBUG
-				if (lock_print_waits) {
-					fputs("Deadlock detected\n",
-					      stderr);
-				}
-#endif /* UNIV_DEBUG */
+			lock_deadlock_notify(ctx, lock);
 
-				if (trx_weight_ge(wait_lock->trx, start)) {
-					/* Our recursion starting point
-					transaction is 'smaller', let us
-					choose 'start' as the victim and roll
-					back it */
+			return(lock_deadlock_select_victim(ctx)->id);
 
-					return(LOCK_VICTIM_IS_START);
-				}
+		} else if (lock_deadlock_too_deep(ctx)) {
+
+			/* Search too deep to continue. */
 
-				lock_deadlock_found = TRUE;
+			ctx->too_deep = TRUE;
 
-				/* Let us choose the transaction of wait_lock
-				as a victim to try to avoid deadlocking our
-				recursion starting point transaction */
+			/* Select the joining transaction as the victim. */
+			return(ctx->start->id);
 
-				fputs("*** WE ROLL BACK TRANSACTION (1)\n",
-				      ef);
+		} else if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
 
-				wait_lock->trx->was_chosen_as_deadlock_victim
-					= TRUE;
+			/* Another trx ahead has requested a lock in an
+			incompatible mode, and is itself waiting for a lock. */
 
-				lock_cancel_waiting_and_release(wait_lock);
+			/* Save current search state. */
+			if (!lock_deadlock_push(ctx, lock, heap_no)) {
 
-				/* Since trx and wait_lock are no longer
-				in the waits-for graph, we can return FALSE;
-				note that our selective algorithm can choose
-				several transactions as victims, but still
-				we may end up rolling back also the recursion
-				starting point transaction! */
+				/* Unable to save current search state, stack
+				size not big enough. */
 
-				return(LOCK_VICTIM_IS_OTHER);
+				ctx->too_deep = TRUE;
+
+				return(ctx->start->id);
 			}
 
-			if (too_far) {
+			ctx->wait_lock = lock->trx->lock.wait_lock;
+			lock = lock_get_first_lock(ctx, &heap_no);
 
-#ifdef UNIV_DEBUG
-				if (lock_print_waits) {
-					fputs("Deadlock search exceeds"
-					      " max steps or depth.\n",
-					      stderr);
-				}
-#endif /* UNIV_DEBUG */
-				/* The information about transaction/lock
-				to be rolled back is available in the top
-				level. Do not print anything here. */
-				return(LOCK_EXCEED_MAX_DEPTH);
+			if (lock != NULL) {
+				continue;
 			}
+		}
 
-			if (lock_trx->que_state == TRX_QUE_LOCK_WAIT) {
+		if (lock != NULL) {
+			lock = lock_get_next_lock(ctx, lock, heap_no);
+		}
 
-				/* Another trx ahead has requested lock	in an
-				incompatible mode, and is itself waiting for
-				a lock */
+		if (lock == NULL && ctx->depth > 0) {
+			const lock_stack_t*	stack;
 
-				ret = lock_deadlock_recursive(
-					start, lock_trx,
-					lock_trx->wait_lock, cost, depth + 1);
+			/* Restore previous search state. */
 
-				if (ret != 0) {
+			stack = lock_deadlock_pop(ctx);
 
-					return(ret);
-				}
+			if (stack != NULL) {
+				lock = stack->lock;
+				heap_no = stack->heap_no;
+				ctx->wait_lock = stack->wait_lock;
 			}
 		}
-		/* Get the next record lock to check. */
-		if (heap_no != ULINT_UNDEFINED) {
 
-			ut_a(lock != NULL);
+	} while (lock != NULL || ctx->depth > 0);
 
-			do {
-				lock = lock_rec_get_next_on_page(lock);
-			} while (lock != NULL
-				&& lock != wait_lock
-				&& !lock_rec_get_nth_bit(lock, heap_no));
+	/* No deadlock found. */
+	return(0);
+}
 
-			if (lock == wait_lock) {
-				lock = NULL;
-			}
+/********************************************************************//**
+Print info about transaction that was rolled back. */
+static
+void
+lock_deadlock_joining_trx_print(
+/*============================*/
+	const trx_t*	trx,		/*!< in: transaction rolled back */
+	const lock_t*	lock)		/*!< in: lock trx wants */
+{
+	ut_ad(lock_mutex_own());
+
+	/* If the lock search exceeds the max step
+	or the max depth, the current trx will be
+	the victim. Print its information. */
+	lock_deadlock_start_print();
+
+	lock_deadlock_fputs(
+		"TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
+		" WAITS-FOR GRAPH, WE WILL ROLL BACK"
+		" FOLLOWING TRANSACTION \n\n"
+		"*** TRANSACTION:\n");
+
+	lock_deadlock_trx_print(trx, 3000);
+
+	lock_deadlock_fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+	lock_deadlock_lock_print(lock);
+}
+
+/********************************************************************//**
+Rollback transaction selected as the victim. */
+static
+void
+lock_deadlock_trx_rollback(
+/*=======================*/
+	lock_deadlock_ctx_t*	ctx)		/*!< in: deadlock context */
+{
+	trx_t*			trx;
+
+	ut_ad(lock_mutex_own());
+
+	trx = ctx->wait_lock->trx;
+
+	lock_deadlock_fputs("*** WE ROLL BACK TRANSACTION (1)\n");
+
+	trx_mutex_enter(trx);
+
+	trx->lock.was_chosen_as_deadlock_victim = TRUE;
+
+	lock_cancel_waiting_and_release(trx->lock.wait_lock);
+
+	trx_mutex_exit(trx);
+}
+
+/********************************************************************//**
+Checks if a joining lock request results in a deadlock. If a deadlock is
+found this function will resolve the dadlock by choosing a victim transaction
+and rolling it back. It will attempt to resolve all deadlocks. The returned
+transaction id will be the joining transaction id or 0 if some other
+transaction was chosen as a victim and rolled back or no deadlock found.
+
+@return id of transaction chosen as victim or 0 */
+static
+trx_id_t
+lock_deadlock_check_and_resolve(
+/*============================*/
+	const lock_t*	lock,	/*!< in: lock the transaction is requesting */
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	trx_id_t	victim_trx_id;
+
+	ut_ad(trx != NULL);
+	ut_ad(lock != NULL);
+	ut_ad(lock_mutex_own());
+	assert_trx_in_list(trx);
+
+	/* Try and resolve as many deadlocks as possible. */
+	do {
+		lock_deadlock_ctx_t	ctx;
+
+		/* Reset the context. */
+		ctx.cost = 0;
+		ctx.depth = 0;
+		ctx.start = trx;
+		ctx.too_deep = FALSE;
+		ctx.wait_lock = lock;
+		ctx.mark_start = lock_mark_counter;
+
+		victim_trx_id = lock_deadlock_search(&ctx);
+
+		/* Search too deep, we rollback the joining transaction. */
+		if (ctx.too_deep) {
+
+			ut_a(trx == ctx.start);
+			ut_a(victim_trx_id == trx->id);
+
+			lock_deadlock_joining_trx_print(trx, lock);
+
+			MONITOR_INC(MONITOR_DEADLOCK);
+
+		} else if (victim_trx_id != 0 && victim_trx_id != trx->id) {
+
+			ut_ad(victim_trx_id == ctx.wait_lock->trx->id);
+			lock_deadlock_trx_rollback(&ctx);
+
+			lock_deadlock_found = TRUE;
+
+			MONITOR_INC(MONITOR_DEADLOCK);
 		}
-	}/* end of the 'for (;;)'-loop */
+
+	} while (victim_trx_id != 0 && victim_trx_id != trx->id);
+
+	/* If the joining transaction was selected as the victim. */
+	if (victim_trx_id != 0) {
+		ut_a(victim_trx_id == trx->id);
+
+		lock_deadlock_fputs("*** WE ROLL BACK TRANSACTION (2)\n");
+
+		lock_deadlock_found = TRUE;
+	}
+
+	return(victim_trx_id);
 }
 
 /*========================= TABLE LOCKS ==============================*/
@@ -3580,7 +4002,8 @@ UNIV_INLINE
 lock_t*
 lock_table_create(
 /*==============*/
-	dict_table_t*	table,	/*!< in: database table in dictionary cache */
+	dict_table_t*	table,	/*!< in/out: database table
+				in dictionary cache */
 	ulint		type_mode,/*!< in: lock mode possibly ORed with
 				LOCK_WAIT */
 	trx_t*		trx)	/*!< in: trx */
@@ -3588,7 +4011,12 @@ lock_table_create(
 	lock_t*	lock;
 
 	ut_ad(table && trx);
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	ut_ad(trx_mutex_own(trx));
+
+	/* Non-locking autocommit read-only transactions should not set
+	any locks. */
+	assert_trx_in_list(trx);
 
 	if ((type_mode & LOCK_MODE_MASK) == LOCK_AUTO_INC) {
 		++table->n_waiting_or_granted_auto_inc_locks;
@@ -3603,18 +4031,20 @@ lock_table_create(
 
 		table->autoinc_trx = trx;
 
-		ib_vector_push(trx->autoinc_locks, lock);
+		ib_vector_push(trx->autoinc_locks, &lock);
 	} else {
-		lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t));
+		lock = static_cast<lock_t*>(
+			mem_heap_alloc(trx->lock.lock_heap, sizeof(*lock)));
 	}
 
-	UT_LIST_ADD_LAST(trx_locks, trx->trx_locks, lock);
-
 	lock->type_mode = type_mode | LOCK_TABLE;
 	lock->trx = trx;
 
 	lock->un_member.tab_lock.table = table;
 
+	ut_ad(table->n_ref_count > 0 || !table->can_be_evicted);
+
+	UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock);
 	UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock);
 
 	if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
@@ -3622,6 +4052,11 @@ lock_table_create(
 		lock_set_lock_and_trx_wait(lock, trx);
 	}
 
+	ib_vector_push(lock->trx->lock.table_locks, &lock);
+
+	MONITOR_INC(MONITOR_TABLELOCK_CREATED);
+	MONITOR_INC(MONITOR_NUM_TABLELOCK);
+
 	return(lock);
 }
 
@@ -3635,7 +4070,7 @@ lock_table_pop_autoinc_locks(
 /*=========================*/
 	trx_t*	trx)	/*!< in/out: transaction that owns the AUTOINC locks */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
 
 	/* Skip any gaps, gaps are NULL lock entries in the
@@ -3648,7 +4083,7 @@ lock_table_pop_autoinc_locks(
 			return;
 		}
 
-	} while (ib_vector_get_last(trx->autoinc_locks) == NULL);
+	} while (*(lock_t**) ib_vector_get_last(trx->autoinc_locks) == NULL);
 }
 
 /*************************************************************//**
@@ -3663,7 +4098,7 @@ lock_table_remove_autoinc_lock(
 	lock_t*	autoinc_lock;
 	lint	i = ib_vector_size(trx->autoinc_locks) - 1;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_mode(lock) == LOCK_AUTO_INC);
 	ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
 	ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
@@ -3673,7 +4108,8 @@ lock_table_remove_autoinc_lock(
 	to be handled by deleting only those AUTOINC locks that were
 	held by the table being dropped. */
 
-	autoinc_lock = ib_vector_get(trx->autoinc_locks, i);
+	autoinc_lock = *static_cast<lock_t**>(
+		ib_vector_get(trx->autoinc_locks, i));
 
 	/* This is the default fast case. */
 
@@ -3686,10 +4122,12 @@ lock_table_remove_autoinc_lock(
 		/* Handle freeing the locks from within the stack. */
 
 		while (--i >= 0) {
-			autoinc_lock = ib_vector_get(trx->autoinc_locks, i);
+			autoinc_lock = *static_cast<lock_t**>(
+				ib_vector_get(trx->autoinc_locks, i));
 
 			if (UNIV_LIKELY(autoinc_lock == lock)) {
-				ib_vector_set(trx->autoinc_locks, i, NULL);
+				void*	null_var = NULL;
+				ib_vector_set(trx->autoinc_locks, i, &null_var);
 				return;
 			}
 		}
@@ -3707,18 +4145,18 @@ UNIV_INLINE
 void
 lock_table_remove_low(
 /*==================*/
-	lock_t*	lock)	/*!< in: table lock */
+	lock_t*	lock)	/*!< in/out: table lock */
 {
 	trx_t*		trx;
 	dict_table_t*	table;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	trx = lock->trx;
 	table = lock->un_member.tab_lock.table;
 
 	/* Remove the table from the transaction's AUTOINC vector, if
-	the lock that is being release is an AUTOINC lock. */
+	the lock that is being released is an AUTOINC lock. */
 	if (lock_get_mode(lock) == LOCK_AUTO_INC) {
 
 		/* The table's AUTOINC lock can get transferred to
@@ -3729,7 +4167,7 @@ lock_table_remove_low(
 
 		/* The locks must be freed in the reverse order from
 		the one in which they were acquired. This is to avoid
-		traversing the AUTOINC lock vector unnecessarily. 
+		traversing the AUTOINC lock vector unnecessarily.
 
 		We only store locks that were granted in the
 		trx->autoinc_locks vector (see lock_table_create()
@@ -3743,11 +4181,14 @@ lock_table_remove_low(
 		}
 
 		ut_a(table->n_waiting_or_granted_auto_inc_locks > 0);
-		--table->n_waiting_or_granted_auto_inc_locks;
+		table->n_waiting_or_granted_auto_inc_locks--;
 	}
 
-	UT_LIST_REMOVE(trx_locks, trx->trx_locks, lock);
+	UT_LIST_REMOVE(trx_locks, trx->lock.trx_locks, lock);
 	UT_LIST_REMOVE(un_member.tab_lock.locks, table->locks, lock);
+
+	MONITOR_INC(MONITOR_TABLELOCK_REMOVED);
+	MONITOR_DEC(MONITOR_NUM_TABLELOCK);
 }
 
 /*********************************************************************//**
@@ -3763,13 +4204,17 @@ lock_table_enqueue_waiting(
 /*=======================*/
 	ulint		mode,	/*!< in: lock mode this transaction is
 				requesting */
-	dict_table_t*	table,	/*!< in: table */
+	dict_table_t*	table,	/*!< in/out: table */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	lock_t*	lock;
-	trx_t*	trx;
+	trx_t*		trx;
+	lock_t*		lock;
+	trx_id_t	victim_trx_id;
+
+	ut_ad(lock_mutex_own());
 
-	ut_ad(mutex_own(&kernel_mutex));
+	trx = thr_get_trx(thr);
+	ut_ad(trx_mutex_own(trx));
 
 	/* Test if there already is some other reason to suspend thread:
 	we do not enqueue a lock request if the query thread should be
@@ -3781,8 +4226,6 @@ lock_table_enqueue_waiting(
 		return(DB_QUE_THR_SUSPENDED);
 	}
 
-	trx = thr_get_trx(thr);
-
 	switch (trx_get_dict_operation(trx)) {
 	case TRX_DICT_OP_NONE:
 		break;
@@ -3804,10 +4247,22 @@ lock_table_enqueue_waiting(
 
 	lock = lock_table_create(table, mode | LOCK_WAIT, trx);
 
-	/* Check if a deadlock occurs: if yes, remove the lock request and
-	return an error code */
+	/* Release the mutex to obey the latching order.
+	This is safe, because lock_deadlock_check_and_resolve()
+	is invoked when a lock wait is enqueued for the currently
+	running transaction. Because trx is a running transaction
+	(it is not currently suspended because of a lock wait),
+	its state can only be changed by this thread, which is
+	currently associated with the transaction. */
+
+	trx_mutex_exit(trx);
 
-	if (lock_deadlock_occurs(lock, trx)) {
+	victim_trx_id = lock_deadlock_check_and_resolve(lock, trx);
+
+	trx_mutex_enter(trx);
+
+	if (victim_trx_id != 0) {
+		ut_ad(victim_trx_id == trx->id);
 
 		/* The order here is important, we don't want to
 		lose the state of the lock before calling remove. */
@@ -3815,21 +4270,22 @@ lock_table_enqueue_waiting(
 		lock_reset_lock_and_trx_wait(lock);
 
 		return(DB_DEADLOCK);
-	}
-
-	if (trx->wait_lock == NULL) {
+	} else if (trx->lock.wait_lock == NULL) {
 		/* Deadlock resolution chose another transaction as a victim,
 		and we accidentally got our lock granted! */
 
 		return(DB_SUCCESS);
 	}
 
-	trx->que_state = TRX_QUE_LOCK_WAIT;
-	trx->was_chosen_as_deadlock_victim = FALSE;
-	trx->wait_started = time(NULL);
+	trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+
+	trx->lock.wait_started = ut_time();
+	trx->lock.was_chosen_as_deadlock_victim = FALSE;
 
 	ut_a(que_thr_stop(thr));
 
+	MONITOR_INC(MONITOR_TABLELOCK_WAIT);
+
 	return(DB_LOCK_WAIT);
 }
 
@@ -3851,20 +4307,18 @@ lock_table_other_has_incompatible(
 {
 	const lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
-
-	lock = UT_LIST_GET_LAST(table->locks);
+	ut_ad(lock_mutex_own());
 
-	while (lock != NULL) {
+	for (lock = UT_LIST_GET_LAST(table->locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock)) {
 
-		if ((lock->trx != trx)
-		    && (!lock_mode_compatible(lock_get_mode(lock), mode))
-		    && (wait || !(lock_get_wait(lock)))) {
+		if (lock->trx != trx
+		    && !lock_mode_compatible(lock_get_mode(lock), mode)
+		    && (wait || !lock_get_wait(lock))) {
 
 			return(lock);
 		}
-
-		lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
 	}
 
 	return(NULL);
@@ -3880,12 +4334,14 @@ lock_table(
 /*=======*/
 	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is set,
 				does nothing */
-	dict_table_t*	table,	/*!< in: database table in dictionary cache */
+	dict_table_t*	table,	/*!< in/out: database table
+				in dictionary cache */
 	enum lock_mode	mode,	/*!< in: lock mode */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	trx_t*	trx;
-	ulint	err;
+	trx_t*		trx;
+	ulint		err;
+	const lock_t*	wait_for;
 
 	ut_ad(table && thr);
 
@@ -3898,39 +4354,44 @@ lock_table(
 
 	trx = thr_get_trx(thr);
 
-	lock_mutex_enter_kernel();
-
-	/* Look for stronger locks the same trx already has on the table */
+	/* Look for equal or stronger locks the same trx already
+	has on the table. No need to acquire the lock mutex here
+	because only this transacton can add/access table locks
+	to/from trx_t::table_locks. */
 
 	if (lock_table_has(trx, table, mode)) {
 
-		lock_mutex_exit_kernel();
-
 		return(DB_SUCCESS);
 	}
 
+	lock_mutex_enter();
+
 	/* We have to check if the new lock is compatible with any locks
 	other transactions have in the table lock queue. */
 
-	if (lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode)) {
+	wait_for = lock_table_other_has_incompatible(
+		trx, LOCK_WAIT, table, mode);
 
-		/* Another trx has a request on the table in an incompatible
-		mode: this trx may have to wait */
+	trx_mutex_enter(trx);
 
+	/* Another trx has a request on the table in an incompatible
+	mode: this trx may have to wait */
+
+	if (wait_for != NULL) {
 		err = lock_table_enqueue_waiting(mode | flags, table, thr);
+	} else {
+		lock_table_create(table, mode | flags, trx);
 
-		lock_mutex_exit_kernel();
+		ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
 
-		return(err);
+		err = DB_SUCCESS;
 	}
 
-	lock_table_create(table, mode | flags, trx);
-
-	ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
+	lock_mutex_exit();
 
-	lock_mutex_exit_kernel();
+	trx_mutex_exit(trx);
 
-	return(DB_SUCCESS);
+	return(err);
 }
 
 /*********************************************************************//**
@@ -3945,21 +4406,19 @@ lock_table_has_to_wait_in_queue(
 	const dict_table_t*	table;
 	const lock_t*		lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_ad(lock_get_wait(wait_lock));
 
 	table = wait_lock->un_member.tab_lock.table;
 
-	lock = UT_LIST_GET_FIRST(table->locks);
-
-	while (lock != wait_lock) {
+	for (lock = UT_LIST_GET_FIRST(table->locks);
+	     lock != wait_lock;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
 
 		if (lock_has_to_wait(wait_lock, lock)) {
 
 			return(TRUE);
 		}
-
-		lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock);
 	}
 
 	return(FALSE);
@@ -3973,13 +4432,13 @@ static
 void
 lock_table_dequeue(
 /*===============*/
-	lock_t*	in_lock)/*!< in: table lock object; transactions waiting
+	lock_t*	in_lock)/*!< in/out: table lock object; transactions waiting
 			behind will get their lock requests granted, if
 			they are now qualified to it */
 {
 	lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_a(lock_get_type_low(in_lock) == LOCK_TABLE);
 
 	lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock);
@@ -3989,16 +4448,17 @@ lock_table_dequeue(
 	/* Check if waiting locks in the queue can now be granted: grant
 	locks if there are no conflicting locks ahead. */
 
-	while (lock != NULL) {
+	for (/* No op */;
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
 
 		if (lock_get_wait(lock)
 		    && !lock_table_has_to_wait_in_queue(lock)) {
 
 			/* Grant the lock */
+			ut_ad(in_lock->trx != lock->trx);
 			lock_grant(lock);
 		}
-
-		lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock);
 	}
 }
 
@@ -4012,47 +4472,60 @@ UNIV_INTERN
 void
 lock_rec_unlock(
 /*============*/
-	trx_t*			trx,	/*!< in: transaction that has
+	trx_t*			trx,	/*!< in/out: transaction that has
 					set a record lock */
 	const buf_block_t*	block,	/*!< in: buffer block containing rec */
 	const rec_t*		rec,	/*!< in: record */
 	enum lock_mode		lock_mode)/*!< in: LOCK_S or LOCK_X */
 {
-	lock_t*	first_lock;
-	lock_t*	lock;
-	ulint	heap_no;
+	lock_t*		first_lock;
+	lock_t*		lock;
+	ulint		heap_no;
+	const char*	stmt;
+	size_t		stmt_len;
 
-	ut_ad(trx && rec);
+	ut_ad(trx);
+	ut_ad(rec);
 	ut_ad(block->frame == page_align(rec));
+	ut_ad(!trx->lock.wait_lock);
+	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
 
 	heap_no = page_rec_get_heap_no(rec);
 
-	mutex_enter(&kernel_mutex);
+	lock_mutex_enter();
+	trx_mutex_enter(trx);
 
 	first_lock = lock_rec_get_first(block, heap_no);
 
 	/* Find the last lock with the same lock_mode and transaction
-	from the record. */
+	on the record. */
 
 	for (lock = first_lock; lock != NULL;
 	     lock = lock_rec_get_next(heap_no, lock)) {
 		if (lock->trx == trx && lock_get_mode(lock) == lock_mode) {
-			ut_a(!lock_get_wait(lock));
-			lock_rec_reset_nth_bit(lock, heap_no);
 			goto released;
 		}
 	}
 
-	mutex_exit(&kernel_mutex);
+	lock_mutex_exit();
+	trx_mutex_exit(trx);
+
+	stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len);
 	ut_print_timestamp(stderr);
 	fprintf(stderr,
-		"  InnoDB: Error: unlock row could not"
+		" InnoDB: Error: unlock row could not"
 		" find a %lu mode lock on the record\n",
 		(ulong) lock_mode);
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: current statement: %.*s\n",
+		(int) stmt_len, stmt);
 
 	return;
 
 released:
+	ut_a(!lock_get_wait(lock));
+	lock_rec_reset_nth_bit(lock, heap_no);
+
 	/* Check if we can now grant waiting lock requests */
 
 	for (lock = first_lock; lock != NULL;
@@ -4061,39 +4534,40 @@ released:
 		    && !lock_rec_has_to_wait_in_queue(lock)) {
 
 			/* Grant the lock */
+			ut_ad(trx != lock->trx);
 			lock_grant(lock);
 		}
 	}
 
-	mutex_exit(&kernel_mutex);
+	lock_mutex_exit();
+	trx_mutex_exit(trx);
 }
 
 /*********************************************************************//**
 Releases transaction locks, and releases possible other transactions waiting
 because of these locks. */
-UNIV_INTERN
+static
 void
-lock_release_off_kernel(
-/*====================*/
-	trx_t*	trx)	/*!< in: transaction */
+lock_release(
+/*=========*/
+	trx_t*	trx)	/*!< in/out: transaction */
 {
-	dict_table_t*	table;
-	ulint		count;
 	lock_t*		lock;
+	ulint		count = 0;
+	trx_id_t	max_trx_id;
 
-	ut_ad(mutex_own(&kernel_mutex));
-
-	lock = UT_LIST_GET_LAST(trx->trx_locks);
-
-	count = 0;
+	ut_ad(lock_mutex_own());
+	ut_ad(!trx_mutex_own(trx));
 
-	while (lock != NULL) {
+	max_trx_id = trx_sys_get_max_trx_id();
 
-		count++;
+	for (lock = UT_LIST_GET_LAST(trx->lock.trx_locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_LAST(trx->lock.trx_locks)) {
 
 		if (lock_get_type_low(lock) == LOCK_REC) {
-
 			lock_rec_dequeue_from_page(lock);
+
 		} else {
 			ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
 
@@ -4104,79 +4578,104 @@ lock_release_off_kernel(
 				block the use of the MySQL query cache for
 				all currently active transactions. */
 
-				table = lock->un_member.tab_lock.table;
-
-				table->query_cache_inv_trx_id
-					= trx_sys->max_trx_id;
+				lock->un_member.tab_lock.table
+					->query_cache_inv_trx_id = max_trx_id;
 			}
 
 			lock_table_dequeue(lock);
 		}
 
-		if (count == LOCK_RELEASE_KERNEL_INTERVAL) {
-			/* Release the kernel mutex for a while, so that we
+		if (count == LOCK_RELEASE_INTERVAL) {
+			/* Release the  mutex for a while, so that we
 			do not monopolize it */
 
-			lock_mutex_exit_kernel();
+			lock_mutex_exit();
 
-			lock_mutex_enter_kernel();
+			lock_mutex_enter();
 
 			count = 0;
 		}
 
-		lock = UT_LIST_GET_LAST(trx->trx_locks);
+		++count;
 	}
 
-	ut_a(ib_vector_size(trx->autoinc_locks) == 0);
+	/* We don't remove the locks one by one from the vector for
+	efficiency reasons. We simply reset it because we would have
+	released all the locks anyway. */
 
-	mem_heap_empty(trx->lock_heap);
+	ib_vector_reset(trx->lock.table_locks);
+
+	ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+	ut_a(ib_vector_is_empty(trx->autoinc_locks));
+	ut_a(ib_vector_is_empty(trx->lock.table_locks));
+
+	mem_heap_empty(trx->lock.lock_heap);
 }
 
+/* True if a lock mode is S or X */
+#define IS_LOCK_S_OR_X(lock) \
+	(lock_get_mode(lock) == LOCK_S \
+	 || lock_get_mode(lock) == LOCK_X)
+
 /*********************************************************************//**
-Cancels a waiting lock request and releases possible other transactions
-waiting behind it. */
-UNIV_INTERN
+Removes table locks of the transaction on a table to be dropped. */
+static
 void
-lock_cancel_waiting_and_release(
-/*============================*/
-	lock_t*	lock)	/*!< in: waiting lock request */
+lock_trx_table_locks_remove(
+/*========================*/
+	const lock_t*	lock_to_remove)		/*!< in: lock to remove */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	lint		i;
+	trx_t*		trx = lock_to_remove->trx;
 
-	if (lock_get_type_low(lock) == LOCK_REC) {
+	ut_ad(lock_mutex_own());
 
-		lock_rec_dequeue_from_page(lock);
+	/* It is safe to read this because we are holding the lock mutex */
+	if (!trx->lock.cancel) {
+		trx_mutex_enter(trx);
 	} else {
-		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+		ut_ad(trx_mutex_own(trx));
+	}
 
-		if (lock->trx->autoinc_locks != NULL) {
-			/* Release the transaction's AUTOINC locks/ */
-			lock_release_autoinc_locks(lock->trx);
-		}
+	for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+		const lock_t*	lock;
 
-		lock_table_dequeue(lock);
-	}
+		lock = *static_cast<lock_t**>(
+			ib_vector_get(trx->lock.table_locks, i));
 
-	/* Reset the wait flag and the back pointer to lock in trx */
+		if (lock == NULL) {
+			continue;
+		}
 
-	lock_reset_lock_and_trx_wait(lock);
+		ut_a(trx == lock->trx);
+		ut_a(lock_get_type_low(lock) & LOCK_TABLE);
+		ut_a(lock->un_member.tab_lock.table != NULL);
 
-	/* The following function releases the trx from lock wait */
+		if (lock == lock_to_remove) {
+			void*	null_var = NULL;
+			ib_vector_set(trx->lock.table_locks, i, &null_var);
 
-	trx_end_lock_wait(lock->trx);
-}
+			if (!trx->lock.cancel) {
+				trx_mutex_exit(trx);
+			}
 
-/* True if a lock mode is S or X */
-#define IS_LOCK_S_OR_X(lock) \
-	(lock_get_mode(lock) == LOCK_S \
-	 || lock_get_mode(lock) == LOCK_X)
+			return;
+		}
+	}
+
+	if (!trx->lock.cancel) {
+		trx_mutex_exit(trx);
+	}
 
+	/* Lock must exist in the vector. */
+	ut_error;
+}
 
 /*********************************************************************//**
 Removes locks of a transaction on a table to be dropped.
 If remove_also_table_sx_locks is TRUE then table-level S and X locks are
 also removed in addition to other table-level and record-level locks.
-No lock, that is going to be removed, is allowed to be a wait lock. */
+No lock that is going to be removed is allowed to be a wait lock. */
 static
 void
 lock_remove_all_on_table_for_trx(
@@ -4186,14 +4685,15 @@ lock_remove_all_on_table_for_trx(
 	ibool		remove_also_table_sx_locks)/*!< in: also removes
 						table S and X locks */
 {
-	lock_t*	lock;
-	lock_t*	prev_lock;
+	lock_t*		lock;
+	lock_t*		prev_lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
-	lock = UT_LIST_GET_LAST(trx->trx_locks);
+	for (lock = UT_LIST_GET_LAST(trx->lock.trx_locks);
+	     lock != NULL;
+	     lock = prev_lock) {
 
-	while (lock != NULL) {
 		prev_lock = UT_LIST_GET_PREV(trx_locks, lock);
 
 		if (lock_get_type_low(lock) == LOCK_REC
@@ -4208,11 +4708,77 @@ lock_remove_all_on_table_for_trx(
 
 			ut_a(!lock_get_wait(lock));
 
+			lock_trx_table_locks_remove(lock);
 			lock_table_remove_low(lock);
 		}
+	}
+}
+
+/*******************************************************************//**
+Remove any explicit record locks held by recovering transactions on
+the table.
+@return number of recovered transactions examined */
+static
+ulint
+lock_remove_recovered_trx_record_locks(
+/*===================================*/
+	dict_table_t*	table)	/*!< in: check if there are any locks
+				held on records in this table or on the
+				table itself */
+{
+	trx_t*		trx;
+	ulint		n_recovered_trx = 0;
+
+	ut_a(table != NULL);
+	ut_ad(lock_mutex_own());
+
+	mutex_enter(&trx_sys->mutex);
+
+	for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		lock_t*	lock;
+		lock_t*	next_lock;
+
+		assert_trx_in_rw_list(trx);
+
+		if (!trx->is_recovered) {
+			continue;
+		}
+
+		/* Because we are holding the lock_sys->mutex,
+		implicit locks cannot be converted to explicit ones
+		while we are scanning the explicit locks. */
+
+		for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+		     lock != NULL;
+		     lock = next_lock) {
+
+			ut_a(lock->trx == trx);
 
-		lock = prev_lock;
+			/* Recovered transactions can't wait on a lock. */
+
+			ut_a(!lock_get_wait(lock));
+
+			/* Recovered transactions don't have any
+			table level locks. */
+
+			ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+			next_lock = UT_LIST_GET_NEXT(trx_locks, lock);
+
+			if (lock->index->table == table) {
+				lock_rec_discard(lock);
+			}
+		}
+
+		++n_recovered_trx;
 	}
+
+	mutex_exit(&trx_sys->mutex);
+
+	return(n_recovered_trx);
 }
 
 /*********************************************************************//**
@@ -4229,17 +4795,17 @@ lock_remove_all_on_table(
 	ibool		remove_also_table_sx_locks)/*!< in: also removes
 						table S and X locks */
 {
-	lock_t*	lock;
-	lock_t*	prev_lock;
+	lock_t*		lock;
 
-	mutex_enter(&kernel_mutex);
+	lock_mutex_enter();
 
-	lock = UT_LIST_GET_FIRST(table->locks);
+	for (lock = UT_LIST_GET_FIRST(table->locks);
+	     lock != NULL;
+	     /* No op */) {
 
-	while (lock != NULL) {
+		lock_t*	prev_lock;
 
-		prev_lock = UT_LIST_GET_PREV(un_member.tab_lock.locks,
-					     lock);
+		prev_lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
 
 		/* If we should remove all locks (remove_also_table_sx_locks
 		is TRUE), or if the lock is not table-level S or X lock,
@@ -4251,8 +4817,8 @@ lock_remove_all_on_table(
 			ut_a(!lock_get_wait(lock));
 		}
 
-		lock_remove_all_on_table_for_trx(table, lock->trx,
-						 remove_also_table_sx_locks);
+		lock_remove_all_on_table_for_trx(
+			table, lock->trx, remove_also_table_sx_locks);
 
 		if (prev_lock == NULL) {
 			if (lock == UT_LIST_GET_FIRST(table->locks)) {
@@ -4277,7 +4843,18 @@ lock_remove_all_on_table(
 		}
 	}
 
-	mutex_exit(&kernel_mutex);
+	/* Note: Recovered transactions don't have table level IX or IS locks
+	but can have implicit record locks that have been converted to explicit
+	record locks. Such record locks cannot be freed by traversing the
+	transaction lock list in dict_table_t (as above). */
+
+	if (!lock_sys->rollback_complete
+	    && lock_remove_recovered_trx_record_locks(table) == 0) {
+
+		lock_sys->rollback_complete = TRUE;
+	}
+
+	lock_mutex_exit();
 }
 
 /*===================== VALIDATION AND DEBUGGING  ====================*/
@@ -4291,13 +4868,13 @@ lock_table_print(
 	FILE*		file,	/*!< in: file where to print */
 	const lock_t*	lock)	/*!< in: table type lock */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_a(lock_get_type_low(lock) == LOCK_TABLE);
 
 	fputs("TABLE LOCK table ", file);
 	ut_print_name(file, lock->trx, TRUE,
 		      lock->un_member.tab_lock.table->name);
-	fprintf(file, " trx id " TRX_ID_FMT, (ullint) lock->trx->id);
+	fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id);
 
 	if (lock_get_mode(lock) == LOCK_S) {
 		fputs(" lock mode S", file);
@@ -4340,7 +4917,7 @@ lock_rec_print(
 	ulint*			offsets		= offsets_;
 	rec_offs_init(offsets_);
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_a(lock_get_type_low(lock) == LOCK_REC);
 
 	space = lock->un_member.rec_lock.space;
@@ -4350,7 +4927,7 @@ lock_rec_print(
 		(ulong) space, (ulong) page_no,
 		(ulong) lock_rec_get_n_bits(lock));
 	dict_index_name_print(file, lock->trx, lock->index);
-	fprintf(file, " trx id " TRX_ID_FMT, (ullint) lock->trx->id);
+	fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id);
 
 	if (lock_get_mode(lock) == LOCK_S) {
 		fputs(" lock mode S", file);
@@ -4429,20 +5006,21 @@ ulint
 lock_get_n_rec_locks(void)
 /*======================*/
 {
-	lock_t*	lock;
 	ulint	n_locks	= 0;
 	ulint	i;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
+		const lock_t*	lock;
 
-		lock = HASH_GET_FIRST(lock_sys->rec_hash, i);
+		for (lock = static_cast<const lock_t*>(
+				HASH_GET_FIRST(lock_sys->rec_hash, i));
+		     lock != 0;
+		     lock = static_cast<const lock_t*>(
+				HASH_GET_NEXT(hash, lock))) {
 
-		while (lock) {
 			n_locks++;
-
-			lock = HASH_GET_NEXT(hash, lock);
 		}
 	}
 
@@ -4452,22 +5030,22 @@ lock_get_n_rec_locks(void)
 
 /*********************************************************************//**
 Prints info of locks for all transactions.
-@return FALSE if not able to obtain kernel mutex
+@return FALSE if not able to obtain lock mutex
 and exits without printing info */
 UNIV_INTERN
 ibool
 lock_print_info_summary(
 /*====================*/
 	FILE*	file,	/*!< in: file where to print */
-	ibool   nowait)	/*!< in: whether to wait for the kernel mutex */
+	ibool   nowait)	/*!< in: whether to wait for the lock mutex */
 {
-	/* if nowait is FALSE, wait on the kernel mutex,
+	/* if nowait is FALSE, wait on the lock mutex,
 	otherwise return immediately if fail to obtain the
 	mutex. */
 	if (!nowait) {
-		lock_mutex_enter_kernel();
-	} else if (mutex_enter_nowait(&kernel_mutex)) {
-		fputs("FAIL TO OBTAIN KERNEL MUTEX, "
+		lock_mutex_enter();
+	} else if (lock_mutex_enter_nowait()) {
+		fputs("FAIL TO OBTAIN LOCK MUTEX, "
 		      "SKIP LOCK INFO PRINTING\n", file);
 		return(FALSE);
 	}
@@ -4485,13 +5063,38 @@ lock_print_info_summary(
 	      "------------\n", file);
 
 	fprintf(file, "Trx id counter " TRX_ID_FMT "\n",
-		(ullint) trx_sys->max_trx_id);
+		trx_sys_get_max_trx_id());
 
 	fprintf(file,
 		"Purge done for trx's n:o < " TRX_ID_FMT
-		" undo n:o < " TRX_ID_FMT "\n",
-		(ullint) purge_sys->purge_trx_no,
-		(ullint) purge_sys->purge_undo_no);
+		" undo n:o < " TRX_ID_FMT " state: ",
+		purge_sys->iter.trx_no,
+		purge_sys->iter.undo_no);
+
+	/* Note: We are reading the state without the latch. One because it
+	will violate the latching order and two because we are merely querying
+	the state of the variable for display. */
+
+	switch (purge_sys->state){
+	case PURGE_STATE_EXIT:
+	case PURGE_STATE_INIT:
+		/* Should never be in this state while the system is running. */
+		ut_error;
+
+	case PURGE_STATE_RUN:
+		fprintf(file, "running");
+		/* Check if it is waiting for more data to arrive. */
+		if (!purge_sys->running) {
+			fprintf(file, " but idle");
+		}
+		break;
+
+	case PURGE_STATE_STOP:
+		fprintf(file, "stopped");
+		break;
+	}
+
+	fprintf(file, "\n");
 
 	fprintf(file,
 		"History list length %lu\n",
@@ -4506,83 +5109,113 @@ lock_print_info_summary(
 }
 
 /*********************************************************************//**
-Prints info of locks for each transaction. */
+Prints info of locks for each transaction. This function assumes that the
+caller holds the lock mutex and more importantly it will release the lock
+mutex on behalf of the caller. (This should be fixed in the future). */
 UNIV_INTERN
 void
 lock_print_info_all_transactions(
 /*=============================*/
 	FILE*	file)	/*!< in: file where to print */
 {
-	lock_t*	lock;
-	ibool	load_page_first = TRUE;
-	ulint	nth_trx		= 0;
-	ulint	nth_lock	= 0;
-	ulint	i;
-	mtr_t	mtr;
-	trx_t*	trx;
+	const lock_t*	lock;
+	ibool		load_page_first = TRUE;
+	ulint		nth_trx		= 0;
+	ulint		nth_lock	= 0;
+	ulint		i;
+	mtr_t		mtr;
+	const trx_t*	trx;
+	trx_list_t*	trx_list = &trx_sys->rw_trx_list;
 
 	fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
 
+	ut_ad(lock_mutex_own());
+
+	mutex_enter(&trx_sys->mutex);
+
 	/* First print info on non-active transactions */
 
-	trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+	/* NOTE: information of auto-commit non-locking read-only
+	transactions will be omitted here. The information will be
+	available from INFORMATION_SCHEMA.INNODB_TRX. */
 
-	while (trx) {
-		if (trx->conc_state == TRX_NOT_STARTED) {
+	for (trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(mysql_trx_list, trx)) {
+
+		ut_ad(trx->in_mysql_trx_list);
+
+		/* See state transitions and locking rules in trx0trx.h */
+
+		if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) {
 			fputs("---", file);
-			trx_print(file, trx, 600);
+			trx_print_latched(file, trx, 600);
 		}
-
-		trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
 	}
 
 loop:
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	/* Since we temporarily release lock_sys->mutex and
+	trx_sys->mutex when reading a database page in below,
+	variable trx may be obsolete now and we must loop
+	through the trx list to get probably the same trx,
+	or some other trx. */
 
-	i = 0;
-
-	/* Since we temporarily release the kernel mutex when
-	reading a database page in below, variable trx may be
-	obsolete now and we must loop through the trx list to
-	get probably the same trx, or some other trx. */
+	for (trx = UT_LIST_GET_FIRST(*trx_list), i = 0;
+	     trx && (i < nth_trx);
+	     trx = UT_LIST_GET_NEXT(trx_list, trx), i++) {
 
-	while (trx && (i < nth_trx)) {
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
-		i++;
+		assert_trx_in_list(trx);
+		ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
 	}
 
+	ut_ad(trx == NULL
+	      || trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
 	if (trx == NULL) {
-		lock_mutex_exit_kernel();
+		/* Check the read-only transaction list next. */
+		if (trx_list == &trx_sys->rw_trx_list) {
+			trx_list = &trx_sys->ro_trx_list;
+			nth_trx = 0;
+			nth_lock = 0;
+			goto loop;
+		}
+
+		lock_mutex_exit();
+		mutex_exit(&trx_sys->mutex);
 
 		ut_ad(lock_validate());
 
 		return;
 	}
 
+	assert_trx_in_list(trx);
+
 	if (nth_lock == 0) {
 		fputs("---", file);
-		trx_print(file, trx, 600);
+
+		trx_print_latched(file, trx, 600);
 
 		if (trx->read_view) {
 			fprintf(file,
 				"Trx read view will not see trx with"
 				" id >= " TRX_ID_FMT
 				", sees < " TRX_ID_FMT "\n",
-				(ullint) trx->read_view->low_limit_id,
-				(ullint) trx->read_view->up_limit_id);
+				trx->read_view->low_limit_id,
+				trx->read_view->up_limit_id);
 		}
 
-		if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+		if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
 			fprintf(file,
 				"------- TRX HAS BEEN WAITING %lu SEC"
 				" FOR THIS LOCK TO BE GRANTED:\n",
-				(ulong) difftime(time(NULL),
-						 trx->wait_started));
+				(ulong) difftime(ut_time(),
+						 trx->lock.wait_started));
 
-			if (lock_get_type_low(trx->wait_lock) == LOCK_REC) {
-				lock_rec_print(file, trx->wait_lock);
+			if (lock_get_type_low(trx->lock.wait_lock) == LOCK_REC) {
+				lock_rec_print(file, trx->lock.wait_lock);
 			} else {
-				lock_table_print(file, trx->wait_lock);
+				lock_table_print(file, trx->lock.wait_lock);
 			}
 
 			fputs("------------------\n", file);
@@ -4599,7 +5232,7 @@ loop:
 	/* Look at the note about the trx loop above why we loop here:
 	lock may be an obsolete pointer now. */
 
-	lock = UT_LIST_GET_FIRST(trx->trx_locks);
+	lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
 
 	while (lock && (i < nth_lock)) {
 		lock = UT_LIST_GET_NEXT(trx_locks, lock);
@@ -4633,18 +5266,21 @@ loop:
 				goto print_rec;
 			}
 
-			lock_mutex_exit_kernel();
+			lock_mutex_exit();
+			mutex_exit(&trx_sys->mutex);
 
 			mtr_start(&mtr);
 
-			buf_page_get_with_no_latch(space, zip_size,
-						   page_no, &mtr);
+			buf_page_get_with_no_latch(
+				space, zip_size, page_no, &mtr);
 
 			mtr_commit(&mtr);
 
 			load_page_first = FALSE;
 
-			lock_mutex_enter_kernel();
+			lock_mutex_enter();
+
+			mutex_enter(&trx_sys->mutex);
 
 			goto loop;
 		}
@@ -4668,8 +5304,6 @@ print_rec:
 
 		nth_trx++;
 		nth_lock = 0;
-
-		goto loop;
 	}
 
 	goto loop;
@@ -4677,6 +5311,45 @@ print_rec:
 
 #ifdef UNIV_DEBUG
 /*********************************************************************//**
+Find the the lock in the trx_t::trx_lock_t::table_locks vector.
+@return TRUE if found */
+static
+ibool
+lock_trx_table_locks_find(
+/*======================*/
+	trx_t*		trx,		/*!< in: trx to validate */
+	const lock_t*	find_lock)	/*!< in: lock to find */
+{
+	lint		i;
+	ibool		found = FALSE;
+
+	trx_mutex_enter(trx);
+
+	for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+		const lock_t*	lock;
+
+		lock = *static_cast<const lock_t**>(
+			ib_vector_get(trx->lock.table_locks, i));
+
+		if (lock == NULL) {
+			continue;
+		} else if (lock == find_lock) {
+			/* Can't be duplicates. */
+			ut_a(!found);
+			found = TRUE;
+		}
+
+		ut_a(trx == lock->trx);
+		ut_a(lock_get_type_low(lock) & LOCK_TABLE);
+		ut_a(lock->un_member.tab_lock.table != NULL);
+	}
+
+	trx_mutex_exit(trx);
+
+	return(found);
+}
+
+/*********************************************************************//**
 Validates the lock queue on a table.
 @return	TRUE if ok */
 static
@@ -4687,14 +5360,18 @@ lock_table_queue_validate(
 {
 	const lock_t*	lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
 
-	lock = UT_LIST_GET_FIRST(table->locks);
+	for (lock = UT_LIST_GET_FIRST(table->locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
 
-	while (lock) {
-		ut_a(((lock->trx)->conc_state == TRX_ACTIVE)
-		     || ((lock->trx)->conc_state == TRX_PREPARED)
-		     || ((lock->trx)->conc_state == TRX_COMMITTED_IN_MEMORY));
+		/* lock->trx->state cannot change from or to NOT_STARTED
+		while we are holding the trx_sys->mutex. It may change
+		from ACTIVE to PREPARED, but it may not change to
+		COMMITTED, because we are holding the lock_sys->mutex. */
+		ut_ad(trx_assert_started(lock->trx));
 
 		if (!lock_get_wait(lock)) {
 
@@ -4706,7 +5383,7 @@ lock_table_queue_validate(
 			ut_a(lock_table_has_to_wait_in_queue(lock));
 		}
 
-		lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock);
+		ut_a(lock_trx_table_locks_find(lock->trx, lock));
 	}
 
 	return(TRUE);
@@ -4719,37 +5396,37 @@ static
 ibool
 lock_rec_queue_validate(
 /*====================*/
+	ibool			locked_lock_trx_sys,
+					/*!< in: if the caller holds
+					both the lock mutex and
+					trx_sys_t->lock. */
 	const buf_block_t*	block,	/*!< in: buffer block containing rec */
 	const rec_t*		rec,	/*!< in: record to look at */
 	const dict_index_t*	index,	/*!< in: index, or NULL if not known */
 	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
 {
-	trx_t*	impl_trx;
-	lock_t*	lock;
-	ulint	heap_no;
+	const trx_t*	impl_trx;
+	const lock_t*	lock;
+	ulint		heap_no;
 
 	ut_a(rec);
 	ut_a(block->frame == page_align(rec));
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+	ut_ad(lock_mutex_own() == locked_lock_trx_sys);
 
 	heap_no = page_rec_get_heap_no(rec);
 
-	lock_mutex_enter_kernel();
+	if (!locked_lock_trx_sys) {
+		lock_mutex_enter();
+		mutex_enter(&trx_sys->mutex);
+	}
 
 	if (!page_rec_is_user_rec(rec)) {
 
-		lock = lock_rec_get_first(block, heap_no);
-
-		while (lock) {
-			switch(lock->trx->conc_state) {
-			case TRX_ACTIVE:
-			case TRX_PREPARED:
-			case TRX_COMMITTED_IN_MEMORY:
-				break;
-			default:
-				ut_error;
-			}
+		for (lock = lock_rec_get_first(block, heap_no);
+		     lock != NULL;
+		     lock = lock_rec_get_next_const(heap_no, lock)) {
 
 			ut_a(trx_in_trx_list(lock->trx));
 
@@ -4760,78 +5437,38 @@ lock_rec_queue_validate(
 			if (index) {
 				ut_a(lock->index == index);
 			}
-
-			lock = lock_rec_get_next(heap_no, lock);
 		}
 
-		lock_mutex_exit_kernel();
-
-		return(TRUE);
+		goto func_exit;
 	}
 
 	if (!index);
 	else if (dict_index_is_clust(index)) {
+		trx_id_t	trx_id;
 
-		impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets);
+		/* Unlike the non-debug code, this invariant can only succeed
+		if the check and assertion are covered by the lock mutex. */
 
-		if (impl_trx
-		    && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT,
-						   block, heap_no, impl_trx)) {
+		trx_id = lock_clust_rec_some_has_impl(rec, index, offsets);
+		impl_trx = trx_rw_is_active_low(trx_id, NULL);
 
-			ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
-					       block, heap_no, impl_trx));
-		}
-#if 0
-	} else {
+		ut_ad(lock_mutex_own());
+		/* impl_trx cannot be committed until lock_mutex_exit()
+		because lock_trx_release_locks() acquires lock_sys->mutex */
 
-		/* The kernel mutex may get released temporarily in the
-		next function call: we have to release lock table mutex
-		to obey the latching order */
-
-		/* If this thread is holding the file space latch
-		(fil_space_t::latch), the following check WILL break
-		latching order and may cause a deadlock of threads. */
-
-		/* NOTE: This is a bogus check that would fail in the
-		following case: Our transaction is updating a
-		row. After it has updated the clustered index record,
-		it goes to a secondary index record and finds someone
-		else holding an explicit S- or X-lock on that
-		secondary index record, presumably from a locking
-		read. Our transaction cannot update the secondary
-		index immediately, but places a waiting X-lock request
-		on the secondary index record. There is nothing
-		illegal in this. The assertion is simply too strong. */
-
-		/* From the locking point of view, each secondary
-		index is a separate table. A lock that is held on
-		secondary index rec does not give any rights to modify
-		or read the clustered index rec. Therefore, we can
-		think of the sec index as a separate 'table' from the
-		clust index 'table'. Conversely, a transaction that
-		has acquired a lock on and modified a clustered index
-		record may need to wait for a lock on the
-		corresponding record in a secondary index. */
-
-		impl_trx = lock_sec_rec_some_has_impl_off_kernel(
-			rec, index, offsets);
-
-		if (impl_trx
+		if (impl_trx != NULL
 		    && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT,
 						   block, heap_no, impl_trx)) {
 
 			ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
 					       block, heap_no, impl_trx));
 		}
-#endif
 	}
 
-	lock = lock_rec_get_first(block, heap_no);
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next_const(heap_no, lock)) {
 
-	while (lock) {
-		ut_a(lock->trx->conc_state == TRX_ACTIVE
-		     || lock->trx->conc_state == TRX_PREPARED
-		     || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY);
 		ut_a(trx_in_trx_list(lock->trx));
 
 		if (index) {
@@ -4854,11 +5491,13 @@ lock_rec_queue_validate(
 
 			ut_a(lock_rec_has_to_wait_in_queue(lock));
 		}
-
-		lock = lock_rec_get_next(heap_no, lock);
 	}
 
-	lock_mutex_exit_kernel();
+func_exit:
+	if (!locked_lock_trx_sys) {
+		lock_mutex_exit();
+		mutex_exit(&trx_sys->mutex);
+	}
 
 	return(TRUE);
 }
@@ -4882,10 +5521,10 @@ lock_rec_validate_page(
 	ulint*		offsets		= offsets_;
 	rec_offs_init(offsets_);
 
-	ut_ad(!mutex_own(&kernel_mutex));
-	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(!lock_mutex_own());
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
+	mutex_enter(&trx_sys->mutex);
 loop:
 	lock = lock_rec_get_first_on_page_addr(buf_block_get_space(block),
 					       buf_block_get_page_no(block));
@@ -4908,9 +5547,6 @@ loop:
 	}
 
 	ut_a(trx_in_trx_list(lock->trx));
-	ut_a(lock->trx->conc_state == TRX_ACTIVE
-	     || lock->trx->conc_state == TRX_PREPARED
-	     || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY);
 
 # ifdef UNIV_SYNC_DEBUG
 	/* Only validate the record queues when this thread is not
@@ -4932,17 +5568,13 @@ loop:
 				"Validating %u %u\n",
 				block->page.space, block->page.offset);
 #endif
-			lock_mutex_exit_kernel();
-
 			/* If this thread is holding the file space
 			latch (fil_space_t::latch), the following
 			check WILL break the latching order and may
 			cause a deadlock of threads. */
 
-			lock_rec_queue_validate(block, rec, lock->index,
-						offsets);
-
-			lock_mutex_enter_kernel();
+			lock_rec_queue_validate(
+				TRUE, block, rec, lock->index, offsets);
 
 			nth_bit = i + 1;
 
@@ -4956,7 +5588,8 @@ loop:
 	goto loop;
 
 function_exit:
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
+	mutex_exit(&trx_sys->mutex);
 
 	if (UNIV_LIKELY_NULL(heap)) {
 		mem_heap_free(heap);
@@ -4965,6 +5598,47 @@ function_exit:
 }
 
 /*********************************************************************//**
+Validates the table locks.
+@return	TRUE if ok */
+static
+ibool
+lock_validate_table_locks(
+/*======================*/
+	const trx_list_t*	trx_list)	/*!< in: trx list */
+{
+	const trx_t*	trx;
+
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_ad(trx_list == &trx_sys->rw_trx_list
+	      || trx_list == &trx_sys->ro_trx_list);
+
+	for (trx = UT_LIST_GET_FIRST(*trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		const lock_t*	lock;
+
+		assert_trx_in_list(trx);
+		ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
+		for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+		     lock != NULL;
+		     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+
+			if (lock_get_type_low(lock) & LOCK_TABLE) {
+
+				lock_table_queue_validate(
+					lock->un_member.tab_lock.table);
+			}
+		}
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
 Validate record locks up to a limit.
 @return lock at limit or NULL if no more locks in the hash bucket */
 static __attribute__((nonnull, warn_unused_result))
@@ -4976,12 +5650,13 @@ lock_rec_validate(
 	ib_uint64_t*	limit)		/*!< in/out: upper limit of
 					(space, page_no) */
 {
-	lock_t*		lock;
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
 
-	for (lock = HASH_GET_FIRST(lock_sys->rec_hash, start);
+	for (const lock_t* lock = static_cast<const lock_t*>(
+			HASH_GET_FIRST(lock_sys->rec_hash, start));
 	     lock != NULL;
-	     lock = HASH_GET_NEXT(hash, lock)) {
+	     lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))) {
 
 		ib_uint64_t	current;
 
@@ -4998,7 +5673,7 @@ lock_rec_validate(
 		}
 	}
 
-	return(NULL);
+	return(0);
 }
 
 /*********************************************************************//**
@@ -5016,81 +5691,64 @@ lock_rec_block_validate(
 	!block->page.file_page_was_freed. */
 
 	mtr_t		mtr;
-	buf_block_t*	block;
 
-	/* Make sure that the tablespace is not deleted while we are
-	trying to access the page. */
-	if (!fil_inc_pending_ops(space)) {
-		mtr_start(&mtr);
-		block = buf_page_get_gen(
-			space, fil_space_get_zip_size(space),
-			page_no, RW_X_LATCH, NULL,
-			BUF_GET_POSSIBLY_FREED,
-			__FILE__, __LINE__, &mtr);
+	mtr_start(&mtr);
 
-		buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+	buf_block_t*	block = buf_page_get_gen(
+		space, fil_space_get_zip_size(space),
+		page_no, RW_X_LATCH, NULL,
+		BUF_GET_POSSIBLY_FREED,
+		__FILE__, __LINE__, &mtr);
 
-		ut_ad(lock_rec_validate_page(block));
-		mtr_commit(&mtr);
+	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
 
-		fil_decr_pending_ops(space);
-	}
+	ut_ad(lock_rec_validate_page(block));
+	mtr_commit(&mtr);
 }
 
 /*********************************************************************//**
 Validates the lock system.
 @return	TRUE if ok */
 static
-ibool
-lock_validate(void)
-/*===============*/
+bool
+lock_validate()
+/*===========*/
 {
-	const lock_t*	lock;
-	const trx_t*	trx;
-	ulint		i;
-
-	lock_mutex_enter_kernel();
-
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
-	while (trx) {
-		lock = UT_LIST_GET_FIRST(trx->trx_locks);
+	lock_mutex_enter();
 
-		while (lock) {
-			if (lock_get_type_low(lock) & LOCK_TABLE) {
-
-				lock_table_queue_validate(
-					lock->un_member.tab_lock.table);
-			}
-
-			lock = UT_LIST_GET_NEXT(trx_locks, lock);
-		}
+	mutex_enter(&trx_sys->mutex);
 
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
-	}
+	ut_a(lock_validate_table_locks(&trx_sys->rw_trx_list));
+	ut_a(lock_validate_table_locks(&trx_sys->ro_trx_list));
 
 	/* Iterate over all the record locks and validate the locks. We
 	don't want to hog the lock_sys_t::mutex and the trx_sys_t::mutex.
 	Release both mutexes during the validation check. */
 
-	for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
+	for (ulint i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
 		const lock_t*	lock;
 		ib_uint64_t	limit = 0;
 
-		while ((lock = lock_rec_validate(i, &limit)) != NULL) {
+		while ((lock = lock_rec_validate(i, &limit)) != 0) {
 
 			ulint	space = lock->un_member.rec_lock.space;
 			ulint	page_no = lock->un_member.rec_lock.page_no;
 
-			lock_mutex_exit_kernel();
+			lock_mutex_exit();
+			mutex_exit(&trx_sys->mutex);
+
 			lock_rec_block_validate(space, page_no);
-			lock_mutex_enter_kernel();
+
+			lock_mutex_enter();
+			mutex_enter(&trx_sys->mutex);
 		}
 	}
 
-	lock_mutex_exit_kernel();
+	mutex_exit(&trx_sys->mutex);
 
-	return(TRUE);
+	lock_mutex_exit();
+
+	return(true);
 }
 #endif /* UNIV_DEBUG */
 /*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/
@@ -5135,7 +5793,10 @@ lock_rec_insert_check_and_lock(
 	next_rec = page_rec_get_next_const(rec);
 	next_rec_heap_no = page_rec_get_heap_no(next_rec);
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
+	/* Because this code is invoked for a running transaction by
+	the thread that is serving the transaction, it is not necessary
+	to hold trx->mutex here. */
 
 	/* When inserting a record into an index, the table must be at
 	least IX-locked or we must be building an index, in which case
@@ -5149,7 +5810,7 @@ lock_rec_insert_check_and_lock(
 	if (UNIV_LIKELY(lock == NULL)) {
 		/* We optimize CPU time usage in the simplest case */
 
-		lock_mutex_exit_kernel();
+		lock_mutex_exit();
 
 		if (!dict_index_is_clust(index)) {
 			/* Update the page max trx id field */
@@ -5176,19 +5837,23 @@ lock_rec_insert_check_and_lock(
 	on the successor, which produced an unnecessary deadlock. */
 
 	if (lock_rec_other_has_conflicting(
-		    LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION,
+		    static_cast<enum lock_mode>(
+			    LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION),
 		    block, next_rec_heap_no, trx)) {
 
 		/* Note that we may get DB_SUCCESS also here! */
-		err = lock_rec_enqueue_waiting(LOCK_X | LOCK_GAP
-					       | LOCK_INSERT_INTENTION,
-					       block, next_rec_heap_no,
-					       index, thr);
+		trx_mutex_enter(trx);
+
+		err = lock_rec_enqueue_waiting(
+			LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION,
+			block, next_rec_heap_no, index, thr);
+
+		trx_mutex_exit(trx);
 	} else {
 		err = DB_SUCCESS;
 	}
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 
 	switch (err) {
 	case DB_SUCCESS_LOCKED_REC:
@@ -5213,8 +5878,10 @@ lock_rec_insert_check_and_lock(
 
 		offsets = rec_get_offsets(next_rec, index, offsets_,
 					  ULINT_UNDEFINED, &heap);
-		ut_ad(lock_rec_queue_validate(block,
-					      next_rec, index, offsets));
+
+		ut_ad(lock_rec_queue_validate(
+				FALSE, block, next_rec, index, offsets));
+
 		if (UNIV_LIKELY_NULL(heap)) {
 			mem_heap_free(heap);
 		}
@@ -5226,8 +5893,7 @@ lock_rec_insert_check_and_lock(
 
 /*********************************************************************//**
 If a transaction has an implicit x-lock on a record, but no explicit x-lock
-set on the record, sets one for it. NOTE that in the case of a secondary
-index, the kernel mutex may get temporarily released. */
+set on the record, sets one for it. */
 static
 void
 lock_rec_convert_impl_to_expl(
@@ -5237,33 +5903,49 @@ lock_rec_convert_impl_to_expl(
 	dict_index_t*		index,	/*!< in: index of record */
 	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
 {
-	trx_t*	impl_trx;
+	trx_id_t		trx_id;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(!lock_mutex_own());
 	ut_ad(page_rec_is_user_rec(rec));
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
 
 	if (dict_index_is_clust(index)) {
-		impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets);
+		trx_id = lock_clust_rec_some_has_impl(rec, index, offsets);
+		/* The clustered index record was last modified by
+		this transaction. The transaction may have been
+		committed a long time ago. */
 	} else {
-		impl_trx = lock_sec_rec_some_has_impl_off_kernel(
-			rec, index, offsets);
+		trx_id = lock_sec_rec_some_has_impl(rec, index, offsets);
+		/* The transaction can be committed before the
+		trx_is_active(trx_id, NULL) check below, because we are not
+		holding lock_mutex. */
 	}
 
-	if (impl_trx) {
+	if (trx_id != 0) {
+		trx_t*	impl_trx;
 		ulint	heap_no = page_rec_get_heap_no(rec);
 
-		/* If the transaction has no explicit x-lock set on the
-		record, set one for it */
+		lock_mutex_enter();
+
+		/* If the transaction is still active and has no
+		explicit x-lock set on the record, set one for it */
+
+		impl_trx = trx_rw_is_active(trx_id, NULL);
+
+		/* impl_trx cannot be committed until lock_mutex_exit()
+		because lock_trx_release_locks() acquires lock_sys->mutex */
 
-		if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block,
+		if (impl_trx != NULL
+		    && !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block,
 				       heap_no, impl_trx)) {
 
 			lock_rec_add_to_queue(
 				LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP,
-				block, heap_no, index, impl_trx);
+				block, heap_no, index, impl_trx, FALSE);
 		}
+
+		lock_mutex_exit();
 	}
 }
 
@@ -5304,21 +5986,23 @@ lock_clust_rec_modify_check_and_lock(
 		? rec_get_heap_no_new(rec)
 		: rec_get_heap_no_old(rec);
 
-	lock_mutex_enter_kernel();
-
-	ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
-
 	/* If a transaction has no explicit x-lock set on the record, set one
 	for it */
 
 	lock_rec_convert_impl_to_expl(block, rec, index, offsets);
 
+	lock_mutex_enter();
+
+	ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+
 	err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
 			    block, heap_no, index, thr);
 
-	lock_mutex_exit_kernel();
+	MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
+
+	lock_mutex_exit();
 
-	ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+	ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
 
 	if (UNIV_UNLIKELY(err == DB_SUCCESS_LOCKED_REC)) {
 		err = DB_SUCCESS;
@@ -5365,14 +6049,16 @@ lock_sec_rec_modify_check_and_lock(
 	index record, and this would not have been possible if another active
 	transaction had modified this secondary index record. */
 
-	lock_mutex_enter_kernel();
+	lock_mutex_enter();
 
 	ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
 
 	err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
 			    block, heap_no, index, thr);
 
-	lock_mutex_exit_kernel();
+	MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
+
+	lock_mutex_exit();
 
 #ifdef UNIV_DEBUG
 	{
@@ -5383,7 +6069,10 @@ lock_sec_rec_modify_check_and_lock(
 
 		offsets = rec_get_offsets(rec, index, offsets_,
 					  ULINT_UNDEFINED, &heap);
-		ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+
+		ut_ad(lock_rec_queue_validate(
+			FALSE, block, rec, index, offsets));
+
 		if (UNIV_LIKELY_NULL(heap)) {
 			mem_heap_free(heap);
 		}
@@ -5447,30 +6136,32 @@ lock_sec_rec_read_check_and_lock(
 
 	heap_no = page_rec_get_heap_no(rec);
 
-	lock_mutex_enter_kernel();
-
-	ut_ad(mode != LOCK_X
-	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
-	ut_ad(mode != LOCK_S
-	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
-
 	/* Some transaction may have an implicit x-lock on the record only
 	if the max trx id for the page >= min trx id for the trx list or a
 	database recovery is running. */
 
-	if ((page_get_max_trx_id(block->frame) >= trx_list_get_min_trx_id()
+	if ((page_get_max_trx_id(block->frame) >= trx_rw_min_trx_id()
 	     || recv_recovery_is_on())
 	    && !page_rec_is_supremum(rec)) {
 
 		lock_rec_convert_impl_to_expl(block, rec, index, offsets);
 	}
 
+	lock_mutex_enter();
+
+	ut_ad(mode != LOCK_X
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+	ut_ad(mode != LOCK_S
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+
 	err = lock_rec_lock(FALSE, mode | gap_mode,
 			    block, heap_no, index, thr);
 
-	lock_mutex_exit_kernel();
+	MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
+
+	lock_mutex_exit();
 
-	ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+	ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
 
 	return(err);
 }
@@ -5523,24 +6214,25 @@ lock_clust_rec_read_check_and_lock(
 
 	heap_no = page_rec_get_heap_no(rec);
 
-	lock_mutex_enter_kernel();
+	if (UNIV_LIKELY(heap_no != PAGE_HEAP_NO_SUPREMUM)) {
+
+		lock_rec_convert_impl_to_expl(block, rec, index, offsets);
+	}
+
+	lock_mutex_enter();
 
 	ut_ad(mode != LOCK_X
 	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
 	ut_ad(mode != LOCK_S
 	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
 
-	if (UNIV_LIKELY(heap_no != PAGE_HEAP_NO_SUPREMUM)) {
-
-		lock_rec_convert_impl_to_expl(block, rec, index, offsets);
-	}
+	err = lock_rec_lock(FALSE, mode | gap_mode, block, heap_no, index, thr);
 
-	err = lock_rec_lock(FALSE, mode | gap_mode,
-			    block, heap_no, index, thr);
+	MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
 
-	lock_mutex_exit_kernel();
+	lock_mutex_exit();
 
-	ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+	ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
 
 	return(err);
 }
@@ -5607,12 +6299,12 @@ lock_release_autoinc_last_lock(
 	ulint		last;
 	lock_t*		lock;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 	ut_a(!ib_vector_is_empty(autoinc_locks));
 
 	/* The lock to be release must be the last lock acquired. */
 	last = ib_vector_size(autoinc_locks) - 1;
-	lock = ib_vector_get(autoinc_locks, last);
+	lock = *static_cast<lock_t**>(ib_vector_get(autoinc_locks, last));
 
 	/* Should have only AUTOINC locks in the vector. */
 	ut_a(lock_get_mode(lock) == LOCK_AUTO_INC);
@@ -5622,12 +6314,15 @@ lock_release_autoinc_last_lock(
 
 	/* This will remove the lock from the trx autoinc_locks too. */
 	lock_table_dequeue(lock);
+
+	/* Remove from the table vector too. */
+	lock_trx_table_locks_remove(lock);
 }
 
 /*******************************************************************//**
-Check if a transaction holds any autoinc locks. 
+Check if a transaction holds any autoinc locks.
 @return TRUE if the transaction holds any AUTOINC locks. */
-UNIV_INTERN
+static
 ibool
 lock_trx_holds_autoinc_locks(
 /*=========================*/
@@ -5640,13 +6335,16 @@ lock_trx_holds_autoinc_locks(
 
 /*******************************************************************//**
 Release all the transaction's autoinc locks. */
-UNIV_INTERN
+static
 void
 lock_release_autoinc_locks(
 /*=======================*/
 	trx_t*		trx)		/*!< in/out: transaction */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
+	/* If this is invoked for a running transaction by the thread
+	that is serving the transaction, then it is not necessary to
+	hold trx->mutex here. */
 
 	ut_a(trx->autoinc_locks != NULL);
 
@@ -5865,3 +6563,279 @@ lock_rec_get_page_no(
 
 	return(lock->un_member.rec_lock.page_no);
 }
+
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+	lock_t*	lock)	/*!< in/out: waiting lock request */
+{
+	que_thr_t*	thr;
+
+	ut_ad(lock_mutex_own());
+	ut_ad(trx_mutex_own(lock->trx));
+
+	lock->trx->lock.cancel = TRUE;
+
+	if (lock_get_type_low(lock) == LOCK_REC) {
+
+		lock_rec_dequeue_from_page(lock);
+	} else {
+		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+		if (lock->trx->autoinc_locks != NULL) {
+			/* Release the transaction's AUTOINC locks. */
+			lock_release_autoinc_locks(lock->trx);
+		}
+
+		lock_table_dequeue(lock);
+	}
+
+	/* Reset the wait flag and the back pointer to lock in trx. */
+
+	lock_reset_lock_and_trx_wait(lock);
+
+	/* The following function releases the trx from lock wait. */
+
+	thr = que_thr_end_lock_wait(lock->trx);
+
+	if (thr != NULL) {
+		lock_wait_release_thread_if_suspended(thr);
+	}
+
+	lock->trx->lock.cancel = FALSE;
+}
+
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+UNIV_INTERN
+void
+lock_unlock_table_autoinc(
+/*======================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	ut_ad(!lock_mutex_own());
+	ut_ad(!trx_mutex_own(trx));
+	ut_ad(!trx->lock.wait_lock);
+	/* This can be invoked on NOT_STARTED, ACTIVE, PREPARED,
+	but not COMMITTED transactions. */
+	ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)
+	      || !trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+
+	/* This function is invoked for a running transaction by the
+	thread that is serving the transaction. Therefore it is not
+	necessary to hold trx->mutex here. */
+
+	if (lock_trx_holds_autoinc_locks(trx)) {
+		lock_mutex_enter();
+
+		lock_release_autoinc_locks(trx);
+
+		lock_mutex_exit();
+	}
+}
+
+/*********************************************************************//**
+Releases a transaction's locks, and releases possible other transactions
+waiting because of these locks. Change the state of the transaction to
+TRX_STATE_COMMITTED_IN_MEMORY. */
+UNIV_INTERN
+void
+lock_trx_release_locks(
+/*===================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	assert_trx_in_list(trx);
+
+	if (UNIV_UNLIKELY(trx_state_eq(trx, TRX_STATE_PREPARED))) {
+		mutex_enter(&trx_sys->mutex);
+		ut_a(trx_sys->n_prepared_trx > 0);
+		trx_sys->n_prepared_trx--;
+		mutex_exit(&trx_sys->mutex);
+	} else {
+		ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+	}
+
+	/* The transition of trx->state to TRX_STATE_COMMITTED_IN_MEMORY
+	is protected by both the lock_sys->mutex and the trx->mutex. */
+	lock_mutex_enter();
+	trx_mutex_enter(trx);
+
+	/* The following assignment makes the transaction committed in memory
+	and makes its changes to data visible to other transactions.
+	NOTE that there is a small discrepancy from the strict formal
+	visibility rules here: a human user of the database can see
+	modifications made by another transaction T even before the necessary
+	log segment has been flushed to the disk. If the database happens to
+	crash before the flush, the user has seen modifications from T which
+	will never be a committed transaction. However, any transaction T2
+	which sees the modifications of the committing transaction T, and
+	which also itself makes modifications to the database, will get an lsn
+	larger than the committing transaction T. In the case where the log
+	flush fails, and T never gets committed, also T2 will never get
+	committed. */
+
+	/*--------------------------------------*/
+	trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
+	/*--------------------------------------*/
+
+	/* If the background thread trx_rollback_or_clean_recovered()
+	is still active then there is a chance that the rollback
+	thread may see this trx as COMMITTED_IN_MEMORY and goes ahead
+	to clean it up calling trx_cleanup_at_db_startup(). This can
+	happen in the case we are committing a trx here that is left
+	in PREPARED state during the crash. Note that commit of the
+	rollback of a PREPARED trx happens in the recovery thread
+	while the rollback of other transactions happen in the
+	background thread. To avoid this race we unconditionally unset
+	the is_recovered flag. */
+
+	trx->is_recovered = FALSE;
+
+	trx_mutex_exit(trx);
+
+	lock_release(trx);
+
+	lock_mutex_exit();
+}
+
+/*********************************************************************//**
+Check whether the transaction has already been rolled back because it
+was selected as a deadlock victim, or if it has to wait then cancel
+the wait lock.
+@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
+UNIV_INTERN
+enum db_err
+lock_trx_handle_wait(
+/*=================*/
+	trx_t*	trx)	/*!< in/out: trx lock state */
+{
+	enum db_err	err;
+
+	lock_mutex_enter();
+
+	trx_mutex_enter(trx);
+
+	if (trx->lock.was_chosen_as_deadlock_victim) {
+		err = DB_DEADLOCK;
+	} else if (trx->lock.wait_lock != NULL) {
+		lock_cancel_waiting_and_release(trx->lock.wait_lock);
+		err = DB_LOCK_WAIT;
+	} else {
+		/* The lock was probably granted before we got here. */
+		err = DB_SUCCESS;
+	}
+
+	lock_mutex_exit();
+	trx_mutex_exit(trx);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Get the number of locks on a table.
+@return number of locks */
+UNIV_INTERN
+ulint
+lock_table_get_n_locks(
+/*===================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ulint		n_table_locks;
+
+	lock_mutex_enter();
+
+	n_table_locks = UT_LIST_GET_LEN(table->locks);
+
+	lock_mutex_exit();
+
+	return(n_table_locks);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Do an exhaustive check for any locks (table or rec) against the table.
+@return	lock if found */
+static
+const lock_t*
+lock_table_locks_lookup(
+/*====================*/
+	const dict_table_t*	table,		/*!< in: check if there are
+						any locks held on records in
+						this table or on the table
+						itself */
+	const trx_list_t*	trx_list)	/*!< in: trx list to check */
+{
+	trx_t*			trx;
+
+	ut_a(table != NULL);
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_ad(trx_list == &trx_sys->rw_trx_list
+	      || trx_list == &trx_sys->ro_trx_list);
+
+	for (trx = UT_LIST_GET_FIRST(*trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		const lock_t*	lock;
+
+		assert_trx_in_list(trx);
+		ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
+		for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+		     lock != NULL;
+		     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+
+			ut_a(lock->trx == trx);
+
+			if (lock_get_type_low(lock) == LOCK_REC) {
+				if (lock->index->table == table) {
+					return(lock);
+				}
+			} else if (lock->un_member.tab_lock.table == table) {
+				return(lock);
+			}
+		}
+	}
+
+	return(NULL);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Check if there are any locks (table or rec) against table.
+@return	TRUE if table has either table or record locks. */
+UNIV_INTERN
+ibool
+lock_table_has_locks(
+/*=================*/
+	const dict_table_t*	table)	/*!< in: check if there are any locks
+					held on records in this table or on the
+					table itself */
+{
+	ibool			has_locks;
+
+	lock_mutex_enter();
+
+#ifdef UNIV_DEBUG
+	mutex_enter(&trx_sys->mutex);
+
+	ut_ad(lock_table_locks_lookup(table, &trx_sys->rw_trx_list) == NULL);
+	ut_ad(lock_table_locks_lookup(table, &trx_sys->ro_trx_list) == NULL);
+
+	mutex_exit(&trx_sys->mutex);
+#endif /* UNIV_DEBUG */
+
+	has_locks = UT_LIST_GET_LEN(table->locks) > 0 || table->n_rec_locks > 0;
+
+	lock_mutex_exit();
+
+	return(has_locks);
+}
diff --git a/storage/innobase/lock/lock0wait.cc b/storage/innobase/lock/lock0wait.cc
new file mode 100644
index 00000000000..99059f19813
--- /dev/null
+++ b/storage/innobase/lock/lock0wait.cc
@@ -0,0 +1,517 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0wait.cc
+The transaction lock system
+
+Created 25/5/2010 Sunny Bains
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "srv0mon.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "ha_prototypes.h"
+#include "lock0priv.h"
+
+UNIV_INTERN ibool	srv_lock_timeout_active 	= FALSE;
+UNIV_INTERN ulint	srv_n_lock_wait_count		= 0;
+UNIV_INTERN ulint	srv_n_lock_wait_current_count	= 0;
+UNIV_INTERN ib_int64_t	srv_n_lock_wait_time		= 0;
+UNIV_INTERN ulint	srv_n_lock_max_wait_time	= 0;
+
+UNIV_INTERN os_event_t	srv_timeout_event;
+
+/*********************************************************************//**
+Print the contents of the lock_sys_t::waiting_threads array. */
+static
+void
+lock_wait_table_print(void)
+/*=======================*/
+{
+	ulint			i;
+	const srv_slot_t*	slot;
+
+	ut_ad(lock_wait_mutex_own());
+
+	slot = lock_sys->waiting_threads;
+
+	for (i = 0; i < OS_THREAD_MAX_N; i++, ++slot) {
+
+		fprintf(stderr,
+			"Slot %lu: thread type %lu,"
+			" in use %lu, susp %lu, timeout %lu, time %lu\n",
+			(ulong) i,
+			(ulong) slot->type,
+			(ulong) slot->in_use,
+			(ulong) slot->suspended,
+			slot->wait_timeout,
+			(ulong) difftime(ut_time(), slot->suspend_time));
+	}
+}
+
+/*********************************************************************//**
+Release a slot in the lock_sys_t::waiting_threads. Adjust the array last pointer
+if there are empty slots towards the end of the table. */
+static
+void
+lock_wait_table_release_slot(
+/*=========================*/
+	srv_slot_t*	slot)		/*!< in: slot to release */
+{
+#ifdef UNIV_DEBUG
+	srv_slot_t*	upper = lock_sys->waiting_threads + OS_THREAD_MAX_N;
+#endif /* UNIV_DEBUG */
+
+	lock_wait_mutex_enter();
+
+	ut_ad(slot->in_use);
+	ut_ad(slot->thr != NULL);
+	ut_ad(slot->thr->slot != NULL);
+	ut_ad(slot->thr->slot == slot);
+
+	/* Must be within the array boundaries. */
+	ut_ad(slot >= lock_sys->waiting_threads);
+	ut_ad(slot < upper);
+
+	/* Note: When we reserve the slot we use the trx_t::mutex to update
+	the slot values to change the state to reserved. Here we are using the
+	lock mutex to change the state of the slot to free. This is by design,
+	because when we query the slot state we always hold both the lock and
+	trx_t::mutex. To reduce contention on the lock mutex when reserving the
+	slot we avoid acquiring the lock mutex. */
+
+	lock_mutex_enter();
+
+	slot->thr->slot = NULL;
+	slot->thr = NULL;
+	slot->in_use = FALSE;
+
+	lock_mutex_exit();
+
+	/* Scan backwards and adjust the last free slot pointer. */
+	for (slot = lock_sys->last_slot;
+	     slot > lock_sys->waiting_threads && !slot->in_use;
+	     --slot) {
+		/* No op */
+	}
+
+	/* Either the array is empty or the last scanned slot is in use. */
+	ut_ad(slot->in_use || slot == lock_sys->waiting_threads);
+
+	lock_sys->last_slot = slot + 1;
+
+	/* The last slot is either outside of the array boundary or it's
+	on an empty slot. */
+	ut_ad(lock_sys->last_slot == upper || !lock_sys->last_slot->in_use);
+
+	ut_ad(lock_sys->last_slot >= lock_sys->waiting_threads);
+	ut_ad(lock_sys->last_slot <= upper);
+
+	lock_wait_mutex_exit();
+}
+
+/*********************************************************************//**
+Reserves a slot in the thread table for the current user OS thread.
+@return	reserved slot */
+static
+srv_slot_t*
+lock_wait_table_reserve_slot(
+/*=========================*/
+	que_thr_t*	thr,		/*!< in: query thread associated
+					with the user OS thread */
+	ulong		wait_timeout)	/*!< in: lock wait timeout value */
+{
+	ulint		i;
+	srv_slot_t*	slot;
+
+	ut_ad(lock_wait_mutex_own());
+	ut_ad(trx_mutex_own(thr_get_trx(thr)));
+
+	slot = lock_sys->waiting_threads;
+
+	for (i = OS_THREAD_MAX_N; i--; ++slot) {
+		if (!slot->in_use) {
+			slot->in_use = TRUE;
+			slot->thr = thr;
+			slot->thr->slot = slot;
+
+			if (slot->event == NULL) {
+				slot->event = os_event_create(NULL);
+				ut_a(slot->event);
+			}
+
+			os_event_reset(slot->event);
+			slot->suspended = TRUE;
+			slot->suspend_time = ut_time();
+			slot->wait_timeout = wait_timeout;
+
+			if (slot == lock_sys->last_slot) {
+				++lock_sys->last_slot;
+			}
+
+			ut_ad(lock_sys->last_slot
+			      <= lock_sys->waiting_threads + OS_THREAD_MAX_N);
+
+			return(slot);
+		}
+	}
+
+	ut_print_timestamp(stderr);
+
+	fprintf(stderr,
+		"  InnoDB: There appear to be %lu user"
+		" threads currently waiting\n"
+		"InnoDB: inside InnoDB, which is the"
+		" upper limit. Cannot continue operation.\n"
+		"InnoDB: As a last thing, we print"
+		" a list of waiting threads.\n", (ulong) OS_THREAD_MAX_N);
+
+	lock_wait_table_print();
+
+	ut_error;
+	return(NULL);
+}
+
+/***************************************************************//**
+Puts a user OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+UNIV_INTERN
+void
+lock_wait_suspend_thread(
+/*=====================*/
+	que_thr_t*	thr)	/*!< in: query thread associated with the
+				user OS thread */
+{
+	srv_slot_t*	slot;
+	double		wait_time;
+	trx_t*		trx;
+	ulint		had_dict_lock;
+	ibool		was_declared_inside_innodb;
+	ib_int64_t	start_time			= 0;
+	ib_int64_t	finish_time;
+	ulint		sec;
+	ulint		ms;
+	ulong		lock_wait_timeout;
+
+	trx = thr_get_trx(thr);
+
+	if (trx->mysql_thd != 0) {
+		DEBUG_SYNC_C("lock_wait_suspend_thread_enter");
+	}
+
+	/* InnoDB system transactions (such as the purge, and
+	incomplete transactions that are being rolled back after crash
+	recovery) will use the global value of
+	innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
+	lock_wait_timeout = trx_lock_wait_timeout_get(trx);
+
+	lock_wait_mutex_enter();
+
+	trx_mutex_enter(trx);
+
+	trx->error_state = DB_SUCCESS;
+
+	if (thr->state == QUE_THR_RUNNING) {
+
+		ut_ad(thr->is_active);
+
+		/* The lock has already been released or this transaction
+		was chosen as a deadlock victim: no need to suspend */
+
+		if (trx->lock.was_chosen_as_deadlock_victim) {
+
+			trx->error_state = DB_DEADLOCK;
+			trx->lock.was_chosen_as_deadlock_victim = FALSE;
+		}
+
+		lock_wait_mutex_exit();
+		trx_mutex_exit(trx);
+		return;
+	}
+
+	ut_ad(!thr->is_active);
+
+	slot = lock_wait_table_reserve_slot(thr, lock_wait_timeout);
+
+	if (thr->lock_state == QUE_THR_LOCK_ROW) {
+		srv_n_lock_wait_count++;
+		srv_n_lock_wait_current_count++;
+
+		if (ut_usectime(&sec, &ms) == -1) {
+			start_time = -1;
+		} else {
+			start_time = (ib_int64_t) sec * 1000000 + ms;
+		}
+	}
+
+	/* Wake the lock timeout monitor thread, if it is suspended */
+
+	os_event_set(srv_timeout_event);
+
+	lock_wait_mutex_exit();
+	trx_mutex_exit(trx);
+
+	had_dict_lock = trx->dict_operation_lock_mode;
+
+	switch (had_dict_lock) {
+	case 0:
+		break;
+	case RW_S_LATCH:
+		/* Release foreign key check latch */
+		row_mysql_unfreeze_data_dictionary(trx);
+		break;
+	default:
+		/* There should never be a lock wait when the
+		dictionary latch is reserved in X mode.  Dictionary
+		transactions should only acquire locks on dictionary
+		tables, not other tables. All access to dictionary
+		tables should be covered by dictionary
+		transactions. */
+		ut_error;
+	}
+
+	ut_a(trx->dict_operation_lock_mode == 0);
+
+	/* Suspend this thread and wait for the event. */
+
+	was_declared_inside_innodb = trx->declared_to_be_inside_innodb;
+
+	if (was_declared_inside_innodb) {
+		/* We must declare this OS thread to exit InnoDB, since a
+		possible other thread holding a lock which this thread waits
+		for must be allowed to enter, sooner or later */
+
+		srv_conc_force_exit_innodb(trx);
+	}
+
+	os_event_wait(slot->event);
+
+	/* After resuming, reacquire the data dictionary latch if
+	necessary. */
+
+	if (was_declared_inside_innodb) {
+
+		/* Return back inside InnoDB */
+
+		srv_conc_force_enter_innodb(trx);
+	}
+
+	if (had_dict_lock) {
+
+		row_mysql_freeze_data_dictionary(trx);
+	}
+
+	wait_time = ut_difftime(ut_time(), slot->suspend_time);
+
+	/* Release the slot for others to use */
+
+	lock_wait_table_release_slot(slot);
+
+	if (thr->lock_state == QUE_THR_LOCK_ROW) {
+		ulint	diff_time;
+
+		if (ut_usectime(&sec, &ms) == -1) {
+			finish_time = -1;
+		} else {
+			finish_time = (ib_int64_t) sec * 1000000 + ms;
+		}
+
+		diff_time = (ulint) (finish_time - start_time);
+
+		srv_n_lock_wait_current_count--;
+		srv_n_lock_wait_time = srv_n_lock_wait_time + diff_time;
+
+		if (diff_time > srv_n_lock_max_wait_time &&
+		    /* only update the variable if we successfully
+		    retrieved the start and finish times. See Bug#36819. */
+		    start_time != -1 && finish_time != -1) {
+			srv_n_lock_max_wait_time = diff_time;
+		}
+	}
+
+	if (lock_wait_timeout < 100000000
+	    && wait_time > (double) lock_wait_timeout) {
+
+		trx->error_state = DB_LOCK_WAIT_TIMEOUT;
+
+		MONITOR_INC(MONITOR_TIMEOUT);
+	}
+
+	if (trx_is_interrupted(trx)) {
+
+		trx->error_state = DB_INTERRUPTED;
+	}
+}
+
+/********************************************************************//**
+Releases a user OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+UNIV_INTERN
+void
+lock_wait_release_thread_if_suspended(
+/*==================================*/
+	que_thr_t*	thr)	/*!< in: query thread associated with the
+				user OS thread	 */
+{
+	ut_ad(lock_mutex_own());
+	ut_ad(trx_mutex_own(thr_get_trx(thr)));
+
+	/* We own both the lock mutex and the trx_t::mutex but not the
+	lock wait mutex. This is OK because other threads will see the state
+	of this slot as being in use and no other thread can change the state
+	of the slot to free unless that thread also owns the lock mutex. */
+
+	if (thr->slot != NULL && thr->slot->in_use && thr->slot->thr == thr) {
+		trx_t*	trx = thr_get_trx(thr);
+
+		if (trx->lock.was_chosen_as_deadlock_victim) {
+
+			trx->error_state = DB_DEADLOCK;
+			trx->lock.was_chosen_as_deadlock_victim = FALSE;
+		}
+
+		os_event_set(thr->slot->event);
+	}
+}
+
+/*********************************************************************//**
+Check if the thread lock wait has timed out. Release its locks if the
+wait has actually timed out. */
+static
+void
+lock_wait_check_and_cancel(
+/*=======================*/
+	const srv_slot_t*	slot)	/*!< in: slot reserved by a user
+					thread when the wait started */
+{
+	trx_t*		trx;
+	double		wait_time;
+	ib_time_t	suspend_time = slot->suspend_time;
+
+	ut_ad(lock_wait_mutex_own());
+
+	ut_ad(slot->in_use);
+
+	ut_ad(slot->suspended);
+
+	wait_time = ut_difftime(ut_time(), suspend_time);
+
+	trx = thr_get_trx(slot->thr);
+
+	if (trx_is_interrupted(trx)
+	    || (slot->wait_timeout < 100000000
+		&& (wait_time > (double) slot->wait_timeout
+		   || wait_time < 0))) {
+
+		/* Timeout exceeded or a wrap-around in system
+		time counter: cancel the lock request queued
+		by the transaction and release possible
+		other transactions waiting behind; it is
+		possible that the lock has already been
+		granted: in that case do nothing */
+
+		lock_mutex_enter();
+
+		trx_mutex_enter(trx);
+
+		if (trx->lock.wait_lock) {
+
+			ut_a(trx->lock.que_state == TRX_QUE_LOCK_WAIT);
+
+			lock_cancel_waiting_and_release(trx->lock.wait_lock);
+		}
+
+		lock_mutex_exit();
+
+		trx_mutex_exit(trx);
+	}
+
+}
+
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(lock_wait_timeout_thread)(
+/*=====================================*/
+	void*	arg __attribute__((unused)))
+			/* in: a dummy parameter required by
+			os_thread_create */
+{
+	ib_int64_t	sig_count = 0;
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_lock_timeout_thread_key);
+#endif
+	srv_lock_timeout_active = TRUE;
+
+	do {
+		srv_slot_t*	slot;
+
+		/* When someone is waiting for a lock, we wake up every second
+		and check if a timeout has passed for a lock wait */
+
+		os_event_wait_time_low(srv_timeout_event, 1000000, sig_count);
+
+		if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+			break;
+		}
+
+		lock_wait_mutex_enter();
+
+		/* Check all slots for user threads that are waiting
+	       	on locks, and if they have exceeded the time limit. */
+
+		for (slot = lock_sys->waiting_threads;
+		     slot < lock_sys->last_slot;
+		     ++slot) {
+
+			/* We are doing a read without the lock mutex
+			and/or the trx mutex. This is OK because a slot
+		       	can't be freed or reserved without the lock wait
+		       	mutex. */
+
+			if (slot->in_use) {
+				lock_wait_check_and_cancel(slot);
+			}
+		}
+
+		sig_count = os_event_reset(srv_timeout_event);
+
+		lock_wait_mutex_exit();
+
+	} while (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
+
+	srv_lock_timeout_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/innobase/log/log0log.c b/storage/innobase/log/log0log.cc
index 8bae95f0a5d..ecbc6c59da1 100644
--- a/storage/innobase/log/log0log.c
+++ b/storage/innobase/log/log0log.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2009, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -24,7 +24,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file log/log0log.c
+@file log/log0log.cc
 Database log
 
 Created 12/9/1995 Heikki Tuuri
@@ -48,6 +48,7 @@ Created 12/9/1995 Heikki Tuuri
 #include "srv0start.h"
 #include "trx0sys.h"
 #include "trx0trx.h"
+#include "srv0mon.h"
 
 /*
 General philosophy of InnoDB redo-logs:
@@ -75,10 +76,6 @@ reduce the size of the log.
 
 */
 
-/* Current free limit of space 0; protected by the log sys mutex; 0 means
-uninitialized */
-UNIV_INTERN ulint	log_fsp_current_free_limit		= 0;
-
 /* Global log system variable */
 UNIV_INTERN log_t*	log_sys	= NULL;
 
@@ -164,42 +161,15 @@ log_io_complete_archive(void);
 #endif /* UNIV_LOG_ARCHIVE */
 
 /****************************************************************//**
-Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint,
-so that we know that the limit has been written to a log checkpoint field
-on disk. */
-UNIV_INTERN
-void
-log_fsp_current_free_limit_set_and_checkpoint(
-/*==========================================*/
-	ulint	limit)	/*!< in: limit to set */
-{
-	ibool	success;
-
-	mutex_enter(&(log_sys->mutex));
-
-	log_fsp_current_free_limit = limit;
-
-	mutex_exit(&(log_sys->mutex));
-
-	/* Try to make a synchronous checkpoint */
-
-	success = FALSE;
-
-	while (!success) {
-		success = log_checkpoint(TRUE, TRUE);
-	}
-}
-
-/****************************************************************//**
 Returns the oldest modified block lsn in the pool, or log_sys->lsn if none
 exists.
 @return	LSN of oldest modification */
 static
-ib_uint64_t
+lsn_t
 log_buf_pool_get_oldest_modification(void)
 /*======================================*/
 {
-	ib_uint64_t	lsn;
+	lsn_t	lsn;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 
@@ -218,7 +188,7 @@ Opens the log for log_write_low. The log must be closed with log_close and
 released with log_release.
 @return	start lsn of the log record */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 log_reserve_and_open(
 /*=================*/
 	ulint	len)	/*!< in: length of data to be catenated */
@@ -328,8 +298,10 @@ part_loop:
 	str_len -= len;
 	str = str + len;
 
-	log_block = ut_align_down(log->buf + log->buf_free,
-				  OS_FILE_LOG_BLOCK_SIZE);
+	log_block = static_cast<byte*>(
+		ut_align_down(
+			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));
+
 	log_block_set_data_len(log_block, data_len);
 
 	if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
@@ -362,24 +334,26 @@ part_loop:
 Closes the log.
 @return	lsn */
 UNIV_INTERN
-ib_uint64_t
+lsn_t
 log_close(void)
 /*===========*/
 {
 	byte*		log_block;
 	ulint		first_rec_group;
-	ib_uint64_t	oldest_lsn;
-	ib_uint64_t	lsn;
+	lsn_t		oldest_lsn;
+	lsn_t		lsn;
 	log_t*		log	= log_sys;
-	ib_uint64_t	checkpoint_age;
+	lsn_t		checkpoint_age;
 
 	ut_ad(mutex_own(&(log->mutex)));
 	ut_ad(!recv_no_log_write);
 
 	lsn = log->lsn;
 
-	log_block = ut_align_down(log->buf + log->buf_free,
-				  OS_FILE_LOG_BLOCK_SIZE);
+	log_block = static_cast<byte*>(
+		ut_align_down(
+			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));
+
 	first_rec_group = log_block_get_first_rec_group(log_block);
 
 	if (first_rec_group == 0) {
@@ -413,21 +387,21 @@ log_close(void)
 
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
-				"  InnoDB: ERROR: the age of the last"
-				" checkpoint is %lu,\n"
+				" InnoDB: ERROR: the age of the last"
+				" checkpoint is " LSN_PF ",\n"
 				"InnoDB: which exceeds the log group"
-				" capacity %lu.\n"
+				" capacity " LSN_PF ".\n"
 				"InnoDB: If you are using big"
 				" BLOB or TEXT rows, you must set the\n"
 				"InnoDB: combined size of log files"
 				" at least 10 times bigger than the\n"
 				"InnoDB: largest such row.\n",
-				(ulong) checkpoint_age,
-				(ulong) log->log_group_capacity);
+				checkpoint_age,
+				log->log_group_capacity);
 		}
 	}
 
-	if (checkpoint_age <= log->max_modified_age_async) {
+	if (checkpoint_age <= log->max_modified_age_sync) {
 
 		goto function_exit;
 	}
@@ -435,7 +409,7 @@ log_close(void)
 	oldest_lsn = buf_pool_get_oldest_modification();
 
 	if (!oldest_lsn
-	    || lsn - oldest_lsn > log->max_modified_age_async
+	    || lsn - oldest_lsn > log->max_modified_age_sync
 	    || checkpoint_age > log->max_checkpoint_age_async) {
 
 		log->check_flush_or_checkpoint = TRUE;
@@ -489,7 +463,7 @@ Calculates the data capacity of a log group, when the log file headers are not
 included.
 @return	capacity in bytes */
 UNIV_INTERN
-ulint
+lsn_t
 log_group_get_capacity(
 /*===================*/
 	const log_group_t*	group)	/*!< in: log group */
@@ -504,10 +478,10 @@ Calculates the offset within a log group, when the log file headers are not
 included.
 @return	size offset (<= offset) */
 UNIV_INLINE
-ulint
+lsn_t
 log_group_calc_size_offset(
 /*=======================*/
-	ulint			offset,	/*!< in: real offset within the
+	lsn_t			offset,	/*!< in: real offset within the
 					log group */
 	const log_group_t*	group)	/*!< in: log group */
 {
@@ -521,10 +495,10 @@ Calculates the offset within a log group, when the log file headers are
 included.
 @return	real offset (>= offset) */
 UNIV_INLINE
-ulint
+lsn_t
 log_group_calc_real_offset(
 /*=======================*/
-	ulint			offset,	/*!< in: size offset within the
+	lsn_t			offset,	/*!< in: size offset within the
 					log group */
 	const log_group_t*	group)	/*!< in: log group */
 {
@@ -538,36 +512,31 @@ log_group_calc_real_offset(
 Calculates the offset of an lsn within a log group.
 @return	offset within the log group */
 static
-ulint
+lsn_t
 log_group_calc_lsn_offset(
 /*======================*/
-	ib_uint64_t		lsn,	/*!< in: lsn, must be within 4 GB of
-					group->lsn */
+	lsn_t			lsn,	/*!< in: lsn */
 	const log_group_t*	group)	/*!< in: log group */
 {
-	ib_uint64_t	gr_lsn;
-	ib_int64_t	gr_lsn_size_offset;
-	ib_int64_t	difference;
-	ib_int64_t	group_size;
-	ib_int64_t	offset;
+	lsn_t	gr_lsn;
+	lsn_t	gr_lsn_size_offset;
+	lsn_t	difference;
+	lsn_t	group_size;
+	lsn_t	offset;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 
-	/* If total log file size is > 2 GB we can easily get overflows
-	with 32-bit integers. Use 64-bit integers instead. */
-
 	gr_lsn = group->lsn;
 
-	gr_lsn_size_offset = (ib_int64_t)
-		log_group_calc_size_offset(group->lsn_offset, group);
+	gr_lsn_size_offset = log_group_calc_size_offset(group->lsn_offset, group);
 
-	group_size = (ib_int64_t) log_group_get_capacity(group);
+	group_size = log_group_get_capacity(group);
 
 	if (lsn >= gr_lsn) {
 
-		difference = (ib_int64_t) (lsn - gr_lsn);
+		difference = lsn - gr_lsn;
 	} else {
-		difference = (ib_int64_t) (gr_lsn - lsn);
+		difference = gr_lsn - lsn;
 
 		difference = difference % group_size;
 
@@ -576,14 +545,13 @@ log_group_calc_lsn_offset(
 
 	offset = (gr_lsn_size_offset + difference) % group_size;
 
-	ut_a(offset < (((ib_int64_t) 1) << 32)); /* offset must be < 4 GB */
-
 	/* fprintf(stderr,
-	"Offset is %lu gr_lsn_offset is %lu difference is %lu\n",
-	(ulint)offset,(ulint)gr_lsn_size_offset, (ulint)difference);
+	"Offset is " LSN_PF " gr_lsn_offset is " LSN_PF
+	" difference is " LSN_PF "\n",
+	offset, gr_lsn_size_offset, difference);
 	*/
 
-	return(log_group_calc_real_offset((ulint)offset, group));
+	return(log_group_calc_real_offset(offset, group));
 }
 #endif /* !UNIV_HOTBACKUP */
 
@@ -615,9 +583,9 @@ log_calc_where_lsn_is(
 
 	if (lsn < first_header_lsn) {
 		add_this_many = 1 + (first_header_lsn - lsn)
-			/ (capacity * (ib_int64_t)n_log_files);
+			/ (capacity * (ib_int64_t) n_log_files);
 		lsn += add_this_many
-			* capacity * (ib_int64_t)n_log_files;
+			* capacity * (ib_int64_t) n_log_files;
 	}
 
 	ut_a(lsn >= first_header_lsn);
@@ -641,7 +609,7 @@ void
 log_group_set_fields(
 /*=================*/
 	log_group_t*	group,	/*!< in/out: group */
-	ib_uint64_t	lsn)	/*!< in: lsn for which the values should be
+	lsn_t		lsn)	/*!< in: lsn for which the values should be
 				set */
 {
 	group->lsn_offset = log_group_calc_lsn_offset(lsn, group);
@@ -659,12 +627,12 @@ log_calc_max_ages(void)
 /*===================*/
 {
 	log_group_t*	group;
-	ulint		margin;
+	lsn_t		margin;
 	ulint		free;
 	ibool		success		= TRUE;
-	ulint		smallest_capacity;
-	ulint		archive_margin;
-	ulint		smallest_archive_margin;
+	lsn_t		smallest_capacity;
+	lsn_t		archive_margin;
+	lsn_t		smallest_archive_margin;
 
 	mutex_enter(&(log_sys->mutex));
 
@@ -672,8 +640,8 @@ log_calc_max_ages(void)
 
 	ut_ad(group);
 
-	smallest_capacity = ULINT_MAX;
-	smallest_archive_margin = ULINT_MAX;
+	smallest_capacity = LSN_MAX;
+	smallest_archive_margin = LSN_MAX;
 
 	while (group) {
 		if (log_group_get_capacity(group) < smallest_capacity) {
@@ -711,8 +679,6 @@ log_calc_max_ages(void)
 		margin = smallest_capacity - free;
 	}
 
-	margin = ut_min(margin, log_sys->adm_checkpoint_interval);
-
 	margin = margin - margin / 10;	/* Add still some extra safety */
 
 	log_sys->log_group_capacity = smallest_capacity;
@@ -751,7 +717,7 @@ failure:
 			"InnoDB: " REFMAN "adding-and-removing.html\n"
 			"InnoDB: Cannot continue operation."
 			" Calling exit(1).\n",
-			(ulong)srv_thread_concurrency);
+			(ulong) srv_thread_concurrency);
 
 		exit(1);
 	}
@@ -766,7 +732,7 @@ void
 log_init(void)
 /*==========*/
 {
-	log_sys = mem_alloc(sizeof(log_t));
+	log_sys = static_cast<log_t*>(mem_alloc(sizeof(log_t)));
 
 	mutex_create(log_sys_mutex_key, &log_sys->mutex, SYNC_LOG);
 
@@ -781,15 +747,19 @@ log_init(void)
 
 	log_sys->lsn = LOG_START_LSN;
 
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    log_sys->lsn - log_sys->last_checkpoint_lsn);
+
 	ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE);
 	ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE);
 
-	log_sys->buf_ptr = mem_alloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE);
-	log_sys->buf = ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE);
+	log_sys->buf_ptr = static_cast<byte*>(
+		mem_zalloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE));
 
-	log_sys->buf_size = LOG_BUFFER_SIZE;
+	log_sys->buf = static_cast<byte*>(
+		ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
 
-	memset(log_sys->buf, '\0', LOG_BUFFER_SIZE);
+	log_sys->buf_size = LOG_BUFFER_SIZE;
 
 	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
 		- LOG_BUF_FLUSH_MARGIN;
@@ -822,19 +792,22 @@ log_init(void)
 	os_event_set(log_sys->one_flushed_event);
 
 	/*----------------------------*/
-	log_sys->adm_checkpoint_interval = ULINT_MAX;
 
 	log_sys->next_checkpoint_no = 0;
 	log_sys->last_checkpoint_lsn = log_sys->lsn;
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, 0);
 	log_sys->n_pending_checkpoint_writes = 0;
 
+
 	rw_lock_create(checkpoint_lock_key, &log_sys->checkpoint_lock,
 		       SYNC_NO_ORDER_CHECK);
 
-	log_sys->checkpoint_buf_ptr = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
-	log_sys->checkpoint_buf = ut_align(log_sys->checkpoint_buf_ptr,
-					   OS_FILE_LOG_BLOCK_SIZE);
-	memset(log_sys->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
+	log_sys->checkpoint_buf_ptr = static_cast<byte*>(
+		mem_zalloc(2 * OS_FILE_LOG_BLOCK_SIZE));
+
+	log_sys->checkpoint_buf = static_cast<byte*>(
+		ut_align(log_sys->checkpoint_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
+
 	/*----------------------------*/
 
 #ifdef UNIV_LOG_ARCHIVE
@@ -869,6 +842,9 @@ log_init(void)
 	log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
 	log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE;
 
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    log_sys->lsn - log_sys->last_checkpoint_lsn);
+
 	mutex_exit(&(log_sys->mutex));
 
 #ifdef UNIV_LOG_DEBUG
@@ -891,7 +867,7 @@ log_group_init(
 /*===========*/
 	ulint	id,			/*!< in: group id */
 	ulint	n_files,		/*!< in: number of log files */
-	ulint	file_size,		/*!< in: log file size in bytes */
+	lsn_t	file_size,		/*!< in: log file size in bytes */
 	ulint	space_id,		/*!< in: space id of the file space
 					which contains the log files of this
 					group */
@@ -906,7 +882,7 @@ log_group_init(
 
 	log_group_t*	group;
 
-	group = mem_alloc(sizeof(log_group_t));
+	group = static_cast<log_group_t*>(mem_alloc(sizeof(log_group_t)));
 
 	group->id = id;
 	group->n_files = n_files;
@@ -917,35 +893,35 @@ log_group_init(
 	group->lsn_offset = LOG_FILE_HDR_SIZE;
 	group->n_pending_writes = 0;
 
-	group->file_header_bufs_ptr = mem_alloc(sizeof(byte*) * n_files);
-	group->file_header_bufs = mem_alloc(sizeof(byte*) * n_files);
+	group->file_header_bufs_ptr = static_cast<byte**>(
+		mem_zalloc(sizeof(byte*) * n_files));
+
+	group->file_header_bufs = static_cast<byte**>(
+		mem_zalloc(sizeof(byte**) * n_files));
+
 #ifdef UNIV_LOG_ARCHIVE
-	group->archive_file_header_bufs_ptr = mem_alloc(
-		sizeof(byte*) * n_files);
-	group->archive_file_header_bufs = mem_alloc(sizeof(byte*) * n_files);
+	group->archive_file_header_bufs_ptr = static_cast<byte*>(
+		mem_zalloc( sizeof(byte*) * n_files));
+
+	group->archive_file_header_bufs = static_cast<byte*>(
+		mem_zalloc(sizeof(byte*) * n_files));
 #endif /* UNIV_LOG_ARCHIVE */
 
 	for (i = 0; i < n_files; i++) {
-		group->file_header_bufs_ptr[i] = mem_alloc(
-			LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
-
-		group->file_header_bufs[i] = ut_align(
-			group->file_header_bufs_ptr[i],
-			OS_FILE_LOG_BLOCK_SIZE);
+		group->file_header_bufs_ptr[i] = static_cast<byte*>(
+			mem_zalloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE));
 
-		memset(*(group->file_header_bufs + i), '\0',
-		       LOG_FILE_HDR_SIZE);
+		group->file_header_bufs[i] = static_cast<byte*>(
+			ut_align(group->file_header_bufs_ptr[i],
+				 OS_FILE_LOG_BLOCK_SIZE));
 
 #ifdef UNIV_LOG_ARCHIVE
-		group->archive_file_header_bufs_ptr[i] = mem_alloc(
-			LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+		group->archive_file_header_bufs_ptr[i] = static_cast<byte*>(
+			mem_zalloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE));
 
-		group->archive_file_header_bufs[i] = ut_align(
-			group->archive_file_header_bufs_ptr[i],
-			OS_FILE_LOG_BLOCK_SIZE);
-
-		memset(*(group->archive_file_header_bufs + i), '\0',
-		       LOG_FILE_HDR_SIZE);
+		group->archive_file_header_bufs[i] = static_cast<byte*>(
+			ut_align(group->archive_file_header_bufs_ptr[i],
+				 OS_FILE_LOG_BLOCK_SIZE));
 #endif /* UNIV_LOG_ARCHIVE */
 	}
 
@@ -956,11 +932,11 @@ log_group_init(
 	group->archived_offset = 0;
 #endif /* UNIV_LOG_ARCHIVE */
 
-	group->checkpoint_buf_ptr = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
-	group->checkpoint_buf = ut_align(group->checkpoint_buf_ptr,
-					 OS_FILE_LOG_BLOCK_SIZE);
+	group->checkpoint_buf_ptr = static_cast<byte*>(
+		mem_zalloc(2 * OS_FILE_LOG_BLOCK_SIZE));
 
-	memset(group->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
+	group->checkpoint_buf = static_cast<byte*>(
+		ut_align(group->checkpoint_buf_ptr,OS_FILE_LOG_BLOCK_SIZE));
 
 	UT_LIST_ADD_LAST(log_groups, log_sys->log_groups, group);
 
@@ -1086,7 +1062,7 @@ log_io_complete(
 	ulint	unlock;
 
 #ifdef UNIV_LOG_ARCHIVE
-	if ((byte*)group == &log_archive_io) {
+	if ((byte*) group == &log_archive_io) {
 		/* It was an archive write */
 
 		log_io_complete_archive();
@@ -1095,9 +1071,9 @@ log_io_complete(
 	}
 #endif /* UNIV_LOG_ARCHIVE */
 
-	if ((ulint)group & 0x1UL) {
+	if ((ulint) group & 0x1UL) {
 		/* It was a checkpoint write */
-		group = (log_group_t*)((ulint)group - 1);
+		group = (log_group_t*)((ulint) group - 1);
 
 		if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
 		    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
@@ -1135,6 +1111,7 @@ log_io_complete(
 
 	group->n_pending_writes--;
 	log_sys->n_pending_writes--;
+	MONITOR_DEC(MONITOR_PENDING_LOG_WRITE);
 
 	unlock = log_group_check_flush_completion(group);
 	unlock = unlock | log_sys_check_flush_completion();
@@ -1153,11 +1130,11 @@ log_group_file_header_flush(
 	log_group_t*	group,		/*!< in: log group */
 	ulint		nth_file,	/*!< in: header to the nth file in the
 					log file space */
-	ib_uint64_t	start_lsn)	/*!< in: log file data starts at this
+	lsn_t		start_lsn)	/*!< in: log file data starts at this
 					lsn */
 {
 	byte*	buf;
-	ulint	dest_offset;
+	lsn_t	dest_offset;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 	ut_ad(!recv_no_log_write);
@@ -1183,11 +1160,13 @@ log_group_file_header_flush(
 	if (log_do_write) {
 		log_sys->n_log_ios++;
 
+		MONITOR_INC(MONITOR_LOG_IO);
+
 		srv_os_log_pending_writes++;
 
 		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, 0,
-		       dest_offset / UNIV_PAGE_SIZE,
-		       dest_offset % UNIV_PAGE_SIZE,
+		       (ulint) (dest_offset / UNIV_PAGE_SIZE),
+		       (ulint) (dest_offset % UNIV_PAGE_SIZE),
 		       OS_FILE_LOG_BLOCK_SIZE,
 		       buf, group);
 
@@ -1218,7 +1197,7 @@ log_group_write_buf(
 	byte*		buf,		/*!< in: buffer */
 	ulint		len,		/*!< in: buffer len; must be divisible
 					by OS_FILE_LOG_BLOCK_SIZE */
-	ib_uint64_t	start_lsn,	/*!< in: start lsn of the buffer; must
+	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
 					be divisible by
 					OS_FILE_LOG_BLOCK_SIZE */
 	ulint		new_data_offset)/*!< in: start offset of new data in
@@ -1226,15 +1205,15 @@ log_group_write_buf(
 					if we have to write a new log file
 					header */
 {
-	ulint	write_len;
-	ibool	write_header;
-	ulint	next_offset;
-	ulint	i;
+	ulint		write_len;
+	ibool		write_header;
+	lsn_t		next_offset;
+	ulint		i;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 	ut_ad(!recv_no_log_write);
 	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
-	ut_a(((ulint) start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
 
 	if (new_data_offset == 0) {
 		write_header = TRUE;
@@ -1253,17 +1232,21 @@ loop:
 	    && write_header) {
 		/* We start to write a new log file instance in the group */
 
-		log_group_file_header_flush(group,
-					    next_offset / group->file_size,
+		ut_a(next_offset / group->file_size <= ULINT_MAX);
+
+		log_group_file_header_flush(group, (ulint)
+					    (next_offset / group->file_size),
 					    start_lsn);
-		srv_os_log_written+= OS_FILE_LOG_BLOCK_SIZE;
+		srv_os_log_written += OS_FILE_LOG_BLOCK_SIZE;
 		srv_log_writes++;
 	}
 
 	if ((next_offset % group->file_size) + len > group->file_size) {
 
-		write_len = group->file_size
-			- (next_offset % group->file_size);
+		/* if the above condition holds, then the below expression
+		is < len which is ulint, so the typecast is ok */
+		write_len = (ulint)
+			(group->file_size - (next_offset % group->file_size));
 	} else {
 		write_len = len;
 	}
@@ -1273,11 +1256,11 @@ loop:
 
 		fprintf(stderr,
 			"Writing log file segment to group %lu"
-			" offset %lu len %lu\n"
-			"start lsn %llu\n"
+			" offset " LSN_PF " len %lu\n"
+			"start lsn " LSN_PF "\n"
 			"First block n:o %lu last block n:o %lu\n",
-			(ulong) group->id, (ulong) next_offset,
-			(ulong) write_len,
+			(ulong) group->id, next_offset,
+			write_len,
 			start_lsn,
 			(ulong) log_block_get_hdr_no(buf),
 			(ulong) log_block_get_hdr_no(
@@ -1303,15 +1286,20 @@ loop:
 	if (log_do_write) {
 		log_sys->n_log_ios++;
 
+		MONITOR_INC(MONITOR_LOG_IO);
+
 		srv_os_log_pending_writes++;
 
+		ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
+
 		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, 0,
-		       next_offset / UNIV_PAGE_SIZE,
-		       next_offset % UNIV_PAGE_SIZE, write_len, buf, group);
+		       (ulint) (next_offset / UNIV_PAGE_SIZE),
+		       (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf,
+		       group);
 
 		srv_os_log_pending_writes--;
 
-		srv_os_log_written+= write_len;
+		srv_os_log_written += write_len;
 		srv_log_writes++;
 	}
 
@@ -1335,14 +1323,14 @@ UNIV_INTERN
 void
 log_write_up_to(
 /*============*/
-	ib_uint64_t	lsn,	/*!< in: log sequence number up to which
-				the log should be written,
-				IB_ULONGLONG_MAX if not specified */
-	ulint		wait,	/*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
-				or LOG_WAIT_ALL_GROUPS */
-	ibool		flush_to_disk)
-				/*!< in: TRUE if we want the written log
-				also to be flushed to disk */
+	lsn_t	lsn,	/*!< in: log sequence number up to which
+			the log should be written,
+			IB_ULONGLONG_MAX if not specified */
+	ulint	wait,	/*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+			or LOG_WAIT_ALL_GROUPS */
+	ibool	flush_to_disk)
+			/*!< in: TRUE if we want the written log
+			also to be flushed to disk */
 {
 	log_group_t*	group;
 	ulint		start_offset;
@@ -1401,7 +1389,7 @@ loop:
 		if (flush_to_disk
 		    && log_sys->current_flush_lsn >= lsn) {
 			/* The write + flush will write enough: wait for it to
-			complete  */
+			complete */
 
 			goto do_waits;
 		}
@@ -1409,7 +1397,7 @@ loop:
 		if (!flush_to_disk
 		    && log_sys->write_lsn >= lsn) {
 			/* The write will write enough: wait for it to
-			complete  */
+			complete */
 
 			goto do_waits;
 		}
@@ -1436,12 +1424,13 @@ loop:
 #ifdef UNIV_DEBUG
 	if (log_debug_writes) {
 		fprintf(stderr,
-			"Writing log from %llu up to lsn %llu\n",
+			"Writing log from " LSN_PF " up to lsn " LSN_PF "\n",
 			log_sys->written_to_all_lsn,
 			log_sys->lsn);
 	}
 #endif /* UNIV_DEBUG */
 	log_sys->n_pending_writes++;
+	MONITOR_INC(MONITOR_PENDING_LOG_WRITE);
 
 	group = UT_LIST_GET_FIRST(log_sys->log_groups);
 	group->n_pending_writes++;	/*!< We assume here that we have only
@@ -1524,6 +1513,7 @@ loop:
 
 	group->n_pending_writes--;
 	log_sys->n_pending_writes--;
+	MONITOR_DEC(MONITOR_PENDING_LOG_WRITE);
 
 	unlock = log_group_check_flush_completion(group);
 	unlock = unlock | log_sys_check_flush_completion();
@@ -1560,7 +1550,7 @@ void
 log_buffer_flush_to_disk(void)
 /*==========================*/
 {
-	ib_uint64_t	lsn;
+	lsn_t	lsn;
 
 	mutex_enter(&(log_sys->mutex));
 
@@ -1582,7 +1572,7 @@ log_buffer_sync_in_background(
 /*==========================*/
 	ibool	flush)	/*!< in: flush the logs to disk */
 {
-	ib_uint64_t	lsn;
+	lsn_t	lsn;
 
 	mutex_enter(&(log_sys->mutex));
 
@@ -1602,8 +1592,8 @@ void
 log_flush_margin(void)
 /*==================*/
 {
-	log_t*		log	= log_sys;
-	ib_uint64_t	lsn	= 0;
+	log_t*	log	= log_sys;
+	lsn_t	lsn	= 0;
 
 	mutex_enter(&(log->mutex));
 
@@ -1630,15 +1620,12 @@ buffer pool. NOTE: this function may only be called if the calling thread owns
 no synchronization objects!
 @return FALSE if there was a flush batch of the same type running,
 which means that we could not start this flush batch */
-UNIV_INTERN
+static
 ibool
 log_preflush_pool_modified_pages(
 /*=============================*/
-	ib_uint64_t	new_oldest,	/*!< in: try to advance
-					oldest_modified_lsn at least
-					to this lsn */
-	ibool		sync)		/*!< in: TRUE if synchronous
-					operation is desired */
+	lsn_t	new_oldest)	/*!< in: try to advance oldest_modified_lsn
+				at least to this lsn */
 {
 	ulint	n_pages;
 
@@ -1657,15 +1644,19 @@ log_preflush_pool_modified_pages(
 
 	n_pages = buf_flush_list(ULINT_MAX, new_oldest);
 
-	if (sync) {
-		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
-	}
+	buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
 
 	if (n_pages == ULINT_UNDEFINED) {
 
 		return(FALSE);
 	}
 
+	MONITOR_INC_VALUE_CUMULATIVE(
+		MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+		MONITOR_FLUSH_SYNC_COUNT,
+		MONITOR_FLUSH_SYNC_PAGES,
+		n_pages);
+
 	return(TRUE);
 }
 
@@ -1682,6 +1673,8 @@ log_complete_checkpoint(void)
 	log_sys->next_checkpoint_no++;
 
 	log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn;
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    log_sys->lsn - log_sys->last_checkpoint_lsn);
 
 	rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT);
 }
@@ -1698,6 +1691,7 @@ log_io_complete_checkpoint(void)
 	ut_ad(log_sys->n_pending_checkpoint_writes > 0);
 
 	log_sys->n_pending_checkpoint_writes--;
+	MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
 
 	if (log_sys->n_pending_checkpoint_writes == 0) {
 		log_complete_checkpoint();
@@ -1757,6 +1751,7 @@ log_group_checkpoint(
 	ib_uint64_t	archived_lsn;
 	ib_uint64_t	next_archived_lsn;
 #endif /* UNIV_LOG_ARCHIVE */
+	lsn_t		lsn_offset;
 	ulint		write_offset;
 	ulint		fold;
 	byte*		buf;
@@ -1772,9 +1767,12 @@ log_group_checkpoint(
 	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
 	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
 
-	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
-			log_group_calc_lsn_offset(
-				log_sys->next_checkpoint_lsn, group));
+	lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn,
+					       group);
+	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_LOW32,
+			lsn_offset & 0xFFFFFFFFUL);
+	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_HIGH32,
+			lsn_offset >> 32);
 
 	mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);
 
@@ -1821,15 +1819,6 @@ log_group_checkpoint(
 			      LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
 	mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold);
 
-	/* Starting from InnoDB-3.23.50, we also write info on allocated
-	size in the tablespace */
-
-	mach_write_to_4(buf + LOG_CHECKPOINT_FSP_FREE_LIMIT,
-			log_fsp_current_free_limit);
-
-	mach_write_to_4(buf + LOG_CHECKPOINT_FSP_MAGIC_N,
-			LOG_CHECKPOINT_FSP_MAGIC_N_VAL);
-
 	/* We alternate the physical place of the checkpoint info in the first
 	log file */
 
@@ -1847,9 +1836,12 @@ log_group_checkpoint(
 		}
 
 		log_sys->n_pending_checkpoint_writes++;
+		MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);
 
 		log_sys->n_log_ios++;
 
+		MONITOR_INC(MONITOR_LOG_IO);
+
 		/* We send as the last parameter the group machine address
 		added with 1, as we want to distinguish between a normal log
 		file write and a checkpoint field write */
@@ -1858,9 +1850,9 @@ log_group_checkpoint(
 		       write_offset / UNIV_PAGE_SIZE,
 		       write_offset % UNIV_PAGE_SIZE,
 		       OS_FILE_LOG_BLOCK_SIZE,
-		       buf, ((byte*)group + 1));
+		       buf, ((byte*) group + 1));
 
-		ut_ad(((ulint)group & 0x1UL) == 0);
+		ut_ad(((ulint) group & 0x1UL) == 0);
 	}
 }
 #endif /* !UNIV_HOTBACKUP */
@@ -1899,8 +1891,9 @@ log_reset_first_header_and_checkpoint(
 	mach_write_to_8(buf + LOG_CHECKPOINT_NO, 0);
 	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn);
 
-	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
+	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_LOW32,
 			LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
+	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_HIGH32, 0);
 
 	mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024);
 
@@ -1933,6 +1926,8 @@ log_group_read_checkpoint_info(
 
 	log_sys->n_log_ios++;
 
+	MONITOR_INC(MONITOR_LOG_IO);
+
 	fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->space_id, 0,
 	       field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE,
 	       OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
@@ -1977,7 +1972,7 @@ log_checkpoint(
 				parameter TRUE, a physical write will always be
 				made to log files */
 {
-	ib_uint64_t	oldest_lsn;
+	lsn_t	oldest_lsn;
 
 	if (recv_recovery_is_on()) {
 		recv_apply_hashed_log_recs(TRUE);
@@ -2034,14 +2029,17 @@ log_checkpoint(
 
 #ifdef UNIV_DEBUG
 	if (log_debug_writes) {
-		fprintf(stderr, "Making checkpoint no %lu at lsn %llu\n",
-			(ulong) log_sys->next_checkpoint_no,
+		fprintf(stderr, "Making checkpoint no "
+			LSN_PF " at lsn " LSN_PF "\n",
+			log_sys->next_checkpoint_no,
 			oldest_lsn);
 	}
 #endif /* UNIV_DEBUG */
 
 	log_groups_write_checkpoint_info();
 
+	MONITOR_INC(MONITOR_NUM_CHECKPOINT);
+
 	mutex_exit(&(log_sys->mutex));
 
 	if (sync) {
@@ -2059,22 +2057,58 @@ UNIV_INTERN
 void
 log_make_checkpoint_at(
 /*===================*/
-	ib_uint64_t	lsn,		/*!< in: make a checkpoint at this or a
-					later lsn, if IB_ULONGLONG_MAX, makes
-					a checkpoint at the latest lsn */
-	ibool		write_always)	/*!< in: the function normally checks if
-					the new checkpoint would have a
-					greater lsn than the previous one: if
-					not, then no physical write is done;
-					by setting this parameter TRUE, a
-					physical write will always be made to
-					log files */
+	lsn_t	lsn,		/*!< in: make a checkpoint at this or a
+				later lsn, if IB_ULONGLONG_MAX, makes
+				a checkpoint at the latest lsn */
+	ibool	write_always)	/*!< in: the function normally checks if
+				the new checkpoint would have a
+				greater lsn than the previous one: if
+				not, then no physical write is done;
+				by setting this parameter TRUE, a
+				physical write will always be made to
+				log files */
 {
 	/* Preflush pages synchronously */
 
-	while (!log_preflush_pool_modified_pages(lsn, TRUE));
+	while (!log_preflush_pool_modified_pages(lsn)) {
+		/* Flush as much as we can */
+	}
+
+	while (!log_checkpoint(TRUE, write_always)) {
+		/* Force a checkpoint */
+	}
+}
+
+/****************************************************************//**
+Checks if an asynchronous flushing of dirty pages is required in the
+background. This function is only called from the page cleaner thread.
+@return lsn to which the flushing should happen or LSN_MAX
+if flushing is not required */
+UNIV_INTERN
+lsn_t
+log_async_flush_lsn(void)
+/*=====================*/
+{
+	lsn_t	age;
+	lsn_t	oldest_lsn;
+	lsn_t	new_lsn = LSN_MAX;
+
+	mutex_enter(&log_sys->mutex);
+
+	oldest_lsn = log_buf_pool_get_oldest_modification();
+
+	ut_a(log_sys->lsn >= oldest_lsn);
+	age = log_sys->lsn - oldest_lsn;
+
+	if (age > log_sys->max_modified_age_async) {
+		/* An asynchronous preflush is required */
+		ut_a(log_sys->lsn >= log_sys->max_modified_age_async);
+		new_lsn = log_sys->lsn - log_sys->max_modified_age_async;
+	}
 
-	while (!log_checkpoint(TRUE, write_always));
+	mutex_exit(&log_sys->mutex);
+
+	return(new_lsn);
 }
 
 /****************************************************************//**
@@ -2088,18 +2122,17 @@ log_checkpoint_margin(void)
 /*=======================*/
 {
 	log_t*		log		= log_sys;
-	ib_uint64_t	age;
-	ib_uint64_t	checkpoint_age;
+	lsn_t		age;
+	lsn_t		checkpoint_age;
 	ib_uint64_t	advance;
-	ib_uint64_t	oldest_lsn;
-	ibool		sync;
+	lsn_t		oldest_lsn;
 	ibool		checkpoint_sync;
 	ibool		do_checkpoint;
 	ibool		success;
 loop:
-	sync = FALSE;
 	checkpoint_sync = FALSE;
 	do_checkpoint = FALSE;
+	advance = 0;
 
 	mutex_enter(&(log->mutex));
 	ut_ad(!recv_no_log_write);
@@ -2117,15 +2150,7 @@ loop:
 	if (age > log->max_modified_age_sync) {
 
 		/* A flush is urgent: we have to do a synchronous preflush */
-
-		sync = TRUE;
 		advance = 2 * (age - log->max_modified_age_sync);
-	} else if (age > log->max_modified_age_async) {
-
-		/* A flush is not urgent: we do an asynchronous preflush */
-		advance = age - log->max_modified_age_async;
-	} else {
-		advance = 0;
 	}
 
 	checkpoint_age = log->lsn - log->last_checkpoint_lsn;
@@ -2150,17 +2175,14 @@ loop:
 	mutex_exit(&(log->mutex));
 
 	if (advance) {
-		ib_uint64_t	new_oldest = oldest_lsn + advance;
+		lsn_t	new_oldest = oldest_lsn + advance;
 
-		success = log_preflush_pool_modified_pages(new_oldest, sync);
+		success = log_preflush_pool_modified_pages(new_oldest);
 
 		/* If the flush succeeded, this thread has done its part
 		and can proceed. If it did not succeed, there was another
-		thread doing a flush at the same time. If sync was FALSE,
-		the flush was not urgent, and we let this thread proceed.
-		Otherwise, we let it start from the beginning again. */
-
-		if (sync && !success) {
+		thread doing a flush at the same time. */
+		if (!success) {
 			mutex_enter(&(log->mutex));
 
 			log->check_flush_or_checkpoint = TRUE;
@@ -2189,11 +2211,11 @@ log_group_read_log_seg(
 	ulint		type,		/*!< in: LOG_ARCHIVE or LOG_RECOVER */
 	byte*		buf,		/*!< in: buffer where to read */
 	log_group_t*	group,		/*!< in: log group */
-	ib_uint64_t	start_lsn,	/*!< in: read area start */
-	ib_uint64_t	end_lsn)	/*!< in: read area end */
+	lsn_t		start_lsn,	/*!< in: read area start */
+	lsn_t		end_lsn)	/*!< in: read area end */
 {
 	ulint	len;
-	ulint	source_offset;
+	lsn_t	source_offset;
 	ibool	sync;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
@@ -2202,13 +2224,17 @@ log_group_read_log_seg(
 loop:
 	source_offset = log_group_calc_lsn_offset(start_lsn, group);
 
+	ut_a(end_lsn - start_lsn <= ULINT_MAX);
 	len = (ulint) (end_lsn - start_lsn);
 
 	ut_ad(len != 0);
 
 	if ((source_offset % group->file_size) + len > group->file_size) {
 
-		len = group->file_size - (source_offset % group->file_size);
+		/* If the above condition is true then len (which is ulint)
+		is > the expression below, so the typecast is ok */
+		len = (ulint) (group->file_size -
+			(source_offset % group->file_size));
 	}
 
 #ifdef UNIV_LOG_ARCHIVE
@@ -2220,8 +2246,13 @@ loop:
 
 	log_sys->n_log_ios++;
 
+	MONITOR_INC(MONITOR_LOG_IO);
+
+	ut_a(source_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
+
 	fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0,
-	       source_offset / UNIV_PAGE_SIZE, source_offset % UNIV_PAGE_SIZE,
+	       (ulint) (source_offset / UNIV_PAGE_SIZE),
+	       (ulint) (source_offset % UNIV_PAGE_SIZE),
 	       len, buf, NULL);
 
 	start_lsn += len;
@@ -2281,6 +2312,8 @@ log_group_archive_file_header_write(
 
 	log_sys->n_log_ios++;
 
+	MONITOR_INC(MONITOR_LOG_IO);
+
 	fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id,
 	       dest_offset / UNIV_PAGE_SIZE,
 	       dest_offset % UNIV_PAGE_SIZE,
@@ -2314,6 +2347,8 @@ log_group_archive_completed_header_write(
 
 	log_sys->n_log_ios++;
 
+	MONITOR_INC(MONITOR_LOG_IO);
+
 	fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id,
 	       dest_offset / UNIV_PAGE_SIZE,
 	       dest_offset % UNIV_PAGE_SIZE,
@@ -2330,14 +2365,14 @@ log_group_archive(
 /*==============*/
 	log_group_t*	group)	/*!< in: log group */
 {
-	os_file_t	 file_handle;
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
+	os_file_t	file_handle;
+	lsn_t		start_lsn;
+	lsn_t		end_lsn;
 	char		name[1024];
 	byte*		buf;
 	ulint		len;
 	ibool		ret;
-	ulint		next_offset;
+	lsn_t		next_offset;
 	ulint		n_files;
 	ulint		open_mode;
 
@@ -2430,7 +2465,7 @@ loop:
 #ifdef UNIV_DEBUG
 	if (log_debug_writes) {
 		fprintf(stderr,
-			"Archiving starting at lsn %llu, len %lu"
+			"Archiving starting at lsn " LSN_PF ", len %lu"
 			" to group %lu\n",
 			start_lsn,
 			(ulong) len, (ulong) group->id);
@@ -2441,8 +2476,11 @@ loop:
 
 	log_sys->n_log_ios++;
 
+	MONITOR_INC(MONITOR_LOG_IO);
+
 	fil_io(OS_FILE_WRITE | OS_FILE_LOG, FALSE, group->archive_space_id,
-	       next_offset / UNIV_PAGE_SIZE, next_offset % UNIV_PAGE_SIZE,
+	       (ulint) (next_offset / UNIV_PAGE_SIZE),
+	       (ulint) (next_offset % UNIV_PAGE_SIZE),
 	       ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf,
 	       &log_archive_io);
 
@@ -2711,7 +2749,7 @@ arch_none:
 #ifdef UNIV_DEBUG
 	if (log_debug_writes) {
 		fprintf(stderr,
-			"Archiving from lsn %llu to lsn %llu\n",
+			"Archiving from lsn " LSN_PF " to lsn " LSN_PF "\n",
 			log_sys->archived_lsn, limit_lsn);
 	}
 #endif /* UNIV_DEBUG */
@@ -3076,16 +3114,18 @@ void
 logs_empty_and_mark_files_at_shutdown(void)
 /*=======================================*/
 {
-	ib_uint64_t		lsn;
+	lsn_t			lsn;
 	ulint			arch_log_no;
-	ibool			server_busy;
 	ulint			count = 0;
+	ulint			total_trx;
 	ulint			pending_io;
-	ulint			active_thd;
+	enum srv_thread_type	active_thd;
+	const char*		thread_name;
+	ibool			server_busy;
 
 	if (srv_print_verbose_log) {
 		ut_print_timestamp(stderr);
-		fprintf(stderr, "  InnoDB: Starting shutdown...\n");
+		fprintf(stderr, " InnoDB: Starting shutdown...\n");
 	}
 	/* Wait until the master thread and all other operations are idle: our
 	algorithm only works if the server is idle at shutdown */
@@ -3096,40 +3136,20 @@ loop:
 
 	count++;
 
-	mutex_enter(&kernel_mutex);
-
 	/* We need the monitor threads to stop before we proceed with
 	a shutdown. */
 
-	if (srv_error_monitor_active
-	    || srv_lock_timeout_active
-	    || srv_monitor_active) {
-		const char*	thread_active = NULL;
+	thread_name = srv_any_background_threads_are_active();
 
+	if (thread_name != NULL) {
 		/* Print a message every 60 seconds if we are waiting
-		for the monitor thread to exit. Master and worker threads
-		check will be done later. */
-		if (srv_print_verbose_log && count > 600) {
-
-		       if (srv_error_monitor_active) {
-			       thread_active = "srv_error_monitor_thread";
-		       } else if (srv_lock_timeout_active) {
-			       thread_active = "srv_lock_timeout thread";
-		       } else if (srv_monitor_active) {
-			       thread_active = "srv_monitor_thread";
-		       }
-		}
+		for the monitor thread to exit. Master and worker
+		threads check will be done later. */
 
-		mutex_exit(&kernel_mutex);
-
-		os_event_set(srv_error_event);
-		os_event_set(srv_monitor_event);
-		os_event_set(srv_timeout_event);
-
-		if (thread_active) {
+		if (srv_print_verbose_log && count > 600) {
 			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for %s to exit\n",
-				thread_active);
+			fprintf(stderr, " InnoDB: Waiting for %s to exit\n",
+				thread_name);
 			count = 0;
 		}
 
@@ -3141,32 +3161,31 @@ loop:
 	shutdown, because the InnoDB layer may have committed or
 	prepared transactions and we don't want to lose them. */
 
-	server_busy = trx_n_mysql_transactions > 0
-		|| UT_LIST_GET_LEN(trx_sys->trx_list) > trx_n_prepared;
-
-	if (server_busy) {
-		ulint	total_trx = UT_LIST_GET_LEN(trx_sys->trx_list)
-				    + trx_n_mysql_transactions;
+	total_trx = trx_sys_any_active_transactions();
 
-		mutex_exit(&kernel_mutex);
+	if (total_trx > 0) {
 
 		if (srv_print_verbose_log && count > 600) {
 			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for %lu "
+			fprintf(stderr, " InnoDB: Waiting for %lu "
 				"active transactions to finish\n",
 				(ulong) total_trx);
+
 			count = 0;
 		}
 
 		goto loop;
 	}
 
-	mutex_exit(&kernel_mutex);
-
 	/* Check that the background threads are suspended */
+
 	active_thd = srv_get_active_thread_type();
 
-	if (active_thd != ULINT_UNDEFINED) {
+	if (active_thd != SRV_NONE) {
+
+		if (active_thd == SRV_PURGE) {
+			srv_purge_wakeup();
+		}
 
 		/* The srv_lock_timeout_thread, srv_error_monitor_thread
 		and srv_monitor_thread should already exit by now. The
@@ -3174,19 +3193,28 @@ loop:
 		and worker threads (purge threads). Print the thread
 		type if any of such threads not in suspended mode */
 		if (srv_print_verbose_log && count > 600) {
-			const char*     thread_type = "<null>";
+			const char*	thread_type = "<null>";
 
 			switch (active_thd) {
+			case SRV_NONE:
+				/* This shouldn't happen because we've
+				already checked for this case before
+				entering the if(). We handle it here
+				to avoid a compiler warning. */
+				ut_error;
 			case SRV_WORKER:
 				thread_type = "worker threads";
 				break;
 			case SRV_MASTER:
 				thread_type = "master thread";
 				break;
+			case SRV_PURGE:
+				thread_type = "purge thread";
+				break;
 			}
 
 			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for %s "
+			fprintf(stderr, " InnoDB: Waiting for %s "
 				"to be suspended\n", thread_type);
 			count = 0;
 		}
@@ -3194,6 +3222,23 @@ loop:
 		goto loop;
 	}
 
+	/* At this point only page_cleaner should be active. We wait
+	here to let it complete the flushing of the buffer pools
+	before proceeding further. */
+	srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
+	count = 0;
+	while (buf_page_cleaner_is_active) {
+		++count;
+		os_thread_sleep(100000);
+		if (srv_print_verbose_log && count > 600) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Waiting for page_cleaner to "
+				"finish flushing of buffer pool\n");
+			count = 0;
+		}
+	}
+
 	mutex_enter(&log_sys->mutex);
 	server_busy = log_sys->n_pending_checkpoint_writes
 #ifdef UNIV_LOG_ARCHIVE
@@ -3206,22 +3251,21 @@ loop:
 		if (srv_print_verbose_log && count > 600) {
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
-				"  InnoDB: Pending checkpoint_writes: %lu\n"
-				"  InnoDB: Pending log flush writes: %lu\n",
+				" InnoDB: Pending checkpoint_writes: %lu\n"
+				" InnoDB: Pending log flush writes: %lu\n",
 				(ulong) log_sys->n_pending_checkpoint_writes,
 				(ulong) log_sys->n_pending_writes);
 			count = 0;
 		}
-
 		goto loop;
 	}
 
-	pending_io = buf_pool_check_num_pending_io();
+	pending_io = buf_pool_check_no_pending_io();
 
 	if (pending_io) {
 		if (srv_print_verbose_log && count > 600) {
 			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for %lu buffer page "
+			fprintf(stderr, " InnoDB: Waiting for %lu buffer page "
 				"I/Os to complete\n",
 				(ulong) pending_io);
 			count = 0;
@@ -3230,44 +3274,45 @@ loop:
 		goto loop;
 	}
 
-
 #ifdef UNIV_LOG_ARCHIVE
 	log_archive_all();
 #endif /* UNIV_LOG_ARCHIVE */
 	if (srv_fast_shutdown == 2) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: MySQL has requested a very fast shutdown"
+			" InnoDB: MySQL has requested a very fast shutdown"
 			" without flushing "
 			"the InnoDB buffer pool to data files."
 			" At the next mysqld startup "
 			"InnoDB will do a crash recovery!\n");
 
-		/* In this fastest shutdown we do not flush the buffer
-		pool: it is essentially a 'crash' of the InnoDB
-		server. Make sure that the log is all flushed to disk,
-		so that we can recover all committed transactions in a
-		crash recovery. We must not write the lsn stamps to
-		the data files, since at a startup InnoDB deduces from
-		the stamps if the previous shutdown was clean. */
+		/* In this fastest shutdown we do not flush the buffer pool:
+		it is essentially a 'crash' of the InnoDB server. Make sure
+		that the log is all flushed to disk, so that we can recover
+		all committed transactions in a crash recovery. We must not
+		write the lsn stamps to the data files, since at a startup
+		InnoDB deduces from the stamps if the previous shutdown was
+		clean. */
 
 		log_buffer_flush_to_disk();
 
 		/* Check that the background threads stay suspended */
-		if (srv_get_active_thread_type() != ULINT_UNDEFINED) {
+		thread_name = srv_any_background_threads_are_active();
+		if (thread_name != NULL) {
 			fprintf(stderr,
-				"InnoDB: Warning: some background thread"
-				" woke up during shutdown\n");
+				"InnoDB: Warning: background thread %s"
+				" woke up during shutdown\n", thread_name);
 			goto loop;
 		}
 
 		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
 		fil_close_all_files();
-		ut_a(srv_get_active_thread_type() == ULINT_UNDEFINED);
+		thread_name = srv_any_background_threads_are_active();
+		ut_a(!thread_name);
 		return;
 	}
 
-	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+	log_make_checkpoint_at(LSN_MAX, TRUE);
 
 	mutex_enter(&log_sys->mutex);
 
@@ -3301,10 +3346,11 @@ loop:
 	mutex_exit(&log_sys->mutex);
 
 	/* Check that the background threads stay suspended */
-	if (srv_get_active_thread_type() != ULINT_UNDEFINED) {
+	thread_name = srv_any_background_threads_are_active();
+	if (thread_name != NULL) {
 		fprintf(stderr,
-			"InnoDB: Warning: some background thread woke up"
-			" during shutdown\n");
+			"InnoDB: Warning: background thread %s"
+			" woke up during shutdown\n", thread_name);
 
 		goto loop;
 	}
@@ -3332,7 +3378,7 @@ loop:
 	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
 
 	/* Make some checks that the server really is quiet */
-	ut_a(srv_get_active_thread_type() == ULINT_UNDEFINED);
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
 
 	ut_a(buf_all_freed());
 	ut_a(lsn == log_sys->lsn);
@@ -3340,8 +3386,8 @@ loop:
 	if (lsn < srv_start_lsn) {
 		fprintf(stderr,
 			"InnoDB: Error: log sequence number"
-			" at shutdown %llu\n"
-			"InnoDB: is lower than at startup %llu!\n",
+			" at shutdown " LSN_PF "\n"
+			"InnoDB: is lower than at startup " LSN_PF "!\n",
 			lsn, srv_start_lsn);
 	}
 
@@ -3354,7 +3400,7 @@ loop:
 	fil_close_all_files();
 
 	/* Make some checks that the server really is quiet */
-	ut_a(srv_get_active_thread_type() == ULINT_UNDEFINED);
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
 
 	ut_a(buf_all_freed());
 	ut_a(lsn == log_sys->lsn);
@@ -3419,7 +3465,7 @@ UNIV_INTERN
 ibool
 log_peek_lsn(
 /*=========*/
-	ib_uint64_t*	lsn)	/*!< out: if returns TRUE, current lsn is here */
+	lsn_t*	lsn)	/*!< out: if returns TRUE, current lsn is here */
 {
 	if (0 == mutex_enter_nowait(&(log_sys->mutex))) {
 		*lsn = log_sys->lsn;
@@ -3446,24 +3492,31 @@ log_print(
 	mutex_enter(&(log_sys->mutex));
 
 	fprintf(file,
-		"Log sequence number %llu\n"
-		"Log flushed up to   %llu\n"
-		"Last checkpoint at  %llu\n",
+		"Log sequence number " LSN_PF "\n"
+		"Log flushed up to   " LSN_PF "\n"
+		"Pages flushed up to " LSN_PF "\n"
+		"Last checkpoint at  " LSN_PF "\n",
 		log_sys->lsn,
 		log_sys->flushed_to_disk_lsn,
+		log_buf_pool_get_oldest_modification(),
 		log_sys->last_checkpoint_lsn);
 
 	current_time = time(NULL);
 
-	time_elapsed = 0.001 + difftime(current_time,
-					log_sys->last_printout_time);
+	time_elapsed = difftime(current_time,
+				log_sys->last_printout_time);
+
+	if (time_elapsed <= 0) {
+		time_elapsed = 1;
+	}
+
 	fprintf(file,
 		"%lu pending log writes, %lu pending chkp writes\n"
 		"%lu log i/o's done, %.2f log i/o's/second\n",
 		(ulong) log_sys->n_pending_writes,
 		(ulong) log_sys->n_pending_checkpoint_writes,
 		(ulong) log_sys->n_log_ios,
-		((log_sys->n_log_ios - log_sys->n_log_ios_old)
+		((double)(log_sys->n_log_ios - log_sys->n_log_ios_old)
 		 / time_elapsed));
 
 	log_sys->n_log_ios_old = log_sys->n_log_ios;
diff --git a/storage/innobase/log/log0recv.c b/storage/innobase/log/log0recv.cc
index f9e0fecb6c6..9919ec2a80a 100644
--- a/storage/innobase/log/log0recv.c
+++ b/storage/innobase/log/log0recv.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file log/log0recv.c
+@file log/log0recv.cc
 Recovery
 
 Created 9/20/1997 Heikki Tuuri
@@ -42,6 +42,8 @@ Created 9/20/1997 Heikki Tuuri
 #include "trx0undo.h"
 #include "trx0rec.h"
 #include "fil0fil.h"
+#include "buf0dblwr.h"
+#include "srv0mon.h"
 #ifndef UNIV_HOTBACKUP
 # include "buf0rea.h"
 # include "srv0srv.h"
@@ -146,7 +148,7 @@ UNIV_INTERN ulint	recv_n_pool_free_frames;
 /** The maximum lsn we see for a page during the recovery process. If this
 is bigger than the lsn we are able to scan up to, that is an indication that
 the recovery failed and the database may be corrupt. */
-UNIV_INTERN ib_uint64_t	recv_max_page_lsn;
+UNIV_INTERN lsn_t	recv_max_page_lsn;
 
 #ifdef UNIV_PFS_THREAD
 UNIV_INTERN mysql_pfs_key_t	trx_rollback_clean_thread_key;
@@ -180,8 +182,7 @@ recv_sys_create(void)
 		return;
 	}
 
-	recv_sys = mem_alloc(sizeof(*recv_sys));
-	memset(recv_sys, 0x0, sizeof(*recv_sys));
+	recv_sys = static_cast<recv_sys_t*>(mem_zalloc(sizeof(*recv_sys)));
 
 	mutex_create(recv_sys_mutex_key, &recv_sys->mutex, SYNC_RECV);
 
@@ -315,7 +316,8 @@ recv_sys_init(
 
 	mutex_enter(&(recv_sys->mutex));
 
-	recv_sys->heap = mem_heap_create_in_buffer(256);
+	recv_sys->heap = mem_heap_create_typed(256,
+					MEM_HEAP_FOR_RECV_SYS);
 #else /* !UNIV_HOTBACKUP */
 	recv_sys->heap = mem_heap_create(256);
 	recv_is_from_backup = TRUE;
@@ -327,7 +329,7 @@ recv_sys_init(
 		recv_n_pool_free_frames = 512;
 	}
 
-	recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE);
+	recv_sys->buf = static_cast<byte*>(ut_malloc(RECV_PARSING_BUF_SIZE));
 	recv_sys->len = 0;
 	recv_sys->recovered_offset = 0;
 
@@ -337,10 +339,12 @@ recv_sys_init(
 	recv_sys->apply_log_recs = FALSE;
 	recv_sys->apply_batch_on = FALSE;
 
-	recv_sys->last_block_buf_start = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
+	recv_sys->last_block_buf_start = static_cast<byte*>(
+		mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE));
+
+	recv_sys->last_block = static_cast<byte*>(ut_align(
+		recv_sys->last_block_buf_start, OS_FILE_LOG_BLOCK_SIZE));
 
-	recv_sys->last_block = ut_align(recv_sys->last_block_buf_start,
-					OS_FILE_LOG_BLOCK_SIZE);
 	recv_sys->found_corrupt_log = FALSE;
 
 	recv_max_page_lsn = 0;
@@ -409,24 +413,23 @@ void
 recv_truncate_group(
 /*================*/
 	log_group_t*	group,		/*!< in: log group */
-	ib_uint64_t	recovered_lsn,	/*!< in: recovery succeeded up to this
+	lsn_t		recovered_lsn,	/*!< in: recovery succeeded up to this
 					lsn */
-	ib_uint64_t	limit_lsn,	/*!< in: this was the limit for
+	lsn_t		limit_lsn,	/*!< in: this was the limit for
 					recovery */
-	ib_uint64_t	checkpoint_lsn,	/*!< in: recovery was started from this
+	lsn_t		checkpoint_lsn,	/*!< in: recovery was started from this
 					checkpoint */
-	ib_uint64_t	archived_lsn)	/*!< in: the log has been archived up to
+	lsn_t		archived_lsn)	/*!< in: the log has been archived up to
 					this lsn */
 {
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
-	ib_uint64_t	finish_lsn1;
-	ib_uint64_t	finish_lsn2;
-	ib_uint64_t	finish_lsn;
-	ulint		len;
+	lsn_t		start_lsn;
+	lsn_t		end_lsn;
+	lsn_t		finish_lsn1;
+	lsn_t		finish_lsn2;
+	lsn_t		finish_lsn;
 	ulint		i;
 
-	if (archived_lsn == IB_ULONGLONG_MAX) {
+	if (archived_lsn == LSN_MAX) {
 		/* Checkpoint was taken in the NOARCHIVELOG mode */
 		archived_lsn = checkpoint_lsn;
 	}
@@ -439,7 +442,7 @@ recv_truncate_group(
 					 OS_FILE_LOG_BLOCK_SIZE)
 		+ recv_sys->last_log_buf_size;
 
-	if (limit_lsn != IB_ULONGLONG_MAX) {
+	if (limit_lsn != LSN_MAX) {
 		/* We do not know how far we should erase log records: erase
 		as much as possible */
 
@@ -464,11 +467,13 @@ recv_truncate_group(
 	if (start_lsn != recovered_lsn) {
 		/* Copy the last incomplete log block to the log buffer and
 		edit its data length: */
+		lsn_t	diff = recovered_lsn - start_lsn;
+
+		ut_a(diff <= 0xFFFFUL);
 
 		ut_memcpy(log_sys->buf, recv_sys->last_block,
 			  OS_FILE_LOG_BLOCK_SIZE);
-		log_block_set_data_len(log_sys->buf,
-				       (ulint) (recovered_lsn - start_lsn));
+		log_block_set_data_len(log_sys->buf, (ulint) diff);
 	}
 
 	if (start_lsn >= finish_lsn) {
@@ -477,6 +482,8 @@ recv_truncate_group(
 	}
 
 	for (;;) {
+		ulint	len;
+
 		end_lsn = start_lsn + RECV_SCAN_SIZE;
 
 		if (end_lsn > finish_lsn) {
@@ -513,12 +520,11 @@ recv_copy_group(
 						group */
 	log_group_t*	group,			/*!< in: copy to this log
 						group */
-	ib_uint64_t	recovered_lsn)		/*!< in: recovery succeeded up
+	lsn_t		recovered_lsn)		/*!< in: recovery succeeded up
 						to this lsn */
 {
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
-	ulint		len;
+	lsn_t		start_lsn;
+	lsn_t		end_lsn;
 
 	if (group->scanned_lsn >= recovered_lsn) {
 
@@ -530,6 +536,8 @@ recv_copy_group(
 	start_lsn = ut_uint64_align_down(group->scanned_lsn,
 					 OS_FILE_LOG_BLOCK_SIZE);
 	for (;;) {
+		ulint	len;
+
 		end_lsn = start_lsn + RECV_SCAN_SIZE;
 
 		if (end_lsn > recovered_lsn) {
@@ -566,9 +574,9 @@ recv_synchronize_groups(
 						log group */
 {
 	log_group_t*	group;
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
-	ib_uint64_t	recovered_lsn;
+	lsn_t		start_lsn;
+	lsn_t		end_lsn;
+	lsn_t		recovered_lsn;
 
 	recovered_lsn = recv_sys->recovered_lsn;
 
@@ -705,7 +713,9 @@ recv_find_max_checkpoint(
 			group->lsn = mach_read_from_8(
 				buf + LOG_CHECKPOINT_LSN);
 			group->lsn_offset = mach_read_from_4(
-				buf + LOG_CHECKPOINT_OFFSET);
+				buf + LOG_CHECKPOINT_OFFSET_LOW32);
+			group->lsn_offset |= ((lsn_t) mach_read_from_4(
+				buf + LOG_CHECKPOINT_OFFSET_HIGH32)) << 32;
 			checkpoint_no = mach_read_from_8(
 				buf + LOG_CHECKPOINT_NO);
 
@@ -755,17 +765,14 @@ Reads the checkpoint info needed in hot backup.
 @return	TRUE if success */
 UNIV_INTERN
 ibool
-recv_read_cp_info_for_backup(
-/*=========================*/
+recv_read_checkpoint_info_for_backup(
+/*=================================*/
 	const byte*	hdr,	/*!< in: buffer containing the log group
 				header */
-	ib_uint64_t*	lsn,	/*!< out: checkpoint lsn */
-	ulint*		offset,	/*!< out: checkpoint offset in the log group */
-	ulint*		fsp_limit,/*!< out: fsp limit of space 0,
-				1000000000 if the database is running
-				with < version 3.23.50 of InnoDB */
-	ib_uint64_t*	cp_no,	/*!< out: checkpoint number */
-	ib_uint64_t*	first_header_lsn)
+	lsn_t*		lsn,	/*!< out: checkpoint lsn */
+	lsn_t*		offset,	/*!< out: checkpoint offset in the log group */
+	lsn_t*		cp_no,	/*!< out: checkpoint number */
+	lsn_t*		first_header_lsn)
 				/*!< out: lsn of of the start of the
 				first log file */
 {
@@ -795,24 +802,10 @@ recv_read_cp_info_for_backup(
 	cp_buf = hdr + max_cp;
 
 	*lsn = mach_read_from_8(cp_buf + LOG_CHECKPOINT_LSN);
-	*offset = mach_read_from_4(cp_buf + LOG_CHECKPOINT_OFFSET);
-
-	/* If the user is running a pre-3.23.50 version of InnoDB, its
-	checkpoint data does not contain the fsp limit info */
-	if (mach_read_from_4(cp_buf + LOG_CHECKPOINT_FSP_MAGIC_N)
-	    == LOG_CHECKPOINT_FSP_MAGIC_N_VAL) {
-
-		*fsp_limit = mach_read_from_4(
-			cp_buf + LOG_CHECKPOINT_FSP_FREE_LIMIT);
-
-		if (*fsp_limit == 0) {
-			*fsp_limit = 1000000000;
-		}
-	} else {
-		*fsp_limit = 1000000000;
-	}
-
-	/*	fprintf(stderr, "fsp limit %lu MB\n", *fsp_limit); */
+	*offset = mach_read_from_4(
+		cp_buf + LOG_CHECKPOINT_OFFSET_LOW32);
+	*offset |= ((lsn_t) mach_read_from_4(
+			    cp_buf + LOG_CHECKPOINT_OFFSET_HIGH32)) << 32;
 
 	*cp_no = mach_read_from_8(cp_buf + LOG_CHECKPOINT_NO);
 
@@ -868,7 +861,7 @@ recv_scan_log_seg_for_backup(
 /*=========================*/
 	byte*		buf,		/*!< in: buffer containing log data */
 	ulint		buf_len,	/*!< in: data length in that buffer */
-	ib_uint64_t*	scanned_lsn,	/*!< in/out: lsn of buffer start,
+	lsn_t*		scanned_lsn,	/*!< in/out: lsn of buffer start,
 					we return scanned lsn */
 	ulint*		scanned_checkpoint_no,
 					/*!< in/out: 4 lowest bytes of the
@@ -1319,19 +1312,21 @@ recv_get_fil_addr_struct(
 {
 	recv_addr_t*	recv_addr;
 
-	recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
-				   recv_hash(space, page_no));
-	while (recv_addr) {
-		if ((recv_addr->space == space)
-		    && (recv_addr->page_no == page_no)) {
+	for (recv_addr = static_cast<recv_addr_t*>(
+			HASH_GET_FIRST(recv_sys->addr_hash,
+				       recv_hash(space, page_no)));
+	     recv_addr != 0;
+	     recv_addr = static_cast<recv_addr_t*>(
+		     HASH_GET_NEXT(addr_hash, recv_addr))) {
 
-			break;
-		}
+		if (recv_addr->space == space
+		    && recv_addr->page_no == page_no) {
 
-		recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+			return(recv_addr);
+		}
 	}
 
-	return(recv_addr);
+	return(NULL);
 }
 
 /*******************************************************************//**
@@ -1340,13 +1335,13 @@ static
 void
 recv_add_to_hash_table(
 /*===================*/
-	byte		type,		/*!< in: log record type */
-	ulint		space,		/*!< in: space id */
-	ulint		page_no,	/*!< in: page number */
-	byte*		body,		/*!< in: log record body */
-	byte*		rec_end,	/*!< in: log record end */
-	ib_uint64_t	start_lsn,	/*!< in: start lsn of the mtr */
-	ib_uint64_t	end_lsn)	/*!< in: end lsn of the mtr */
+	byte	type,		/*!< in: log record type */
+	ulint	space,		/*!< in: space id */
+	ulint	page_no,	/*!< in: page number */
+	byte*	body,		/*!< in: log record body */
+	byte*	rec_end,	/*!< in: log record end */
+	lsn_t	start_lsn,	/*!< in: start lsn of the mtr */
+	lsn_t	end_lsn)	/*!< in: end lsn of the mtr */
 {
 	recv_t*		recv;
 	ulint		len;
@@ -1363,7 +1358,9 @@ recv_add_to_hash_table(
 
 	len = rec_end - body;
 
-	recv = mem_heap_alloc(recv_sys->heap, sizeof(recv_t));
+	recv = static_cast<recv_t*>(
+		mem_heap_alloc(recv_sys->heap, sizeof(recv_t)));
+
 	recv->type = type;
 	recv->len = rec_end - body;
 	recv->start_lsn = start_lsn;
@@ -1372,8 +1369,9 @@ recv_add_to_hash_table(
 	recv_addr = recv_get_fil_addr_struct(space, page_no);
 
 	if (recv_addr == NULL) {
-		recv_addr = mem_heap_alloc(recv_sys->heap,
-					   sizeof(recv_addr_t));
+		recv_addr = static_cast<recv_addr_t*>(
+			mem_heap_alloc(recv_sys->heap, sizeof(recv_addr_t)));
+
 		recv_addr->space = space;
 		recv_addr->page_no = page_no;
 		recv_addr->state = RECV_NOT_PROCESSED;
@@ -1405,8 +1403,10 @@ recv_add_to_hash_table(
 			len = RECV_DATA_BLOCK_SIZE;
 		}
 
-		recv_data = mem_heap_alloc(recv_sys->heap,
-					   sizeof(recv_data_t) + len);
+		recv_data = static_cast<recv_data_t*>(
+			mem_heap_alloc(recv_sys->heap,
+				       sizeof(recv_data_t) + len));
+
 		*prev_field = recv_data;
 
 		memcpy(recv_data + 1, body, len);
@@ -1442,7 +1442,7 @@ recv_data_copy_to_buf(
 			part_len = len;
 		}
 
-		ut_memcpy(buf, ((byte*)recv_data) + sizeof(recv_data_t),
+		ut_memcpy(buf, ((byte*) recv_data) + sizeof(recv_data_t),
 			  part_len);
 		buf += part_len;
 		len -= part_len;
@@ -1471,10 +1471,10 @@ recv_recover_page_func(
 	recv_addr_t*	recv_addr;
 	recv_t*		recv;
 	byte*		buf;
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
-	ib_uint64_t	page_lsn;
-	ib_uint64_t	page_newest_lsn;
+	lsn_t		start_lsn;
+	lsn_t		end_lsn;
+	lsn_t		page_lsn;
+	lsn_t		page_newest_lsn;
 	ibool		modification_to_page;
 #ifndef UNIV_HOTBACKUP
 	ibool		success;
@@ -1568,7 +1568,7 @@ recv_recover_page_func(
 			/* We have to copy the record body to a separate
 			buffer */
 
-			buf = mem_alloc(recv->len);
+			buf = static_cast<byte*>(mem_alloc(recv->len));
 
 			recv_data_copy_to_buf(buf, recv);
 		} else {
@@ -1589,7 +1589,7 @@ recv_recover_page_func(
 
 		if (recv->start_lsn >= page_lsn) {
 
-			ib_uint64_t	end_lsn;
+			lsn_t	end_lsn;
 
 			if (!modification_to_page) {
 
@@ -1767,9 +1767,12 @@ loop:
 
 	for (i = 0; i < hash_get_n_cells(recv_sys->addr_hash); i++) {
 
-		recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, i);
+		for (recv_addr = static_cast<recv_addr_t*>(
+				HASH_GET_FIRST(recv_sys->addr_hash, i));
+		     recv_addr != 0;
+		     recv_addr = static_cast<recv_addr_t*>(
+				HASH_GET_NEXT(addr_hash, recv_addr))) {
 
-		while (recv_addr) {
 			ulint	space = recv_addr->space;
 			ulint	zip_size = fil_space_get_zip_size(space);
 			ulint	page_no = recv_addr->page_no;
@@ -1807,8 +1810,6 @@ loop:
 
 				mutex_enter(&(recv_sys->mutex));
 			}
-
-			recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
 		}
 
 		if (has_printed
@@ -1846,10 +1847,10 @@ loop:
 		mutex_exit(&(recv_sys->mutex));
 		mutex_exit(&(log_sys->mutex));
 
- 		n_pages = buf_flush_list(ULINT_MAX, IB_ULONGLONG_MAX);
-  		ut_a(n_pages != ULINT_UNDEFINED);
-  
- 		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+		n_pages = buf_flush_list(ULINT_MAX, LSN_MAX);
+		ut_a(n_pages != ULINT_UNDEFINED);
+
+		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
 
 		buf_pool_invalidate();
 
@@ -1944,14 +1945,14 @@ recv_apply_log_recs_for_backup(void)
 			if (!success) {
 				fprintf(stderr,
 					"InnoDB: Fatal error: cannot extend"
-					" tablespace %lu to hold %lu pages\n",
+					" tablespace %u to hold %u pages\n",
 					recv_addr->space, recv_addr->page_no);
 
 				exit(1);
 			}
 
 			/* Read the page from the tablespace file using the
-			fil0fil.c routines */
+			fil0fil.cc routines */
 
 			if (zip_size) {
 				error = fil_io(OS_FILE_READ, TRUE,
@@ -1985,7 +1986,7 @@ recv_apply_log_recs_for_backup(void)
 			recv_recover_page(FALSE, block);
 
 			/* Write the page back to the tablespace file using the
-			fil0fil.c routines */
+			fil0fil.cc routines */
 
 			buf_flush_init_for_writing(
 				block->frame, buf_block_get_page_zip(block),
@@ -2069,7 +2070,7 @@ recv_parse_log_rec(
 
 #ifdef UNIV_LOG_LSN_DEBUG
 	if (*type == MLOG_LSN) {
-		ib_uint64_t	lsn = (ib_uint64_t) *space << 32 | *page_no;
+		lsn_t	lsn = (lsn_t) *space << 32 | *page_no;
 # ifdef UNIV_LOG_DEBUG
 		ut_a(lsn == log_sys->old_lsn);
 # else /* UNIV_LOG_DEBUG */
@@ -2095,21 +2096,20 @@ recv_parse_log_rec(
 /*******************************************************//**
 Calculates the new value for lsn when more data is added to the log. */
 static
-ib_uint64_t
+lsn_t
 recv_calc_lsn_on_data_add(
 /*======================*/
-	ib_uint64_t	lsn,	/*!< in: old lsn */
+	lsn_t		lsn,	/*!< in: old lsn */
 	ib_uint64_t	len)	/*!< in: this many bytes of data is
 				added, log block headers not included */
 {
-	ulint	frag_len;
-	ulint	lsn_len;
+	ulint		frag_len;
+	ib_uint64_t	lsn_len;
 
-	frag_len = (((ulint) lsn) % OS_FILE_LOG_BLOCK_SIZE)
-		- LOG_BLOCK_HDR_SIZE;
+	frag_len = (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE;
 	ut_ad(frag_len < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE
 	      - LOG_BLOCK_TRL_SIZE);
-	lsn_len = (ulint) len;
+	lsn_len = len;
 	lsn_len += (lsn_len + frag_len)
 		/ (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE
 		   - LOG_BLOCK_TRL_SIZE)
@@ -2156,7 +2156,7 @@ recv_report_corrupt_log(
 	fprintf(stderr,
 		"InnoDB: ############### CORRUPT LOG RECORD FOUND\n"
 		"InnoDB: Log record type %lu, space id %lu, page number %lu\n"
-		"InnoDB: Log parsing proceeded successfully up to %llu\n"
+		"InnoDB: Log parsing proceeded successfully up to " LSN_PF "\n"
 		"InnoDB: Previous log record type %lu, is multi %lu\n"
 		"InnoDB: Recv offset %lu, prev %lu\n",
 		(ulong) type, (ulong) space, (ulong) page_no,
@@ -2217,18 +2217,18 @@ recv_parse_log_recs(
 				to the hash table; this is set to FALSE if just
 				debug checking is needed */
 {
-	byte*		ptr;
-	byte*		end_ptr;
-	ulint		single_rec;
-	ulint		len;
-	ulint		total_len;
-	ib_uint64_t	new_recovered_lsn;
-	ib_uint64_t	old_lsn;
-	byte		type;
-	ulint		space;
-	ulint		page_no;
-	byte*		body;
-	ulint		n_recs;
+	byte*	ptr;
+	byte*	end_ptr;
+	ulint	single_rec;
+	ulint	len;
+	ulint	total_len;
+	lsn_t	new_recovered_lsn;
+	lsn_t	old_lsn;
+	byte	type;
+	ulint	space;
+	ulint	page_no;
+	byte*	body;
+	ulint	n_recs;
 
 	ut_ad(mutex_own(&(log_sys->mutex)));
 	ut_ad(recv_sys->parse_start_lsn != 0);
@@ -2275,7 +2275,7 @@ loop:
 			return(FALSE);
 		}
 
-		recv_previous_parsed_rec_type = (ulint)type;
+		recv_previous_parsed_rec_type = (ulint) type;
 		recv_previous_parsed_rec_offset = recv_sys->recovered_offset;
 		recv_previous_parsed_rec_is_multi = 0;
 
@@ -2325,7 +2325,7 @@ loop:
 						" space %lu not complete in\n"
 						"InnoDB: the replay phase."
 						" Path %s\n",
-						(ulint)type, space,
+						(ulint) type, space,
 						(char*)(body + 2));
 
 					ut_error;
@@ -2366,7 +2366,7 @@ loop:
 				return(FALSE);
 			}
 
-			recv_previous_parsed_rec_type = (ulint)type;
+			recv_previous_parsed_rec_type = (ulint) type;
 			recv_previous_parsed_rec_offset
 				= recv_sys->recovered_offset + total_len;
 			recv_previous_parsed_rec_is_multi = 1;
@@ -2466,7 +2466,7 @@ ibool
 recv_sys_add_to_parsing_buf(
 /*========================*/
 	const byte*	log_block,	/*!< in: log block */
-	ib_uint64_t	scanned_lsn)	/*!< in: lsn of how far we were able
+	lsn_t		scanned_lsn)	/*!< in: lsn of how far we were able
 					to find data in this log block */
 {
 	ulint	more_len;
@@ -2567,16 +2567,16 @@ recv_scan_log_recs(
 	const byte*	buf,		/*!< in: buffer containing a log
 					segment or garbage */
 	ulint		len,		/*!< in: buffer length */
-	ib_uint64_t	start_lsn,	/*!< in: buffer start lsn */
-	ib_uint64_t*	contiguous_lsn,	/*!< in/out: it is known that all log
+	lsn_t		start_lsn,	/*!< in: buffer start lsn */
+	lsn_t*		contiguous_lsn,	/*!< in/out: it is known that all log
 					groups contain contiguous log data up
 					to this lsn */
-	ib_uint64_t*	group_scanned_lsn)/*!< out: scanning succeeded up to
+	lsn_t*		group_scanned_lsn)/*!< out: scanning succeeded up to
 					this lsn */
 {
 	const byte*	log_block;
 	ulint		no;
-	ib_uint64_t	scanned_lsn;
+	lsn_t		scanned_lsn;
 	ibool		finished;
 	ulint		data_len;
 	ibool		more_data;
@@ -2608,7 +2608,7 @@ recv_scan_log_recs(
 				    log_block)) {
 				fprintf(stderr,
 					"InnoDB: Log block no %lu at"
-					" lsn %llu has\n"
+					" lsn " LSN_PF " has\n"
 					"InnoDB: ok header, but checksum field"
 					" contains %lu, should be %lu\n",
 					(ulong) no,
@@ -2689,7 +2689,7 @@ recv_scan_log_recs(
 
 				fprintf(stderr,
 					"InnoDB: Log scan progressed"
-					" past the checkpoint lsn %llu\n",
+					" past the checkpoint lsn " LSN_PF "\n",
 					recv_sys->scanned_lsn);
 				recv_init_crash_recovery();
 			}
@@ -2748,7 +2748,7 @@ recv_scan_log_recs(
 
 			fprintf(stderr,
 				"InnoDB: Doing recovery: scanned up to"
-				" log sequence number %llu\n",
+				" log sequence number " LSN_PF "\n",
 				*group_scanned_lsn);
 		}
 	}
@@ -2791,15 +2791,15 @@ void
 recv_group_scan_log_recs(
 /*=====================*/
 	log_group_t*	group,		/*!< in: log group */
-	ib_uint64_t*	contiguous_lsn,	/*!< in/out: it is known that all log
+	lsn_t*		contiguous_lsn,	/*!< in/out: it is known that all log
 					groups contain contiguous log data up
 					to this lsn */
-	ib_uint64_t*	group_scanned_lsn)/*!< out: scanning succeeded up to
+	lsn_t*		group_scanned_lsn)/*!< out: scanning succeeded up to
 					this lsn */
 {
-	ibool		finished;
-	ib_uint64_t	start_lsn;
-	ib_uint64_t	end_lsn;
+	ibool	finished;
+	lsn_t	start_lsn;
+	lsn_t	end_lsn;
 
 	finished = FALSE;
 
@@ -2824,7 +2824,7 @@ recv_group_scan_log_recs(
 	if (log_debug_writes) {
 		fprintf(stderr,
 			"InnoDB: Scanned group %lu up to"
-			" log sequence number %llu\n",
+			" log sequence number " LSN_PF "\n",
 			(ulong) group->id,
 			*group_scanned_lsn);
 	}
@@ -2868,7 +2868,7 @@ recv_init_crash_recovery(void)
 			" half-written data pages from"
 			" the doublewrite\n"
 			"InnoDB: buffer...\n");
-		trx_sys_doublewrite_init_or_restore_pages(TRUE);
+		buf_dblwr_init_or_restore_pages(TRUE);
 	}
 }
 
@@ -2883,34 +2883,30 @@ ulint
 recv_recovery_from_checkpoint_start_func(
 /*=====================================*/
 #ifdef UNIV_LOG_ARCHIVE
-	ulint		type,		/*!< in: LOG_CHECKPOINT or
-					LOG_ARCHIVE */
-	ib_uint64_t	limit_lsn,	/*!< in: recover up to this lsn
-					if possible */
+	ulint	type,		/*!< in: LOG_CHECKPOINT or LOG_ARCHIVE */
+	lsn_t	limit_lsn,	/*!< in: recover up to this lsn if possible */
 #endif /* UNIV_LOG_ARCHIVE */
-	ib_uint64_t	min_flushed_lsn,/*!< in: min flushed lsn from
-					data files */
-	ib_uint64_t	max_flushed_lsn)/*!< in: max flushed lsn from
-					data files */
+	lsn_t	min_flushed_lsn,/*!< in: min flushed lsn from data files */
+	lsn_t	max_flushed_lsn)/*!< in: max flushed lsn from data files */
 {
 	log_group_t*	group;
 	log_group_t*	max_cp_group;
 	log_group_t*	up_to_date_group;
 	ulint		max_cp_field;
-	ib_uint64_t	checkpoint_lsn;
+	lsn_t		checkpoint_lsn;
 	ib_uint64_t	checkpoint_no;
-	ib_uint64_t	old_scanned_lsn;
-	ib_uint64_t	group_scanned_lsn;
-	ib_uint64_t	contiguous_lsn;
+	lsn_t		old_scanned_lsn;
+	lsn_t		group_scanned_lsn;
+	lsn_t		contiguous_lsn;
 #ifdef UNIV_LOG_ARCHIVE
-	ib_uint64_t	archived_lsn;
+	lsn_t		archived_lsn;
 #endif /* UNIV_LOG_ARCHIVE */
 	byte*		buf;
 	byte		log_hdr_buf[LOG_FILE_HDR_SIZE];
 	ulint		err;
 
 #ifdef UNIV_LOG_ARCHIVE
-	ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX);
+	ut_ad(type != LOG_CHECKPOINT || limit_lsn == LSN_MAX);
 /** TRUE when recovering from a checkpoint */
 # define TYPE_CHECKPOINT	(type == LOG_CHECKPOINT)
 /** Recover up to this log sequence number */
@@ -2919,7 +2915,7 @@ recv_recovery_from_checkpoint_start_func(
 /** TRUE when recovering from a checkpoint */
 # define TYPE_CHECKPOINT	1
 /** Recover up to this log sequence number */
-# define LIMIT_LSN		IB_ULONGLONG_MAX
+# define LIMIT_LSN		LSN_MAX
 #endif /* UNIV_LOG_ARCHIVE */
 
 	if (TYPE_CHECKPOINT) {
@@ -3122,10 +3118,10 @@ recv_recovery_from_checkpoint_start_func(
 					" ib_logfiles to start up"
 					" the database?\n"
 					"InnoDB: Log sequence number in"
-					" ib_logfiles is %llu, log\n"
+					" ib_logfiles is " LSN_PF ", log\n"
 					"InnoDB: sequence numbers stamped"
 					" to ibdata file headers are between\n"
-					"InnoDB: %llu and %llu.\n"
+					"InnoDB: " LSN_PF " and " LSN_PF ".\n"
 					"InnoDB: #########################"
 					"#################################\n",
 					checkpoint_lsn,
@@ -3145,7 +3141,7 @@ recv_recovery_from_checkpoint_start_func(
 
 		if (!recv_needed_recovery) {
 			/* Init the doublewrite buffer memory structure */
-			trx_sys_doublewrite_init_or_restore_pages(FALSE);
+			buf_dblwr_init_or_restore_pages(FALSE);
 		}
 	}
 
@@ -3155,7 +3151,8 @@ recv_recovery_from_checkpoint_start_func(
 		fprintf(stderr,
 			"  InnoDB: ERROR: We were only able to scan the log"
 			" up to\n"
-			"InnoDB: %llu, but a checkpoint was at %llu.\n"
+			"InnoDB: " LSN_PF ", but a checkpoint was at "
+			LSN_PF ".\n"
 			"InnoDB: It is possible that"
 			" the database is now corrupt!\n",
 			group_scanned_lsn,
@@ -3166,8 +3163,8 @@ recv_recovery_from_checkpoint_start_func(
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
 			"  InnoDB: ERROR: We were only able to scan the log"
-			" up to %llu\n"
-			"InnoDB: but a database page a had an lsn %llu."
+			" up to " LSN_PF "\n"
+			"InnoDB: but a database page a had an lsn " LSN_PF "."
 			" It is possible that the\n"
 			"InnoDB: database is now corrupt!\n",
 			group_scanned_lsn,
@@ -3217,6 +3214,9 @@ recv_recovery_from_checkpoint_start_func(
 
 	log_sys->last_checkpoint_lsn = checkpoint_lsn;
 
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    log_sys->lsn - log_sys->last_checkpoint_lsn);
+
 	log_sys->next_checkpoint_no = checkpoint_no + 1;
 
 #ifdef UNIV_LOG_ARCHIVE
@@ -3299,7 +3299,9 @@ recv_recovery_from_checkpoint_finish(void)
 	that the data dictionary tables will be free of any locks.
 	The data dictionary latch should guarantee that there is at
 	most one data dictionary transaction active at a time. */
-	trx_rollback_or_clean_recovered(FALSE);
+	if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
+		trx_rollback_or_clean_recovered(FALSE);
+	}
 }
 
 /********************************************************//**
@@ -3316,20 +3318,28 @@ recv_recovery_rollback_active(void)
 	themselves before we switch the latching order checks on */
 	os_thread_sleep(1000000);
 
-	/* Switch latching order checks on in sync0sync.c */
+	/* Switch latching order checks on in sync0sync.cc */
 	sync_order_checks_on = TRUE;
 #endif
-	/* Drop partially created indexes. */
-	row_merge_drop_temp_indexes();
-	/* Drop temporary tables. */
-	row_mysql_drop_temp_tables();
-
+	/* We can't start any (DDL) transactions if UNDO logging
+	has been disabled, additionally disable ROLLBACK of recovered
+	user transactions. */
 	if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
+		/* Drop partially created indexes. */
+		row_merge_drop_temp_indexes();
+		/* Drop temporary tables. */
+		row_mysql_drop_temp_tables();
+
+		/* Drop any auxiliary tables that were not dropped when the
+		parent table was dropped. This can happen if the parent table
+		was dropped but the server crashed before the auxiliary tables
+		were dropped. */
+		fts_drop_orphaned_tables();
+
 		/* Rollback the uncommitted transactions which have no user
 		session */
 
-		os_thread_create(trx_rollback_or_clean_all_recovered,
-				 (void *)&i, NULL);
+		os_thread_create(trx_rollback_or_clean_all_recovered, &i, NULL);
 	}
 }
 
@@ -3339,7 +3349,7 @@ UNIV_INTERN
 void
 recv_reset_logs(
 /*============*/
-	ib_uint64_t	lsn,		/*!< in: reset to this lsn
+	lsn_t		lsn,		/*!< in: reset to this lsn
 					rounded up to be divisible by
 					OS_FILE_LOG_BLOCK_SIZE, after
 					which we add
@@ -3393,12 +3403,14 @@ recv_reset_logs(
 	log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
 	log_sys->lsn += LOG_BLOCK_HDR_SIZE;
 
+	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+		    (log_sys->lsn - log_sys->last_checkpoint_lsn));
+
 	mutex_exit(&(log_sys->mutex));
 
 	/* Reset the checkpoint fields in logs */
 
-	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
-	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+	log_make_checkpoint_at(LSN_MAX, TRUE);
 
 	mutex_enter(&(log_sys->mutex));
 }
@@ -3413,8 +3425,8 @@ recv_reset_log_files_for_backup(
 /*============================*/
 	const char*	log_dir,	/*!< in: log file directory path */
 	ulint		n_log_files,	/*!< in: number of log files */
-	ulint		log_file_size,	/*!< in: log file size */
-	ib_uint64_t	lsn)		/*!< in: new start lsn, must be
+	lsn_t		log_file_size,	/*!< in: log file size */
+	lsn_t		lsn)		/*!< in: new start lsn, must be
 					divisible by OS_FILE_LOG_BLOCK_SIZE */
 {
 	os_file_t	log_file;
@@ -3437,7 +3449,7 @@ recv_reset_log_files_for_backup(
 	for (i = 0; i < n_log_files; i++) {
 
 		sprintf(name, "%s%s%lu", log_dir,
-			ib_logfile_basename, (ulong)i);
+			ib_logfile_basename, (ulong) i);
 
 		log_file = os_file_create_simple(innodb_file_log_key,
 						 name, OS_FILE_CREATE,
@@ -3452,19 +3464,15 @@ recv_reset_log_files_for_backup(
 		}
 
 		fprintf(stderr,
-			"Setting log file size to %lu %lu\n",
-			(ulong) ut_get_high32(log_file_size),
-			(ulong) log_file_size & 0xFFFFFFFFUL);
+			"Setting log file size to %llu\n",
+			log_file_size);
 
-		success = os_file_set_size(name, log_file,
-					   log_file_size & 0xFFFFFFFFUL,
-					   ut_get_high32(log_file_size));
+		success = os_file_set_size(name, log_file, log_file_size);
 
 		if (!success) {
 			fprintf(stderr,
-				"InnoDB: Cannot set %s size to %lu %lu\n",
-				name, (ulong) ut_get_high32(log_file_size),
-				(ulong) (log_file_size & 0xFFFFFFFFUL));
+				"InnoDB: Cannot set %s size to %llu\n",
+				name, log_file_size);
 			exit(1);
 		}
 
@@ -3490,7 +3498,7 @@ recv_reset_log_files_for_backup(
 		exit(1);
 	}
 
-	os_file_write(name, log_file, buf, 0, 0,
+	os_file_write(name, log_file, buf, 0,
 		      LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
 	os_file_flush(log_file);
 	os_file_close(log_file);
@@ -3518,9 +3526,8 @@ log_group_recover_from_archive_file(
 	ulint		len;
 	ibool		ret;
 	byte*		buf;
-	ulint		read_offset;
-	ulint		file_size;
-	ulint		file_size_high;
+	os_offset_t	read_offset;
+	os_offset_t	file_size;
 	int		input_char;
 	char		name[10000];
 
@@ -3562,10 +3569,8 @@ ask_again:
 		}
 	}
 
-	ret = os_file_get_size(file_handle, &file_size, &file_size_high);
-	ut_a(ret);
-
-	ut_a(file_size_high == 0);
+	file_size = os_file_get_size(file_handle);
+	ut_a(file_size != (os_offset_t) -1);
 
 	fprintf(stderr, "InnoDB: Opened archived log file %s\n", name);
 
diff --git a/storage/innobase/mach/mach0data.c b/storage/innobase/mach/mach0data.cc
index 647d9e57384..df68aab8a18 100644
--- a/storage/innobase/mach/mach0data.c
+++ b/storage/innobase/mach/mach0data.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /******************************************************************//**
-@file mach/mach0data.c
+@file mach/mach0data.cc
 Utilities for converting data from the database file
 to the machine format.
 
diff --git a/storage/innobase/mem/mem0dbg.c b/storage/innobase/mem/mem0dbg.cc
index ae43d6097a6..83e14ad6071 100644
--- a/storage/innobase/mem/mem0dbg.c
+++ b/storage/innobase/mem/mem0dbg.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,23 +11,22 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file mem/mem0dbg.c
+@file mem/mem0dbg.cc
 The memory management: the debug code. This is not a compilation module,
 but is included in mem0mem.* !
 
 Created 6/9/1994 Heikki Tuuri
 *************************************************************************/
 
-#include "ha_prototypes.h"
-
 #ifdef UNIV_MEM_DEBUG
 # ifndef UNIV_HOTBACKUP
+#  include "ha_prototypes.h"
 /* The mutex which protects in the debug version the hash table
 containing the list of live memory heaps, and also the global
 variables below. */
@@ -262,7 +261,7 @@ mem_field_erase(
 	mutex_exit(&mem_hash_mutex);
 
 	/* Check that the field lengths agree */
-	ut_ad(n == (ulint)mem_field_header_get_len(usr_buf));
+	ut_ad(n == (ulint) mem_field_header_get_len(usr_buf));
 
 	/* In the debug version, set the freed space to a random
 	combination of 0xDE and 0xAD */
@@ -339,10 +338,10 @@ mem_hash_insert(
 
 	mutex_enter(&mem_hash_mutex);
 
-	cell_no = ut_hash_ulint((ulint)heap, MEM_HASH_SIZE);
+	cell_no = ut_hash_ulint((ulint) heap, MEM_HASH_SIZE);
 
 	/* Allocate a new node to the list */
-	new_node = ut_malloc(sizeof(mem_hash_node_t));
+	new_node = static_cast<mem_hash_node_t*>(ut_malloc(sizeof(*new_node)));
 
 	new_node->heap = heap;
 	new_node->file_name = file_name;
@@ -384,7 +383,7 @@ mem_hash_remove(
 
 	mutex_enter(&mem_hash_mutex);
 
-	cell_no = ut_hash_ulint((ulint)heap, MEM_HASH_SIZE);
+	cell_no = ut_hash_ulint((ulint) heap, MEM_HASH_SIZE);
 
 	/* Look for the heap in the hash table list */
 	node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(cell_no));
@@ -424,7 +423,7 @@ mem_hash_remove(
 			node->nth_heap,
 			innobase_basename(node->file_name), (ulong) node->line,
 			innobase_basename(file_name), (ulong) line);
-		ut_print_buf(stderr, (byte*)node->heap - 200, 400);
+		ut_print_buf(stderr, (byte*) node->heap - 200, 400);
 		fputs("\nDump of the mem heap:\n", stderr);
 		mem_heap_validate_or_print(node->heap, NULL, TRUE, &error,
 					   &size, NULL, NULL);
@@ -528,14 +527,14 @@ mem_heap_validate_or_print(
 			fprintf(stderr, " Block %ld:", block_count);
 		}
 
-		field = (byte*)block + mem_block_get_start(block);
+		field = (byte*) block + mem_block_get_start(block);
 
 		if (top && (field == top)) {
 
 			goto completed;
 		}
 
-		while (field < (byte*)block + mem_block_get_free(block)) {
+		while (field < (byte*) block + mem_block_get_free(block)) {
 
 			/* Calculate the pointer to the storage
 			which was given to the user */
@@ -561,8 +560,8 @@ mem_heap_validate_or_print(
 					" field %lx len %lu\n"
 					"InnoDB: header check field is"
 					" %lx but trailer %lx\n",
-					(ulint)block,
-					(ulint)field, len, check_field,
+					(ulint) block,
+					(ulint) field, len, check_field,
 					mem_field_trailer_get_check(
 						user_field));
 
@@ -582,15 +581,15 @@ mem_heap_validate_or_print(
 		/* At the end check that we have arrived to the first free
 		position */
 
-		if (field != (byte*)block + mem_block_get_free(block)) {
+		if (field != (byte*) block + mem_block_get_free(block)) {
 			/* error */
 
 			fprintf(stderr,
 				"InnoDB: Error: block %lx end of"
 				" mem fields %lx\n"
 				"InnoDB: but block free at %lx\n",
-				(ulint)block, (ulint)field,
-				(ulint)((byte*)block
+				(ulint) block, (ulint) field,
+				(ulint)((byte*) block
 					+ mem_block_get_free(block)));
 
 			return;
@@ -828,19 +827,19 @@ mem_analyze_corruption(
 	ulint	dist;
 
 	fputs("InnoDB: Apparent memory corruption: mem dump ", stderr);
-	ut_print_buf(stderr, (byte*)ptr - 250, 500);
+	ut_print_buf(stderr, (byte*) ptr - 250, 500);
 
 	fputs("\nInnoDB: Scanning backward trying to find"
 	      " previous allocated mem blocks\n", stderr);
 
-	p = (byte*)ptr;
+	p = (byte*) ptr;
 	dist = 0;
 
 	for (i = 0; i < 10; i++) {
 		for (;;) {
-			if (((ulint)p) % 4 == 0) {
+			if (((ulint) p) % 4 == 0) {
 
-				if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) {
+				if (*((ulint*) p) == MEM_BLOCK_MAGIC_N) {
 					fprintf(stderr,
 						"Mem block at - %lu,"
 						" file %s, line %lu\n",
@@ -853,7 +852,7 @@ mem_analyze_corruption(
 					break;
 				}
 
-				if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) {
+				if (*((ulint*) p) == MEM_FREED_BLOCK_MAGIC_N) {
 					fprintf(stderr,
 						"Freed mem block at - %lu,"
 						" file %s, line %lu\n",
@@ -879,14 +878,14 @@ mem_analyze_corruption(
 		"InnoDB: Scanning forward trying to find next"
 		" allocated mem blocks\n");
 
-	p = (byte*)ptr;
+	p = (byte*) ptr;
 	dist = 0;
 
 	for (i = 0; i < 10; i++) {
 		for (;;) {
-			if (((ulint)p) % 4 == 0) {
+			if (((ulint) p) % 4 == 0) {
 
-				if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) {
+				if (*((ulint*) p) == MEM_BLOCK_MAGIC_N) {
 					fprintf(stderr,
 						"Mem block at + %lu, file %s,"
 						" line %lu\n",
@@ -899,7 +898,7 @@ mem_analyze_corruption(
 					break;
 				}
 
-				if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) {
+				if (*((ulint*) p) == MEM_FREED_BLOCK_MAGIC_N) {
 					fprintf(stderr,
 						"Freed mem block at + %lu,"
 						" file %s, line %lu\n",
diff --git a/storage/innobase/mem/mem0mem.c b/storage/innobase/mem/mem0mem.cc
index 7727760f1cd..33060f22c6a 100644
--- a/storage/innobase/mem/mem0mem.c
+++ b/storage/innobase/mem/mem0mem.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file mem/mem0mem.c
+@file mem/mem0mem.cc
 The memory management
 
 Created 6/9/1994 Heikki Tuuri
@@ -30,7 +30,7 @@ Created 6/9/1994 Heikki Tuuri
 
 #include "buf0buf.h"
 #include "srv0srv.h"
-#include "mem0dbg.c"
+#include "mem0dbg.cc"
 #include <stdarg.h>
 
 /*
@@ -108,7 +108,7 @@ mem_heap_strdup(
 	mem_heap_t*	heap,	/*!< in: memory heap where string is allocated */
 	const char*	str)	/*!< in: string to be copied */
 {
-	return(mem_heap_dup(heap, str, strlen(str) + 1));
+	return(static_cast<char*>(mem_heap_dup(heap, str, strlen(str) + 1)));
 }
 
 /**********************************************************************//**
@@ -140,7 +140,7 @@ mem_heap_strcat(
 	ulint	s1_len = strlen(s1);
 	ulint	s2_len = strlen(s2);
 
-	s = mem_heap_alloc(heap, s1_len + s2_len + 1);
+	s = static_cast<char*>(mem_heap_alloc(heap, s1_len + s2_len + 1));
 
 	memcpy(s, s1, s1_len);
 	memcpy(s + s1_len, s2, s2_len);
@@ -261,7 +261,7 @@ mem_heap_printf_low(
 }
 
 /****************************************************************//**
-A simple (s)printf replacement that dynamically allocates the space for the
+A simple sprintf replacement that dynamically allocates the space for the
 formatted string from the given heap. This supports a very limited set of
 the printf syntax: types 's' and 'u' and length modifier 'l' (which is
 required for the 'u' type).
@@ -285,7 +285,7 @@ mem_heap_printf(
 	va_end(ap);
 
 	/* Now create it for real. */
-	str = mem_heap_alloc(heap, len);
+	str = static_cast<char*>(mem_heap_alloc(heap, len));
 	va_start(ap, format);
 	mem_heap_printf_low(str, format, ap);
 	va_end(ap);
@@ -330,7 +330,8 @@ mem_heap_create_block(
 
 		ut_ad(type == MEM_HEAP_DYNAMIC || n <= MEM_MAX_ALLOC_IN_BUF);
 
-		block = mem_area_alloc(&len, mem_comm_pool);
+		block = static_cast<mem_block_t*>(
+			mem_area_alloc(&len, mem_comm_pool));
 	} else {
 		len = UNIV_PAGE_SIZE;
 
@@ -339,7 +340,7 @@ mem_heap_create_block(
 			buffer pool, but must get the free block from
 			the heap header free block field */
 
-			buf_block = heap->free_block;
+			buf_block = static_cast<buf_block_t*>(heap->free_block);
 			heap->free_block = NULL;
 
 			if (UNIV_UNLIKELY(!buf_block)) {
@@ -469,7 +470,9 @@ mem_heap_block_free(
 	ulint		type;
 	ulint		len;
 #ifndef UNIV_HOTBACKUP
-	buf_block_t*	buf_block	= block->buf_block;
+	buf_block_t*	buf_block;
+
+	buf_block = static_cast<buf_block_t*>(block->buf_block);
 #endif /* !UNIV_HOTBACKUP */
 
 	if (block->magic_n != MEM_BLOCK_MAGIC_N) {
@@ -499,7 +502,7 @@ mem_heap_block_free(
 		/* In the debug version we set the memory to a random
 		combination of hex 0xDE and 0xAD. */
 
-		mem_erase_buf((byte*)block, len);
+		mem_erase_buf((byte*) block, len);
 #else /* UNIV_MEM_DEBUG */
 		UNIV_MEM_ASSERT_AND_FREE(block, len);
 #endif /* UNIV_MEM_DEBUG */
@@ -519,7 +522,7 @@ mem_heap_block_free(
 	/* In the debug version we set the memory to a random
 	combination of hex 0xDE and 0xAD. */
 
-	mem_erase_buf((byte*)block, len);
+	mem_erase_buf((byte*) block, len);
 #else /* UNIV_MEM_DEBUG */
 	UNIV_MEM_ASSERT_AND_FREE(block, len);
 #endif /* UNIV_MEM_DEBUG */
@@ -538,7 +541,7 @@ mem_heap_free_block_free(
 {
 	if (UNIV_LIKELY_NULL(heap->free_block)) {
 
-		buf_block_free(heap->free_block);
+		buf_block_free(static_cast<buf_block_t*>(heap->free_block));
 
 		heap->free_block = NULL;
 	}
diff --git a/storage/innobase/mem/mem0pool.c b/storage/innobase/mem/mem0pool.cc
index 50dbe526d64..2135926a26f 100644
--- a/storage/innobase/mem/mem0pool.c
+++ b/storage/innobase/mem/mem0pool.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file mem/mem0pool.c
+@file mem/mem0pool.cc
 The lowest-level memory management
 
 Created 5/12/1997 Heikki Tuuri
@@ -226,9 +226,9 @@ mem_pool_create(
 	ulint		i;
 	ulint		used;
 
-	pool = ut_malloc(sizeof(mem_pool_t));
+	pool = static_cast<mem_pool_t*>(ut_malloc(sizeof(mem_pool_t)));
 
-	pool->buf = ut_malloc_low(size, TRUE);
+	pool->buf = static_cast<byte*>(ut_malloc_low(size, TRUE));
 	pool->size = size;
 
 	mutex_create(mem_pool_mutex_key, &pool->mutex, SYNC_MEM_POOL);
@@ -340,7 +340,7 @@ mem_pool_fill_free_list(
 
 	UT_LIST_REMOVE(free_list, pool->free_list[i + 1], area);
 
-	area2 = (mem_area_t*)(((byte*)area) + ut_2_exp(i));
+	area2 = (mem_area_t*)(((byte*) area) + ut_2_exp(i));
 	UNIV_MEM_ALLOC(area2, MEM_AREA_EXTRA_SIZE);
 
 	mem_area_set_size(area2, ut_2_exp(i));
@@ -454,9 +454,9 @@ mem_area_alloc(
 	ut_ad(mem_pool_validate(pool));
 
 	*psize = ut_2_exp(n) - MEM_AREA_EXTRA_SIZE;
-	UNIV_MEM_ALLOC(MEM_AREA_EXTRA_SIZE + (byte*)area, *psize);
+	UNIV_MEM_ALLOC(MEM_AREA_EXTRA_SIZE + (byte*) area, *psize);
 
-	return((void*)(MEM_AREA_EXTRA_SIZE + ((byte*)area)));
+	return((void*)(MEM_AREA_EXTRA_SIZE + ((byte*) area)));
 }
 
 /********************************************************************//**
@@ -474,13 +474,13 @@ mem_area_get_buddy(
 
 	ut_ad(size != 0);
 
-	if (((((byte*)area) - pool->buf) % (2 * size)) == 0) {
+	if (((((byte*) area) - pool->buf) % (2 * size)) == 0) {
 
 		/* The buddy is in a higher address */
 
-		buddy = (mem_area_t*)(((byte*)area) + size);
+		buddy = (mem_area_t*)(((byte*) area) + size);
 
-		if ((((byte*)buddy) - pool->buf) + size > pool->size) {
+		if ((((byte*) buddy) - pool->buf) + size > pool->size) {
 
 			/* The buddy is not wholly contained in the pool:
 			there is no buddy */
@@ -493,7 +493,7 @@ mem_area_get_buddy(
 		the upper branch in this if-clause: the remainder would be
 		0 */
 
-		buddy = (mem_area_t*)(((byte*)area) - size);
+		buddy = (mem_area_t*)(((byte*) area) - size);
 	}
 
 	return(buddy);
@@ -524,13 +524,13 @@ mem_area_free(
 	/* It may be that the area was really allocated from the OS with
 	regular malloc: check if ptr points within our memory pool */
 
-	if ((byte*)ptr < pool->buf || (byte*)ptr >= pool->buf + pool->size) {
+	if ((byte*) ptr < pool->buf || (byte*) ptr >= pool->buf + pool->size) {
 		ut_free(ptr);
 
 		return;
 	}
 
-	area = (mem_area_t*) (((byte*)ptr) - MEM_AREA_EXTRA_SIZE);
+	area = (mem_area_t*) (((byte*) ptr) - MEM_AREA_EXTRA_SIZE);
 
 	if (mem_area_get_free(area)) {
 		fprintf(stderr,
@@ -556,12 +556,12 @@ mem_area_free(
 	}
 
 #ifdef UNIV_LIGHT_MEM_DEBUG
-	if (((byte*)area) + size < pool->buf + pool->size) {
+	if (((byte*) area) + size < pool->buf + pool->size) {
 
 		ulint	next_size;
 
 		next_size = mem_area_get_size(
-			(mem_area_t*)(((byte*)area) + size));
+			(mem_area_t*)(((byte*) area) + size));
 		if (UNIV_UNLIKELY(!next_size || !ut_is_2pow(next_size))) {
 			fprintf(stderr,
 				"InnoDB: Error: Memory area size %lu,"
@@ -589,8 +589,8 @@ mem_area_free(
 
 		/* The buddy is in a free list */
 
-		if ((byte*)buddy < (byte*)area) {
-			new_ptr = ((byte*)buddy) + MEM_AREA_EXTRA_SIZE;
+		if ((byte*) buddy < (byte*) area) {
+			new_ptr = ((byte*) buddy) + MEM_AREA_EXTRA_SIZE;
 
 			mem_area_set_size(buddy, 2 * size);
 			mem_area_set_free(buddy, FALSE);
@@ -648,12 +648,12 @@ mem_pool_validate(
 
 	for (i = 0; i < 64; i++) {
 
-		UT_LIST_VALIDATE(free_list, mem_area_t, pool->free_list[i],
-				 (void) 0);
+		UT_LIST_CHECK(free_list, mem_area_t, pool->free_list[i]);
 
-		area = UT_LIST_GET_FIRST(pool->free_list[i]);
+		for (area = UT_LIST_GET_FIRST(pool->free_list[i]);
+		     area != 0;
+		     area = UT_LIST_GET_NEXT(free_list, area)) {
 
-		while (area != NULL) {
 			ut_a(mem_area_get_free(area));
 			ut_a(mem_area_get_size(area) == ut_2_exp(i));
 
@@ -662,8 +662,6 @@ mem_pool_validate(
 			ut_a(!buddy || !mem_area_get_free(buddy)
 			     || (ut_2_exp(i) != mem_area_get_size(buddy)));
 
-			area = UT_LIST_GET_NEXT(free_list, area);
-
 			free += ut_2_exp(i);
 		}
 	}
diff --git a/storage/innobase/mtr/mtr0log.c b/storage/innobase/mtr/mtr0log.cc
index 864970cef40..d549de8802e 100644
--- a/storage/innobase/mtr/mtr0log.c
+++ b/storage/innobase/mtr/mtr0log.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file mtr/mtr0log.c
+@file mtr/mtr0log.cc
 Mini-transaction log routines
 
 Created 12/7/1995 Heikki Tuuri
@@ -175,7 +175,7 @@ mlog_parse_nbytes(
 		}
 
 		if (page) {
-			if (UNIV_LIKELY_NULL(page_zip)) {
+			if (page_zip) {
 				mach_write_to_8
 					(((page_zip_des_t*) page_zip)->data
 					 + offset, dval);
@@ -199,7 +199,7 @@ mlog_parse_nbytes(
 			goto corrupt;
 		}
 		if (page) {
-			if (UNIV_LIKELY_NULL(page_zip)) {
+			if (page_zip) {
 				mach_write_to_1
 					(((page_zip_des_t*) page_zip)->data
 					 + offset, val);
@@ -212,7 +212,7 @@ mlog_parse_nbytes(
 			goto corrupt;
 		}
 		if (page) {
-			if (UNIV_LIKELY_NULL(page_zip)) {
+			if (page_zip) {
 				mach_write_to_2
 					(((page_zip_des_t*) page_zip)->data
 					 + offset, val);
@@ -222,7 +222,7 @@ mlog_parse_nbytes(
 		break;
 	case MLOG_4BYTES:
 		if (page) {
-			if (UNIV_LIKELY_NULL(page_zip)) {
+			if (page_zip) {
 				mach_write_to_4
 					(((page_zip_des_t*) page_zip)->data
 					 + offset, val);
@@ -420,7 +420,7 @@ mlog_parse_string(
 	}
 
 	if (page) {
-		if (UNIV_LIKELY_NULL(page_zip)) {
+		if (page_zip) {
 			memcpy(((page_zip_des_t*) page_zip)->data
 				+ offset, ptr, len);
 		}
@@ -538,7 +538,7 @@ mlog_parse_index(
 /*=============*/
 	byte*		ptr,	/*!< in: buffer */
 	const byte*	end_ptr,/*!< in: buffer end */
-	ibool		comp,	/*!< in: TRUE=compact record format */
+	ibool		comp,	/*!< in: TRUE=compact row format */
 	dict_index_t**	index)	/*!< out, own: dummy index */
 {
 	ulint		i, n, n_uniq;
@@ -563,7 +563,7 @@ mlog_parse_index(
 		n = n_uniq = 1;
 	}
 	table = dict_mem_table_create("LOG_DUMMY", DICT_HDR_SPACE, n,
-				      comp ? DICT_TF_COMPACT : 0);
+				      comp ? DICT_TF_COMPACT : 0, 0);
 	ind = dict_mem_index_create("LOG_DUMMY", "LOG_DUMMY",
 				    DICT_HDR_SPACE, 0, n);
 	ind->table = table;
diff --git a/storage/innobase/mtr/mtr0mtr.c b/storage/innobase/mtr/mtr0mtr.cc
index d852ed6f496..4832e8c7710 100644
--- a/storage/innobase/mtr/mtr0mtr.c
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file mtr/mtr0mtr.c
+@file mtr/mtr0mtr.cc
 Mini-transaction buffer
 
 Created 11/26/1995 Heikki Tuuri
@@ -37,6 +37,25 @@ Created 11/26/1995 Heikki Tuuri
 
 #ifndef UNIV_HOTBACKUP
 # include "log0recv.h"
+
+/***************************************************//**
+Checks if a mini-transaction is dirtying a clean page.
+@return TRUE if the mtr is dirtying a clean page. */
+UNIV_INTERN
+ibool
+mtr_block_dirtied(
+/*==============*/
+	const buf_block_t*	block)	/*!< in: block being x-fixed */
+{
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(block->page.buf_fix_count > 0);
+
+	/* It is OK to read oldest_modification because no
+	other thread can be performing a write of it and it
+	is only during write that the value is reset to 0. */
+	return(block->page.oldest_modification == 0);
+}
+
 /*****************************************************************//**
 Releases the item in the slot given. */
 static
@@ -52,6 +71,10 @@ mtr_memo_slot_release(
 	ut_ad(mtr);
 	ut_ad(slot);
 
+	/* slot release is a local operation for the current mtr.
+	We must not be holding the flush_order mutex while
+	doing this. */
+	ut_ad(!log_flush_order_mutex_own());
 #ifndef UNIV_DEBUG
 	UT_NOT_USED(mtr);
 #endif /* UNIV_DEBUG */
@@ -61,9 +84,9 @@ mtr_memo_slot_release(
 
 	if (UNIV_LIKELY(object != NULL)) {
 		if (type <= MTR_MEMO_BUF_FIX) {
-			buf_page_release((buf_block_t*)object, type);
+			buf_page_release((buf_block_t*) object, type);
 		} else if (type == MTR_MEMO_S_LOCK) {
-			rw_lock_s_unlock((rw_lock_t*)object);
+			rw_lock_s_unlock((rw_lock_t*) object);
 #ifdef UNIV_DEBUG
 		} else if (type != MTR_MEMO_X_LOCK) {
 			ut_ad(type == MTR_MEMO_MODIFY);
@@ -71,7 +94,7 @@ mtr_memo_slot_release(
 						MTR_MEMO_PAGE_X_FIX));
 #endif /* UNIV_DEBUG */
 		} else {
-			rw_lock_x_unlock((rw_lock_t*)object);
+			rw_lock_x_unlock((rw_lock_t*) object);
 		}
 	}
 
@@ -102,7 +125,9 @@ mtr_memo_pop_all(
 
 	while (offset > 0) {
 		offset -= sizeof(mtr_memo_slot_t);
-		slot = dyn_array_get_element(memo, offset);
+
+		slot = static_cast<mtr_memo_slot_t*>(
+			dyn_array_get_element(memo, offset));
 
 		mtr_memo_slot_release(mtr, slot);
 	}
@@ -124,9 +149,7 @@ mtr_memo_slot_note_modification(
 	if (slot->object != NULL && slot->type == MTR_MEMO_PAGE_X_FIX) {
 		buf_block_t*	block = (buf_block_t*) slot->object;
 
-#ifdef UNIV_DEBUG
-		ut_ad(log_flush_order_mutex_own());
-#endif /* UNIV_DEBUG */
+		ut_ad(!mtr->made_dirty || log_flush_order_mutex_own());
 		buf_flush_note_modification(block, mtr);
 	}
 }
@@ -159,7 +182,9 @@ mtr_memo_note_modifications(
 		mtr_memo_slot_t* slot;
 
 		offset -= sizeof(mtr_memo_slot_t);
-		slot = dyn_array_get_element(memo, offset);
+
+		slot = static_cast<mtr_memo_slot_t*>(
+			dyn_array_get_element(memo, offset));
 
 		mtr_memo_slot_note_modification(mtr, slot);
 	}
@@ -225,7 +250,15 @@ mtr_log_reserve_and_write(
 	mtr->end_lsn = log_close();
 
 func_exit:
-	log_flush_order_mutex_enter();
+
+	/* No need to acquire log_flush_order_mutex if this mtr has
+	not dirtied a clean page. log_flush_order_mutex is used to
+	ensure ordered insertions in the flush_list. We need to
+	insert in the flush_list iff the page in question was clean
+	before modifications. */
+	if (mtr->made_dirty) {
+		log_flush_order_mutex_enter();
+	}
 
 	/* It is now safe to release the log mutex because the
 	flush_order mutex will ensure that we are the first one
@@ -236,7 +269,9 @@ func_exit:
 		mtr_memo_note_modifications(mtr);
 	}
 
-	log_flush_order_mutex_exit();
+	if (mtr->made_dirty) {
+		log_flush_order_mutex_exit();
+	}
 }
 #endif /* !UNIV_HOTBACKUP */
 
@@ -271,8 +306,8 @@ mtr_commit(
 	/* Declare everything uninitialized except
 	mtr->start_lsn, mtr->end_lsn and mtr->state. */
 	{
-		ib_uint64_t	start_lsn	= mtr->start_lsn;
-		ib_uint64_t	end_lsn		= mtr->end_lsn;
+		lsn_t	start_lsn	= mtr->start_lsn;
+		lsn_t	end_lsn		= mtr->end_lsn;
 		UNIV_MEM_INVALID(mtr, sizeof *mtr);
 		mtr->start_lsn = start_lsn;
 		mtr->end_lsn = end_lsn;
@@ -304,11 +339,11 @@ mtr_memo_release(
 
 	offset = dyn_array_get_data_size(memo);
 
-	log_flush_order_mutex_enter();
 	while (offset > 0) {
 		offset -= sizeof(mtr_memo_slot_t);
 
-		slot = dyn_array_get_element(memo, offset);
+		slot = static_cast<mtr_memo_slot_t*>(
+			dyn_array_get_element(memo, offset));
 
 		if (object == slot->object && type == slot->type) {
 
@@ -323,7 +358,6 @@ mtr_memo_release(
 			break;
 		}
 	}
-	log_flush_order_mutex_exit();
 }
 #endif /* !UNIV_HOTBACKUP */
 
diff --git a/storage/innobase/os/os0file.c b/storage/innobase/os/os0file.cc
index 15e66167f26..ec9e234229a 100644
--- a/storage/innobase/os/os0file.c
+++ b/storage/innobase/os/os0file.cc
@@ -1,6 +1,6 @@
 /***********************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2009, Percona Inc.
 
 Portions of this file contain modifications contributed and copyrighted
@@ -19,14 +19,14 @@ WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 Public License for more details.
 
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 ***********************************************************************/
 
 /**************************************************//**
-@file os/os0file.c
+@file os/os0file.cc
 The interface to the operating system file i/o primitives
 
 Created 10/21/1995 Heikki Tuuri
@@ -43,6 +43,7 @@ Created 10/21/1995 Heikki Tuuri
 #include "srv0start.h"
 #include "fil0fil.h"
 #include "buf0buf.h"
+#include "srv0mon.h"
 #ifndef UNIV_HOTBACKUP
 # include "os0sync.h"
 # include "os0thread.h"
@@ -72,14 +73,6 @@ UNIV_INTERN ulint	os_innodb_umask
 UNIV_INTERN ulint	os_innodb_umask		= 0;
 #endif
 
-#ifdef UNIV_DO_FLUSH
-/* If the following is set to TRUE, we do not call os_file_flush in every
-os_file_write. We can set this TRUE when the doublewrite buffer is used. */
-UNIV_INTERN ibool	os_do_not_call_flush_at_each_write	= FALSE;
-#else
-/* We do not call os_file_flush in every os_file_write. */
-#endif /* UNIV_DO_FLUSH */
-
 #ifndef UNIV_HOTBACKUP
 /* We use these mutexes to protect lseek + file i/o operation, if the
 OS does not provide an atomic pread or pwrite, or similar */
@@ -105,7 +98,7 @@ of the high level design.
 There are four io-threads (for ibuf, log, read, write).
 All synchronous IO requests are serviced by the calling thread using
 os_file_write/os_file_read. The Asynchronous requests are queued up
-in an array (there are four such arrays) by the calling thread. 
+in an array (there are four such arrays) by the calling thread.
 Later these requests are picked up by the io-thread and are serviced
 synchronously.
 
@@ -167,9 +160,7 @@ struct os_aio_slot_struct{
 					write */
 	byte*		buf;		/*!< buffer used in i/o */
 	ulint		type;		/*!< OS_FILE_READ or OS_FILE_WRITE */
-	ulint		offset;		/*!< 32 low bits of file offset in
-					bytes */
-	ulint		offset_high;	/*!< 32 high bits of file offset */
+	os_offset_t	offset;		/*!< file offset in bytes */
 	os_file_t	file;		/*!< file where to read or write */
 	const char*	name;		/*!< file name or path */
 	ibool		io_already_done;/*!< used only in simulated aio:
@@ -236,7 +227,7 @@ struct os_aio_array_struct{
 
 #if defined(LINUX_NATIVE_AIO)
 	io_context_t*		aio_ctx;
-				/* completion queue for IO. There is 
+				/* completion queue for IO. There is
 				one such queue per segment. Each thread
 				will work on one ctx exclusively. */
 	struct io_event*	aio_events;
@@ -259,7 +250,7 @@ struct os_aio_array_struct{
 #endif
 
 /** Array of events used in simulated aio */
-static os_event_t*	os_aio_segment_wait_events	= NULL;
+static os_event_t*	os_aio_segment_wait_events = NULL;
 
 /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
 are NULL when the module has not yet been initialized. @{ */
@@ -289,10 +280,12 @@ UNIV_INTERN time_t	os_last_printout;
 
 UNIV_INTERN ibool	os_has_said_disk_full	= FALSE;
 
-#ifndef UNIV_HOTBACKUP
+#if !defined(UNIV_HOTBACKUP)	\
+    && (!defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8)
 /** The mutex protecting the following counts of pending I/O operations */
 static os_mutex_t	os_file_count_mutex;
-#endif /* !UNIV_HOTBACKUP */
+#endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */
+
 /** Number of pending os_file_pread() operations */
 UNIV_INTERN ulint	os_file_n_pending_preads  = 0;
 /** Number of pending os_file_pwrite() operations */
@@ -303,6 +296,7 @@ UNIV_INTERN ulint	os_n_pending_writes = 0;
 UNIV_INTERN ulint	os_n_pending_reads = 0;
 
 #ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
 Validates the consistency the aio system some of the time.
 @return	TRUE if ok or the check was skipped */
@@ -329,6 +323,7 @@ os_aio_validate_skip(void)
 	os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
 	return(os_aio_validate());
 }
+# endif /* !UNIV_HOTBACKUP */
 #endif /* UNIV_DEBUG */
 
 #ifdef __WIN__
@@ -378,12 +373,14 @@ The number should be retrieved before any other OS calls (because they may
 overwrite the error number). If the number is not known to this program,
 the OS error number + 100 is returned.
 @return	error number, or OS error number + 100 */
-UNIV_INTERN
+static
 ulint
-os_file_get_last_error(
-/*===================*/
-	ibool	report_all_errors)	/*!< in: TRUE if we want an error message
-					printed of all errors */
+os_file_get_last_error_low(
+/*=======================*/
+	ibool	report_all_errors,	/*!< in: TRUE if we want an error
+					message printed of all errors */
+	ibool	on_error_silent)	/*!< in: TRUE then don't print any
+					diagnostic to the log */
 {
 	ulint	err;
 
@@ -392,7 +389,9 @@ os_file_get_last_error(
 	err = (ulint) GetLastError();
 
 	if (report_all_errors
-	    || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
+	    || (!on_error_silent
+		&& err != ERROR_DISK_FULL
+		&& err != ERROR_FILE_EXISTS)) {
 
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
@@ -473,7 +472,7 @@ os_file_get_last_error(
 	err = (ulint) errno;
 
 	if (report_all_errors
-	    || (err != ENOSPC && err != EEXIST)) {
+	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
 
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
@@ -498,13 +497,14 @@ os_file_get_last_error(
 				" the access rights to\n"
 				"InnoDB: the directory.\n");
 		} else {
-			if (strerror((int)err) != NULL) {
+			if (strerror((int) err) != NULL) {
 				fprintf(stderr,
 					"InnoDB: Error number %lu"
 					" means '%s'.\n",
-					err, strerror((int)err));
+					err, strerror((int) err));
 			}
 
+
 			fprintf(stderr,
 				"InnoDB: Some operating system"
 				" error numbers are described at\n"
@@ -542,10 +542,26 @@ os_file_get_last_error(
 #endif
 }
 
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return	error number, or OS error number + 100 */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+	ibool	report_all_errors)	/*!< in: TRUE if we want an error
+					message printed of all errors */
+{
+	return(os_file_get_last_error_low(report_all_errors, FALSE));
+}
+
 /****************************************************************//**
 Does error handling when a file operation fails.
 Conditionally exits (calling exit(3)) based on should_exit value and the
-error type
+error type, if should_exit is TRUE then on_error_silent is ignored.
 @return	TRUE if we should retry the operation */
 static
 ibool
@@ -553,14 +569,18 @@ os_file_handle_error_cond_exit(
 /*===========================*/
 	const char*	name,		/*!< in: name of a file or NULL */
 	const char*	operation,	/*!< in: operation */
-	ibool		should_exit)	/*!< in: call exit(3) if unknown error
+	ibool		should_exit,	/*!< in: call exit(3) if unknown error
 					and this parameter is TRUE */
+	ibool		on_error_silent)/*!< in: if TRUE then don't print
+					any message to the log iff it is
+					an unknown non-fatal error */
 {
 	ulint	err;
 
-	err = os_file_get_last_error(FALSE);
+	err = os_file_get_last_error_low(FALSE, on_error_silent);
 
-	if (err == OS_FILE_DISK_FULL) {
+	switch (err) {
+	case OS_FILE_DISK_FULL:
 		/* We only print a warning about disk full once */
 
 		if (os_has_said_disk_full) {
@@ -568,6 +588,9 @@ os_file_handle_error_cond_exit(
 			return(FALSE);
 		}
 
+		/* Disk full error is reported irrespective of the
+		on_error_silent setting. */
+
 		if (name) {
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
@@ -585,42 +608,53 @@ os_file_handle_error_cond_exit(
 		fflush(stderr);
 
 		return(FALSE);
-	} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
 
-		return(TRUE);
-	} else if (err == OS_FILE_AIO_INTERRUPTED) {
+	case OS_FILE_AIO_RESOURCES_RESERVED:
+	case OS_FILE_AIO_INTERRUPTED:
 
 		return(TRUE);
-	} else if (err == OS_FILE_ALREADY_EXISTS
-		   || err == OS_FILE_PATH_ERROR) {
+
+	case OS_FILE_PATH_ERROR:
+	case OS_FILE_ALREADY_EXISTS:
 
 		return(FALSE);
-	} else if (err == OS_FILE_SHARING_VIOLATION) {
+
+	case OS_FILE_SHARING_VIOLATION:
 
 		os_thread_sleep(10000000);  /* 10 sec */
 		return(TRUE);
-	} else if (err == OS_FILE_INSUFFICIENT_RESOURCE) {
 
-		os_thread_sleep(100000);	/* 100 ms */
-		return(TRUE);
-	} else if (err == OS_FILE_OPERATION_ABORTED) {
+	case OS_FILE_OPERATION_ABORTED:
+	case OS_FILE_INSUFFICIENT_RESOURCE:
 
 		os_thread_sleep(100000);	/* 100 ms */
 		return(TRUE);
-	} else {
-		if (name) {
-			fprintf(stderr, "InnoDB: File name %s\n", name);
-		}
 
-		fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
-			operation);
+	default:
+
+		/* If it is an operation that can crash on error then it
+		is better to ignore on_error_silent and print an error message
+		to the log. */
+
+		if (should_exit || !on_error_silent) {
+			if (name) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: File name %s\n", name);
+			}
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: File operation call: "
+				"'%s'.\n", operation);
+		}
 
 		if (should_exit) {
-			fprintf(stderr, "InnoDB: Cannot continue operation.\n");
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Cannot continue "
+				"operation.\n");
 
 			fflush(stderr);
-
-			exit(1);
+			ut_error;
 		}
 	}
 
@@ -634,11 +668,11 @@ static
 ibool
 os_file_handle_error(
 /*=================*/
-	const char*	name,	/*!< in: name of a file or NULL */
-	const char*	operation)/*!< in: operation */
+	const char*	name,		/*!< in: name of a file or NULL */
+	const char*	operation)	/*!< in: operation */
 {
 	/* exit in case of unknown error */
-	return(os_file_handle_error_cond_exit(name, operation, TRUE));
+	return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
 }
 
 /****************************************************************//**
@@ -648,11 +682,14 @@ static
 ibool
 os_file_handle_error_no_exit(
 /*=========================*/
-	const char*	name,	/*!< in: name of a file or NULL */
-	const char*	operation)/*!< in: operation */
+	const char*	name,		/*!< in: name of a file or NULL */
+	const char*	operation,	/*!< in: operation */
+	ibool		on_error_silent)/*!< in: if TRUE then don't print
+					any message to the log. */
 {
 	/* don't exit in case of unknown error */
-	return(os_file_handle_error_cond_exit(name, operation, FALSE));
+	return(os_file_handle_error_cond_exit(
+			name, operation, FALSE, on_error_silent));
 }
 
 #undef USE_FILE_LOCK
@@ -707,7 +744,9 @@ os_io_init_simple(void)
 {
 	ulint	i;
 
+#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
 	os_file_count_mutex = os_mutex_create();
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
 
 	for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
 		os_file_seek_mutexes[i] = os_mutex_create();
@@ -776,7 +815,8 @@ os_file_opendir(
 	the first entry in the directory. Since it is '.', that is no problem,
 	as we will skip over the '.' and '..' entries anyway. */
 
-	lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
+	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
+		ut_malloc(sizeof(WIN32_FIND_DATA)));
 
 	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
 
@@ -818,7 +858,7 @@ os_file_closedir(
 	ret = FindClose(dir);
 
 	if (!ret) {
-		os_file_handle_error_no_exit(NULL, "closedir");
+		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
 
 		return(-1);
 	}
@@ -830,7 +870,7 @@ os_file_closedir(
 	ret = closedir(dir);
 
 	if (ret) {
-		os_file_handle_error_no_exit(NULL, "closedir");
+		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
 	}
 
 	return(ret);
@@ -853,21 +893,22 @@ os_file_readdir_next_file(
 	LPWIN32_FIND_DATA	lpFindFileData;
 	BOOL			ret;
 
-	lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
+	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
+		ut_malloc(sizeof(WIN32_FIND_DATA)));
 next_file:
 	ret = FindNextFile(dir, lpFindFileData);
 
 	if (ret) {
-		ut_a(strlen((char *) lpFindFileData->cFileName)
+		ut_a(strlen((char*) lpFindFileData->cFileName)
 		     < OS_FILE_MAX_PATH);
 
-		if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
-		    || strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
+		if (strcmp((char*) lpFindFileData->cFileName, ".") == 0
+		    || strcmp((char*) lpFindFileData->cFileName, "..") == 0) {
 
 			goto next_file;
 		}
 
-		strcpy(info->name, (char *) lpFindFileData->cFileName);
+		strcpy(info->name, (char*) lpFindFileData->cFileName);
 
 		info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
 			+ (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
@@ -901,8 +942,7 @@ next_file:
 
 		return(1);
 	} else {
-		os_file_handle_error_no_exit(dirname,
-					     "readdir_next_file");
+		os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
 		return(-1);
 	}
 #else
@@ -921,7 +961,7 @@ next_file:
 next_file:
 
 #ifdef HAVE_READDIR_R
-	ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
+	ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
 
 	if (ret != 0
 #ifdef UNIV_AIX
@@ -934,7 +974,7 @@ next_file:
 	   ) {
 		fprintf(stderr,
 			"InnoDB: cannot read directory %s, error %lu\n",
-			dirname, (ulong)ret);
+			dirname, (ulong) ret);
 
 		return(-1);
 	}
@@ -963,7 +1003,8 @@ next_file:
 
 	strcpy(info->name, ent->d_name);
 
-	full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
+	full_path = static_cast<char*>(
+		ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
 
 	sprintf(full_path, "%s/%s", dirname, ent->d_name);
 
@@ -987,14 +1028,14 @@ next_file:
 			goto next_file;
 		}
 
-		os_file_handle_error_no_exit(full_path, "stat");
+		os_file_handle_error_no_exit(full_path, "stat", FALSE);
 
 		ut_free(full_path);
 
 		return(-1);
 	}
 
-	info->size = (ib_int64_t)statinfo.st_size;
+	info->size = (ib_int64_t) statinfo.st_size;
 
 	if (S_ISDIR(statinfo.st_mode)) {
 		info->type = OS_FILE_TYPE_DIR;
@@ -1069,13 +1110,7 @@ os_file_create_simple_func(
 /*=======================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file is
-				opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error), or
-				OS_FILE_CREATE_PATH if new file
-				(if exists, error) and subdirectories along
-				its path are created (if needed)*/
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
 				OS_FILE_READ_WRITE */
 	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
@@ -1087,6 +1122,8 @@ os_file_create_simple_func(
 	DWORD		attributes	= 0;
 	ibool		retry;
 
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
 try_again:
 	ut_a(name);
 
@@ -1145,6 +1182,9 @@ try_again:
 	int		create_flag;
 	ibool		retry;
 
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
 try_again:
 	ut_a(name);
 
@@ -1212,10 +1252,7 @@ os_file_create_simple_no_error_handling_func(
 /*=========================================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error) */
+	ulint		create_mode,/*!< in: create mode */
 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
 				OS_FILE_READ_WRITE, or
 				OS_FILE_READ_ALLOW_DELETE; the last option is
@@ -1231,6 +1268,9 @@ os_file_create_simple_no_error_handling_func(
 
 	ut_a(name);
 
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
 	if (create_mode == OS_FILE_OPEN) {
 		create_flag = OPEN_EXISTING;
 	} else if (create_mode == OS_FILE_CREATE) {
@@ -1277,6 +1317,9 @@ os_file_create_simple_no_error_handling_func(
 
 	ut_a(name);
 
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
 	if (create_mode == OS_FILE_OPEN) {
 		if (access_type == OS_FILE_READ_ONLY) {
 			create_flag = O_RDONLY;
@@ -1332,7 +1375,7 @@ os_file_set_nocache(
 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
 	if (directio(fd, DIRECTIO_ON) == -1) {
 		int	errno_save;
-		errno_save = (int)errno;
+		errno_save = (int) errno;
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
 			"  InnoDB: Failed to set DIRECTIO_ON "
@@ -1342,7 +1385,7 @@ os_file_set_nocache(
 #elif defined(O_DIRECT)
 	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
 		int	errno_save;
-		errno_save = (int)errno;
+		errno_save = (int) errno;
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
 			"  InnoDB: Failed to set O_DIRECT "
@@ -1371,14 +1414,7 @@ os_file_create_func(
 /*================*/
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
-	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
-				is opened (if does not exist, error), or
-				OS_FILE_CREATE if a new file is created
-				(if exists, error),
-				OS_FILE_OVERWRITE if a new file is created
-				or an old overwritten;
-				OS_FILE_OPEN_RAW, if a raw device or disk
-				partition should be opened */
+	ulint		create_mode,/*!< in: create mode */
 	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
 				non-buffered i/o is desired,
 				OS_FILE_NORMAL, if any normal file;
@@ -1389,12 +1425,24 @@ os_file_create_func(
 	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
 	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
 {
+	ibool		on_error_no_exit;
+	ibool		on_error_silent;
+
 #ifdef __WIN__
 	os_file_t	file;
 	DWORD		share_mode	= FILE_SHARE_READ;
 	DWORD		create_flag;
 	DWORD		attributes;
 	ibool		retry;
+
+	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+		? TRUE : FALSE;
+	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+		? TRUE : FALSE;
+
+	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
+	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
+
 try_again:
 	ut_a(name);
 
@@ -1477,23 +1525,17 @@ try_again:
 			  NULL);	/*!< no template file */
 
 	if (file == INVALID_HANDLE_VALUE) {
+		const char*	operation;
+
+		operation = create_mode == OS_FILE_CREATE ? "create" : "open";
+
 		*success = FALSE;
 
-		/* When srv_file_per_table is on, file creation failure may not
-		be critical to the whole instance. Do not crash the server in
-		case of unknown errors.
-		Please note "srv_file_per_table" is a global variable with
-		no explicit synchronization protection. It could be
-		changed during this execution path. It might not have the
-		same value as the one when building the table definition */
-		if (srv_file_per_table) {
-			retry = os_file_handle_error_no_exit(name,
-						create_mode == OS_FILE_CREATE ?
-						"create" : "open");
+		if (on_error_no_exit) {
+			retry = os_file_handle_error_no_exit(
+				name, operation, on_error_silent);
 		} else {
-			retry = os_file_handle_error(name,
-						create_mode == OS_FILE_CREATE ?
-						"create" : "open");
+			retry = os_file_handle_error(name, operation);
 		}
 
 		if (retry) {
@@ -1510,6 +1552,14 @@ try_again:
 	ibool		retry;
 	const char*	mode_str	= NULL;
 
+	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+		? TRUE : FALSE;
+	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+		? TRUE : FALSE;
+
+	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
+	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
+
 try_again:
 	ut_a(name);
 
@@ -1549,23 +1599,17 @@ try_again:
 	file = open(name, create_flag, os_innodb_umask);
 
 	if (file == -1) {
+		const char*	operation;
+
+		operation = create_mode == OS_FILE_CREATE ? "create" : "open";
+
 		*success = FALSE;
 
-		/* When srv_file_per_table is on, file creation failure may not
-		be critical to the whole instance. Do not crash the server in
-		case of unknown errors.
-		Please note "srv_file_per_table" is a global variable with
-		no explicit synchronization protection. It could be
-		changed during this execution path. It might not have the
-		same value as the one when building the table definition */
-		if (srv_file_per_table) {
-			retry = os_file_handle_error_no_exit(name,
-						create_mode == OS_FILE_CREATE ?
-						"create" : "open");
+		if (on_error_no_exit) {
+			retry = os_file_handle_error_no_exit(
+				name, operation, on_error_silent);
 		} else {
-			retry = os_file_handle_error(name,
-						create_mode == OS_FILE_CREATE ?
-						"create" : "open");
+			retry = os_file_handle_error(name, operation);
 		}
 
 		if (retry) {
@@ -1581,7 +1625,7 @@ try_again:
 	/* We disable OS caching (O_DIRECT) only on data files */
 	if (type != OS_LOG_FILE
 	    && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
-		
+
 		os_file_set_nocache(file, name, mode_str);
 	}
 
@@ -1591,7 +1635,7 @@ try_again:
 		if (create_mode == OS_FILE_OPEN_RETRY) {
 			int i;
 			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Retrying to lock"
+			fputs(" InnoDB: Retrying to lock"
 			      " the first data file\n",
 			      stderr);
 			for (i = 0; i < 100; i++) {
@@ -1602,7 +1646,7 @@ try_again:
 				}
 			}
 			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Unable to open the first data file\n",
+			fputs(" InnoDB: Unable to open the first data file\n",
 			      stderr);
 		}
 
@@ -1623,7 +1667,8 @@ UNIV_INTERN
 ibool
 os_file_delete_if_exists(
 /*=====================*/
-	const char*	name)	/*!< in: file path as a null-terminated string */
+	const char*	name)	/*!< in: file path as a null-terminated
+				string */
 {
 #ifdef __WIN__
 	BOOL	ret;
@@ -1632,7 +1677,7 @@ loop:
 	/* In Windows, deleting an .ibd file may fail if ibbackup is copying
 	it */
 
-	ret = DeleteFile((LPCTSTR)name);
+	ret = DeleteFile((LPCTSTR) name);
 
 	if (ret) {
 		return(TRUE);
@@ -1669,7 +1714,7 @@ loop:
 	ret = unlink(name);
 
 	if (ret != 0 && errno != ENOENT) {
-		os_file_handle_error_no_exit(name, "delete");
+		os_file_handle_error_no_exit(name, "delete", FALSE);
 
 		return(FALSE);
 	}
@@ -1685,7 +1730,8 @@ UNIV_INTERN
 ibool
 os_file_delete(
 /*===========*/
-	const char*	name)	/*!< in: file path as a null-terminated string */
+	const char*	name)	/*!< in: file path as a null-terminated
+				string */
 {
 #ifdef __WIN__
 	BOOL	ret;
@@ -1694,7 +1740,7 @@ loop:
 	/* In Windows, deleting an .ibd file may fail if ibbackup is copying
 	it */
 
-	ret = DeleteFile((LPCTSTR)name);
+	ret = DeleteFile((LPCTSTR) name);
 
 	if (ret) {
 		return(TRUE);
@@ -1732,7 +1778,7 @@ loop:
 	ret = unlink(name);
 
 	if (ret != 0) {
-		os_file_handle_error_no_exit(name, "delete");
+		os_file_handle_error_no_exit(name, "delete", FALSE);
 
 		return(FALSE);
 	}
@@ -1757,13 +1803,13 @@ os_file_rename_func(
 #ifdef __WIN__
 	BOOL	ret;
 
-	ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
+	ret = MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath);
 
 	if (ret) {
 		return(TRUE);
 	}
 
-	os_file_handle_error_no_exit(oldpath, "rename");
+	os_file_handle_error_no_exit(oldpath, "rename", FALSE);
 
 	return(FALSE);
 #else
@@ -1772,7 +1818,7 @@ os_file_rename_func(
 	ret = rename(oldpath, newpath);
 
 	if (ret != 0) {
-		os_file_handle_error_no_exit(oldpath, "rename");
+		os_file_handle_error_no_exit(oldpath, "rename", FALSE);
 
 		return(FALSE);
 	}
@@ -1860,76 +1906,33 @@ os_file_close_no_error_handling(
 
 /***********************************************************************//**
 Gets a file size.
-@return	TRUE if success */
+@return	file size, or (os_offset_t) -1 on failure */
 UNIV_INTERN
-ibool
+os_offset_t
 os_file_get_size(
 /*=============*/
-	os_file_t	file,	/*!< in: handle to a file */
-	ulint*		size,	/*!< out: least significant 32 bits of file
-				size */
-	ulint*		size_high)/*!< out: most significant 32 bits of size */
+	os_file_t	file)	/*!< in: handle to a file */
 {
 #ifdef __WIN__
-	DWORD	high;
-	DWORD	low;
+	os_offset_t	offset;
+	DWORD		high;
+	DWORD		low;
 
 	low = GetFileSize(file, &high);
 
 	if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
-		return(FALSE);
+		return((os_offset_t) -1);
 	}
 
-	*size = low;
-	*size_high = high;
+	offset = (os_offset_t) low | ((os_offset_t) high << 32);
 
-	return(TRUE);
+	return(offset);
 #else
-	off_t	offs;
-
-	offs = lseek(file, 0, SEEK_END);
-
-	if (offs == ((off_t)-1)) {
-
-		return(FALSE);
-	}
-
-	if (sizeof(off_t) > 4) {
-		*size = (ulint)(offs & 0xFFFFFFFFUL);
-		*size_high = (ulint)(offs >> 32);
-	} else {
-		*size = (ulint) offs;
-		*size_high = 0;
-	}
-
-	return(TRUE);
+	return((os_offset_t) lseek(file, 0, SEEK_END));
 #endif
 }
 
 /***********************************************************************//**
-Gets file size as a 64-bit integer ib_int64_t.
-@return	size in bytes, -1 if error */
-UNIV_INTERN
-ib_int64_t
-os_file_get_size_as_iblonglong(
-/*===========================*/
-	os_file_t	file)	/*!< in: handle to a file */
-{
-	ulint	size;
-	ulint	size_high;
-	ibool	success;
-
-	success = os_file_get_size(file, &size, &size_high);
-
-	if (!success) {
-
-		return(-1);
-	}
-
-	return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size);
-}
-
-/***********************************************************************//**
 Write the specified number of zeros to a newly created file.
 @return	TRUE if success */
 UNIV_INTERN
@@ -1939,69 +1942,60 @@ os_file_set_size(
 	const char*	name,	/*!< in: name of the file or path as a
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
-	ulint		size,	/*!< in: least significant 32 bits of file
-				size */
-	ulint		size_high)/*!< in: most significant 32 bits of size */
+	os_offset_t	size)	/*!< in: file size */
 {
-	ib_int64_t	current_size;
-	ib_int64_t	desired_size;
+	os_offset_t	current_size;
 	ibool		ret;
 	byte*		buf;
 	byte*		buf2;
 	ulint		buf_size;
 
-	ut_a(size == (size & 0xFFFFFFFF));
-
 	current_size = 0;
-	desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
 
 	/* Write up to 1 megabyte at a time. */
-	buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
+	buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
 		* UNIV_PAGE_SIZE;
-	buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE);
+	buf2 = static_cast<byte*>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
 
 	/* Align the buffer for possible raw i/o */
-	buf = ut_align(buf2, UNIV_PAGE_SIZE);
+	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 
 	/* Write buffer full of zeros */
 	memset(buf, 0, buf_size);
 
-	if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
+	if (size >= (os_offset_t) 100 << 20) {
 
 		fprintf(stderr, "InnoDB: Progress in MB:");
 	}
 
-	while (current_size < desired_size) {
+	while (current_size < size) {
 		ulint	n_bytes;
 
-		if (desired_size - current_size < (ib_int64_t) buf_size) {
-			n_bytes = (ulint) (desired_size - current_size);
+		if (size - current_size < (os_offset_t) buf_size) {
+			n_bytes = (ulint) (size - current_size);
 		} else {
 			n_bytes = buf_size;
 		}
 
-		ret = os_file_write(name, file, buf,
-				    (ulint)(current_size & 0xFFFFFFFF),
-				    (ulint)(current_size >> 32),
-				    n_bytes);
+		ret = os_file_write(name, file, buf, current_size, n_bytes);
 		if (!ret) {
 			ut_free(buf2);
 			goto error_handling;
 		}
 
 		/* Print about progress for each 100 MB written */
-		if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024)
-		    != current_size / (ib_int64_t)(100 * 1024 * 1024)) {
+		if ((current_size + n_bytes) / (100 << 20)
+		    != current_size / (100 << 20)) {
 
 			fprintf(stderr, " %lu00",
 				(ulong) ((current_size + n_bytes)
-					 / (ib_int64_t)(100 * 1024 * 1024)));
+					 / (100 << 20)));
 		}
 
 		current_size += n_bytes;
 	}
 
-	if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
+	if (size >= (os_offset_t) 100 << 20) {
 
 		fprintf(stderr, "\n");
 	}
@@ -2065,7 +2059,7 @@ os_file_fsync(
 
 				ut_print_timestamp(stderr);
 				fprintf(stderr,
-					"  InnoDB: fsync(): "
+					" InnoDB: fsync(): "
 					"No locks available; retrying\n");
 			}
 
@@ -2171,7 +2165,7 @@ os_file_flush_func(
 	ut_print_timestamp(stderr);
 
 	fprintf(stderr,
-		"  InnoDB: Error: the OS said file flush did not succeed\n");
+		" InnoDB: Error: the OS said file flush did not succeed\n");
 
 	os_file_handle_error(NULL, "flush");
 
@@ -2187,35 +2181,28 @@ os_file_flush_func(
 /*******************************************************************//**
 Does a synchronous read operation in Posix.
 @return	number of bytes read, -1 if error */
-static
+static __attribute__((nonnull, warn_unused_result))
 ssize_t
 os_file_pread(
 /*==========*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
 	ulint		n,	/*!< in: number of bytes to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset from where to read */
-	ulint		offset_high) /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset)	/*!< in: file offset from where to read */
 {
 	off_t	offs;
 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
 	ssize_t	n_bytes;
 #endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
 
-	ut_a((offset & 0xFFFFFFFFUL) == offset);
+	ut_ad(n);
 
 	/* If off_t is > 4 bytes in size, then we assume we can pass a
 	64-bit address */
+	offs = (off_t) offset;
 
-	if (sizeof(off_t) > 4) {
-		offs = (off_t)offset + (((off_t)offset_high) << 32);
-
-	} else {
-		offs = (off_t)offset;
-
-		if (offset_high > 0) {
+	if (sizeof(off_t) <= 4) {
+		if (UNIV_UNLIKELY(offset != (os_offset_t) offs)) {
 			fprintf(stderr,
 				"InnoDB: Error: file read at offset > 4 GB\n");
 		}
@@ -2224,17 +2211,31 @@ os_file_pread(
 	os_n_file_reads++;
 
 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+	(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
+	(void) os_atomic_increment_ulint(&os_file_n_pending_preads, 1);
+	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
+#else
 	os_mutex_enter(os_file_count_mutex);
 	os_file_n_pending_preads++;
 	os_n_pending_reads++;
+	MONITOR_INC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
 
-	n_bytes = pread(file, buf, (ssize_t)n, offs);
+	n_bytes = pread(file, buf, n, offs);
 
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+	(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
+	(void) os_atomic_decrement_ulint(&os_file_n_pending_preads, 1);
+	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
+#else
 	os_mutex_enter(os_file_count_mutex);
 	os_file_n_pending_preads--;
 	os_n_pending_reads--;
+	MONITOR_DEC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD == 8 */
 
 	return(n_bytes);
 #else
@@ -2245,10 +2246,15 @@ os_file_pread(
 		ulint	i;
 #endif /* !UNIV_HOTBACKUP */
 
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+		(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
+		MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
+#else
 		os_mutex_enter(os_file_count_mutex);
 		os_n_pending_reads++;
+		MONITOR_INC(MONITOR_OS_PENDING_READS);
 		os_mutex_exit(os_file_count_mutex);
-
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
 #ifndef UNIV_HOTBACKUP
 		/* Protect the seek / read operation with a mutex */
 		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
@@ -2261,16 +2267,22 @@ os_file_pread(
 		if (ret_offset < 0) {
 			ret = -1;
 		} else {
-			ret = read(file, buf, (ssize_t)n);
+			ret = read(file, buf, (ssize_t) n);
 		}
 
 #ifndef UNIV_HOTBACKUP
 		os_mutex_exit(os_file_seek_mutexes[i]);
 #endif /* !UNIV_HOTBACKUP */
 
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+		(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
+		MONITOR_ATOIC_DEC(MONITOR_OS_PENDING_READS);
+#else
 		os_mutex_enter(os_file_count_mutex);
 		os_n_pending_reads--;
+		MONITOR_DEC(MONITOR_OS_PENDING_READS);
 		os_mutex_exit(os_file_count_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
 
 		return(ret);
 	}
@@ -2280,32 +2292,26 @@ os_file_pread(
 /*******************************************************************//**
 Does a synchronous write operation in Posix.
 @return	number of bytes written, -1 if error */
-static
+static __attribute__((nonnull, warn_unused_result))
 ssize_t
 os_file_pwrite(
 /*===========*/
 	os_file_t	file,	/*!< in: handle to a file */
 	const void*	buf,	/*!< in: buffer from where to write */
 	ulint		n,	/*!< in: number of bytes to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to write */
-	ulint		offset_high) /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset)	/*!< in: file offset where to write */
 {
 	ssize_t	ret;
 	off_t	offs;
 
-	ut_a((offset & 0xFFFFFFFFUL) == offset);
+	ut_ad(n);
 
 	/* If off_t is > 4 bytes in size, then we assume we can pass a
 	64-bit address */
+	offs = (off_t) offset;
 
-	if (sizeof(off_t) > 4) {
-		offs = (off_t)offset + (((off_t)offset_high) << 32);
-	} else {
-		offs = (off_t)offset;
-
-		if (offset_high > 0) {
+	if (sizeof(off_t) <= 4) {
+		if (UNIV_UNLIKELY(offset != (os_offset_t) offs)) {
 			fprintf(stderr,
 				"InnoDB: Error: file write"
 				" at offset > 4 GB\n");
@@ -2315,30 +2321,31 @@ os_file_pwrite(
 	os_n_file_writes++;
 
 #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
+#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
 	os_mutex_enter(os_file_count_mutex);
 	os_file_n_pending_pwrites++;
 	os_n_pending_writes++;
+	MONITOR_INC(MONITOR_OS_PENDING_WRITES);
 	os_mutex_exit(os_file_count_mutex);
+#else
+	(void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
+	(void) os_atomic_increment_ulint(&os_file_n_pending_pwrites, 1);
+	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD < 8 */
 
-	ret = pwrite(file, buf, (ssize_t)n, offs);
+	ret = pwrite(file, buf, (ssize_t) n, offs);
 
+#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
 	os_mutex_enter(os_file_count_mutex);
 	os_file_n_pending_pwrites--;
 	os_n_pending_writes--;
+	MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
 	os_mutex_exit(os_file_count_mutex);
-
-# ifdef UNIV_DO_FLUSH
-	if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
-	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
-	    && !os_do_not_call_flush_at_each_write) {
-
-		/* Always do fsync to reduce the probability that when
-		the OS crashes, a database page is only partially
-		physically written to disk. */
-
-		ut_a(TRUE == os_file_flush(file));
-	}
-# endif /* UNIV_DO_FLUSH */
+#else
+	(void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
+	(void) os_atomic_decrement_ulint(&os_file_n_pending_pwrites, 1);
+	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD < 8 */
 
 	return(ret);
 #else
@@ -2350,6 +2357,7 @@ os_file_pwrite(
 
 		os_mutex_enter(os_file_count_mutex);
 		os_n_pending_writes++;
+		MONITOR_INC(MONITOR_OS_PENDING_WRITES);
 		os_mutex_exit(os_file_count_mutex);
 
 # ifndef UNIV_HOTBACKUP
@@ -2367,20 +2375,7 @@ os_file_pwrite(
 			goto func_exit;
 		}
 
-		ret = write(file, buf, (ssize_t)n);
-
-# ifdef UNIV_DO_FLUSH
-		if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
-		    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
-		    && !os_do_not_call_flush_at_each_write) {
-
-			/* Always do fsync to reduce the probability that when
-			the OS crashes, a database page is only partially
-			physically written to disk. */
-
-			ut_a(TRUE == os_file_flush(file));
-		}
-# endif /* UNIV_DO_FLUSH */
+		ret = write(file, buf, (ssize_t) n);
 
 func_exit:
 # ifndef UNIV_HOTBACKUP
@@ -2389,6 +2384,7 @@ func_exit:
 
 		os_mutex_enter(os_file_count_mutex);
 		os_n_pending_writes--;
+		MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
 		os_mutex_exit(os_file_count_mutex);
 
 		return(ret);
@@ -2408,10 +2404,7 @@ os_file_read_func(
 /*==============*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n)	/*!< in: number of bytes to read */
 {
 #ifdef __WIN__
@@ -2427,7 +2420,6 @@ os_file_read_func(
 
 	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
 	no more than 32 bits. */
-	ut_a((offset & 0xFFFFFFFFUL) == offset);
 	ut_a((n & 0xFFFFFFFFUL) == n);
 
 	os_n_file_reads++;
@@ -2438,11 +2430,12 @@ try_again:
 	ut_ad(buf);
 	ut_ad(n > 0);
 
-	low = (DWORD) offset;
-	high = (DWORD) offset_high;
+	low = (DWORD) offset & 0xFFFFFFFF;
+	high = (DWORD) (offset >> 32);
 
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_reads++;
+	MONITOR_INC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
 
 #ifndef UNIV_HOTBACKUP
@@ -2452,7 +2445,8 @@ try_again:
 	os_mutex_enter(os_file_seek_mutexes[i]);
 #endif /* !UNIV_HOTBACKUP */
 
-	ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
+	ret2 = SetFilePointer(
+		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
 
 	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
 
@@ -2462,6 +2456,7 @@ try_again:
 
 		os_mutex_enter(os_file_count_mutex);
 		os_n_pending_reads--;
+		MONITOR_DEC(MONITOR_OS_PENDING_READS);
 		os_mutex_exit(os_file_count_mutex);
 
 		goto error_handling;
@@ -2475,6 +2470,7 @@ try_again:
 
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_reads--;
+	MONITOR_DEC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
 
 	if (ret && len == n) {
@@ -2487,18 +2483,18 @@ try_again:
 	os_bytes_read_since_printout += n;
 
 try_again:
-	ret = os_file_pread(file, buf, n, offset, offset_high);
+	ret = os_file_pread(file, buf, n, offset);
 
-	if ((ulint)ret == n) {
+	if ((ulint) ret == n) {
 
 		return(TRUE);
 	}
 
 	fprintf(stderr,
-		"InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
+		"InnoDB: Error: tried to read "ULINTPF" bytes at offset "
+		UINT64PF"\n"
 		"InnoDB: Was only able to read %ld.\n",
-		(ulong)n, (ulong)offset_high,
-		(ulong)offset, (long)ret);
+		n, offset, (lint) ret);
 #endif /* __WIN__ */
 #ifdef __WIN__
 error_handling:
@@ -2537,10 +2533,7 @@ os_file_read_no_error_handling_func(
 /*================================*/
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read */
 	ulint		n)	/*!< in: number of bytes to read */
 {
 #ifdef __WIN__
@@ -2556,7 +2549,6 @@ os_file_read_no_error_handling_func(
 
 	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
 	no more than 32 bits. */
-	ut_a((offset & 0xFFFFFFFFUL) == offset);
 	ut_a((n & 0xFFFFFFFFUL) == n);
 
 	os_n_file_reads++;
@@ -2567,11 +2559,12 @@ try_again:
 	ut_ad(buf);
 	ut_ad(n > 0);
 
-	low = (DWORD) offset;
-	high = (DWORD) offset_high;
+	low = (DWORD) offset & 0xFFFFFFFF;
+	high = (DWORD) (offset >> 32);
 
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_reads++;
+	MONITOR_INC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
 
 #ifndef UNIV_HOTBACKUP
@@ -2581,7 +2574,8 @@ try_again:
 	os_mutex_enter(os_file_seek_mutexes[i]);
 #endif /* !UNIV_HOTBACKUP */
 
-	ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
+	ret2 = SetFilePointer(
+		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
 
 	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
 
@@ -2591,6 +2585,7 @@ try_again:
 
 		os_mutex_enter(os_file_count_mutex);
 		os_n_pending_reads--;
+		MONITOR_DEC(MONITOR_OS_PENDING_READS);
 		os_mutex_exit(os_file_count_mutex);
 
 		goto error_handling;
@@ -2604,6 +2599,7 @@ try_again:
 
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_reads--;
+	MONITOR_DEC(MONITOR_OS_PENDING_READS);
 	os_mutex_exit(os_file_count_mutex);
 
 	if (ret && len == n) {
@@ -2616,9 +2612,9 @@ try_again:
 	os_bytes_read_since_printout += n;
 
 try_again:
-	ret = os_file_pread(file, buf, n, offset, offset_high);
+	ret = os_file_pread(file, buf, n, offset);
 
-	if ((ulint)ret == n) {
+	if ((ulint) ret == n) {
 
 		return(TRUE);
 	}
@@ -2626,7 +2622,7 @@ try_again:
 #ifdef __WIN__
 error_handling:
 #endif
-	retry = os_file_handle_error_no_exit(NULL, "read");
+	retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
 
 	if (retry) {
 		goto try_again;
@@ -2671,10 +2667,7 @@ os_file_write_func(
 				null-terminated string */
 	os_file_t	file,	/*!< in: handle to a file */
 	const void*	buf,	/*!< in: buffer from which to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to write */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to write */
 	ulint		n)	/*!< in: number of bytes to write */
 {
 #ifdef __WIN__
@@ -2691,7 +2684,6 @@ os_file_write_func(
 
 	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
 	no more than 32 bits. */
-	ut_a((offset & 0xFFFFFFFFUL) == offset);
 	ut_a((n & 0xFFFFFFFFUL) == n);
 
 	os_n_file_writes++;
@@ -2700,11 +2692,12 @@ os_file_write_func(
 	ut_ad(buf);
 	ut_ad(n > 0);
 retry:
-	low = (DWORD) offset;
-	high = (DWORD) offset_high;
+	low = (DWORD) offset & 0xFFFFFFFF;
+	high = (DWORD) (offset >> 32);
 
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_writes++;
+	MONITOR_INC(MONITOR_OS_PENDING_WRITES);
 	os_mutex_exit(os_file_count_mutex);
 
 #ifndef UNIV_HOTBACKUP
@@ -2714,7 +2707,8 @@ retry:
 	os_mutex_enter(os_file_seek_mutexes[i]);
 #endif /* !UNIV_HOTBACKUP */
 
-	ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
+	ret2 = SetFilePointer(
+		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
 
 	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
 
@@ -2724,42 +2718,34 @@ retry:
 
 		os_mutex_enter(os_file_count_mutex);
 		os_n_pending_writes--;
+		MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
 		os_mutex_exit(os_file_count_mutex);
 
 		ut_print_timestamp(stderr);
 
 		fprintf(stderr,
-			"  InnoDB: Error: File pointer positioning to"
+			" InnoDB: Error: File pointer positioning to"
 			" file %s failed at\n"
-			"InnoDB: offset %lu %lu. Operating system"
+			"InnoDB: offset %llu. Operating system"
 			" error number %lu.\n"
 			"InnoDB: Some operating system error numbers"
 			" are described at\n"
 			"InnoDB: "
 			REFMAN "operating-system-error-codes.html\n",
-			name, (ulong) offset_high, (ulong) offset,
-			(ulong) GetLastError());
+			name, offset, (ulong) GetLastError());
 
 		return(FALSE);
 	}
 
 	ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
 
-	/* Always do fsync to reduce the probability that when the OS crashes,
-	a database page is only partially physically written to disk. */
-
-# ifdef UNIV_DO_FLUSH
-	if (!os_do_not_call_flush_at_each_write) {
-		ut_a(TRUE == os_file_flush(file));
-	}
-# endif /* UNIV_DO_FLUSH */
-
 #ifndef UNIV_HOTBACKUP
 	os_mutex_exit(os_file_seek_mutexes[i]);
 #endif /* !UNIV_HOTBACKUP */
 
 	os_mutex_enter(os_file_count_mutex);
 	os_n_pending_writes--;
+	MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
 	os_mutex_exit(os_file_count_mutex);
 
 	if (ret && len == n) {
@@ -2782,13 +2768,13 @@ retry:
 
 	if (!os_has_said_disk_full) {
 
-		err = (ulint)GetLastError();
+		err = (ulint) GetLastError();
 
 		ut_print_timestamp(stderr);
 
 		fprintf(stderr,
-			"  InnoDB: Error: Write to file %s failed"
-			" at offset %lu %lu.\n"
+			" InnoDB: Error: Write to file %s failed"
+			" at offset %llu.\n"
 			"InnoDB: %lu bytes should have been written,"
 			" only %lu were written.\n"
 			"InnoDB: Operating system error number %lu.\n"
@@ -2796,13 +2782,13 @@ retry:
 			" support files of this size.\n"
 			"InnoDB: Check also that the disk is not full"
 			" or a disk quota exceeded.\n",
-			name, (ulong) offset_high, (ulong) offset,
-			(ulong) n, ret ? len : 0, (ulong) err);
+			name, offset,
+			(ulong) n, (ulong) len, (ulong) err);
 
-		if (strerror((int)err) != NULL) {
+		if (strerror((int) err) != NULL) {
 			fprintf(stderr,
 				"InnoDB: Error number %lu means '%s'.\n",
-				(ulong) err, strerror((int)err));
+				(ulong) err, strerror((int) err));
 		}
 
 		fprintf(stderr,
@@ -2818,9 +2804,9 @@ retry:
 #else
 	ssize_t	ret;
 
-	ret = os_file_pwrite(file, buf, n, offset, offset_high);
+	ret = os_file_pwrite(file, buf, n, offset);
 
-	if ((ulint)ret == n) {
+	if ((ulint) ret == n) {
 
 		return(TRUE);
 	}
@@ -2830,8 +2816,8 @@ retry:
 		ut_print_timestamp(stderr);
 
 		fprintf(stderr,
-			"  InnoDB: Error: Write to file %s failed"
-			" at offset %lu %lu.\n"
+			" InnoDB: Error: Write to file %s failed"
+			" at offset "UINT64PF".\n"
 			"InnoDB: %lu bytes should have been written,"
 			" only %ld were written.\n"
 			"InnoDB: Operating system error number %lu.\n"
@@ -2839,12 +2825,12 @@ retry:
 			" support files of this size.\n"
 			"InnoDB: Check also that the disk is not full"
 			" or a disk quota exceeded.\n",
-			name, offset_high, offset, n, (long int)ret,
-			(ulint)errno);
+			name, offset, n, (lint) ret,
+			(ulint) errno);
 		if (strerror(errno) != NULL) {
 			fprintf(stderr,
 				"InnoDB: Error number %lu means '%s'.\n",
-				(ulint)errno, strerror(errno));
+				(ulint) errno, strerror(errno));
 		}
 
 		fprintf(stderr,
@@ -2883,7 +2869,7 @@ os_file_status(
 	} else if (ret) {
 		/* file exists, but stat call failed */
 
-		os_file_handle_error_no_exit(path, "stat");
+		os_file_handle_error_no_exit(path, "stat", FALSE);
 
 		return(FALSE);
 	}
@@ -2911,7 +2897,7 @@ os_file_status(
 	} else if (ret) {
 		/* file exists, but stat call failed */
 
-		os_file_handle_error_no_exit(path, "stat");
+		os_file_handle_error_no_exit(path, "stat", FALSE);
 
 		return(FALSE);
 	}
@@ -2955,7 +2941,7 @@ os_file_get_status(
 	} else if (ret) {
 		/* file exists, but stat call failed */
 
-		os_file_handle_error_no_exit(path, "stat");
+		os_file_handle_error_no_exit(path, "stat", FALSE);
 
 		return(FALSE);
 	}
@@ -2986,7 +2972,7 @@ os_file_get_status(
 	} else if (ret) {
 		/* file exists, but stat call failed */
 
-		os_file_handle_error_no_exit(path, "stat");
+		os_file_handle_error_no_exit(path, "stat", FALSE);
 
 		return(FALSE);
 	}
@@ -3019,9 +3005,9 @@ os_file_get_status(
 
 /****************************************************************//**
 The function os_file_dirname returns a directory component of a
-null-terminated pathname string.  In the usual case, dirname returns
+null-terminated pathname string. In the usual case, dirname returns
 the string up to, but not including, the final '/', and basename
-is the component following the final '/'.  Trailing '/' charac�
+is the component following the final '/'. Trailing '/' charac�
 ters are not counted as part of the pathname.
 
 If path does not contain a slash, dirname returns the string ".".
@@ -3029,7 +3015,7 @@ If path does not contain a slash, dirname returns the string ".".
 Concatenating the string returned by dirname, a "/", and the basename
 yields a complete pathname.
 
-The return value is  a copy of the directory component of the pathname.
+The return value is a copy of the directory component of the pathname.
 The copy is allocated from heap. It is the caller responsibility
 to free it after it is no longer needed.
 
@@ -3167,7 +3153,7 @@ retry:
 			/* First time around. */
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
-				"  InnoDB: Warning: io_setup() failed"
+				" InnoDB: Warning: io_setup() failed"
 				" with EAGAIN. Will make %d attempts"
 				" before giving up.\n",
 				OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
@@ -3186,7 +3172,7 @@ retry:
 		/* Have tried enough. Better call it a day. */
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: Error: io_setup() failed"
+			" InnoDB: Error: io_setup() failed"
 			" with EAGAIN after %d attempts.\n",
 			OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
 		break;
@@ -3194,7 +3180,7 @@ retry:
 	case -ENOSYS:
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: Error: Linux Native AIO interface"
+			" InnoDB: Error: Linux Native AIO interface"
 			" is not supported on this platform. Please"
 			" check your OS documentation and install"
 			" appropriate binary of InnoDB.\n");
@@ -3204,7 +3190,7 @@ retry:
 	default:
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: Error: Linux Native AIO setup"
+			" InnoDB: Error: Linux Native AIO setup"
 			" returned following error[%d]\n", -ret);
 		break;
 	}
@@ -3253,8 +3239,8 @@ os_aio_native_aio_supported(void)
 
 	memset(&io_event, 0x0, sizeof(io_event));
 
-	buf = (byte*) ut_malloc(UNIV_PAGE_SIZE * 2);
-	ptr = (byte*) ut_align(buf, UNIV_PAGE_SIZE);
+	buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
+	ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
 
 	/* Suppress valgrind warning. */
 	memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
@@ -3325,7 +3311,7 @@ os_aio_array_create(
 	ut_a(n > 0);
 	ut_a(n_segments > 0);
 
-	array = ut_malloc(sizeof(os_aio_array_t));
+	array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(os_aio_array_t)));
 
 	array->mutex		= os_mutex_create();
 	array->not_full		= os_event_create(NULL);
@@ -3337,9 +3323,11 @@ os_aio_array_create(
 	array->n_segments	= n_segments;
 	array->n_reserved	= 0;
 	array->cur_seg		= 0;
-	array->slots		= ut_malloc(n * sizeof(os_aio_slot_t));
+
+	array->slots = static_cast<os_aio_slot_t*>(
+		ut_malloc(n * sizeof(os_aio_slot_t)));
 #ifdef __WIN__
-	array->handles		= ut_malloc(n * sizeof(HANDLE));
+	array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE)));
 #endif
 
 #if defined(LINUX_NATIVE_AIO)
@@ -3355,34 +3343,26 @@ os_aio_array_create(
 	/* Initialize the io_context array. One io_context
 	per segment in the array. */
 
-	array->aio_ctx = ut_malloc(n_segments *
-				   sizeof(*array->aio_ctx));
+	array->aio_ctx = static_cast<io_context**>(
+		ut_malloc(n_segments * sizeof(*array->aio_ctx)));
+
 	for (i = 0; i < n_segments; ++i) {
 		if (!os_aio_linux_create_io_ctx(n/n_segments,
-					   &array->aio_ctx[i])) {
+						&array->aio_ctx[i])) {
 			/* If something bad happened during aio setup
-			we disable linux native aio.
-                        The disadvantage will be a small memory leak
-                        at shutdown but that's ok compared to a crash
-                        or a not working server.
-                        This frequently happens when running the test suite
-                        with many threads on a system with low fs.aio-max-nr!
-                        */
-
-                        fprintf(stderr,
-                                "  InnoDB: Warning: Linux Native AIO disabled "
-                                "because os_aio_linux_create_io_ctx() "
-                                "failed. To get rid of this warning you can "
-                                "try increasing system "
-                                "fs.aio-max-nr to 1048576 or larger or "
-                                "setting innodb_use_native_aio = 0 in my.cnf\n");
-                        srv_use_native_aio = FALSE;
-			goto skip_native_aio;
+			we should call it a day and return right away.
+			We don't care about any leaks because a failure
+			to initialize the io subsystem means that the
+			server (or atleast the innodb storage engine)
+			is not going to startup. */
+			return(NULL);
 		}
 	}
 
 	/* Initialize the event array. One event per slot. */
-	io_event = ut_malloc(n * sizeof(*io_event));
+	io_event = static_cast<struct io_event*>(
+		ut_malloc(n * sizeof(*io_event)));
+
 	memset(io_event, 0x0, sizeof(*io_event) * n);
 	array->aio_events = io_event;
 
@@ -3539,7 +3519,8 @@ os_aio_init(
 
 	os_aio_validate();
 
-	os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*));
+	os_aio_segment_wait_events = static_cast<os_event_struct_t**>(
+		ut_malloc(n_segments * sizeof(void*)));
 
 	for (i = 0; i < n_segments; i++) {
 		os_aio_segment_wait_events[i] = os_event_create(NULL);
@@ -3742,10 +3723,7 @@ os_aio_array_reserve_slot(
 				null-terminated string */
 	void*		buf,	/*!< in: buffer where to read or from which
 				to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset */
 	ulint		len)	/*!< in: length of the block to read or write */
 {
 	os_aio_slot_t*	slot = NULL;
@@ -3774,7 +3752,7 @@ os_aio_array_reserve_slot(
 	segment. This can help in merging IO requests when we are
 	doing simulated AIO */
 	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
-		    % array->n_segments;
+		% array->n_segments;
 
 loop:
 	os_mutex_enter(array->mutex);
@@ -3831,15 +3809,14 @@ found:
 	slot->name     = name;
 	slot->len      = len;
 	slot->type     = type;
-	slot->buf      = buf;
+	slot->buf      = static_cast<byte*>(buf);
 	slot->offset   = offset;
-	slot->offset_high = offset_high;
 	slot->io_already_done = FALSE;
 
 #ifdef WIN_ASYNC_IO
 	control = &(slot->control);
-	control->Offset = (DWORD)offset;
-	control->OffsetHigh = (DWORD)offset_high;
+	control->Offset = (DWORD) offset & 0xFFFFFFFF;
+	control->OffsetHigh = (DWORD) (offset >> 32);
 	ResetEvent(slot->handle);
 
 #elif defined(LINUX_NATIVE_AIO)
@@ -3851,14 +3828,10 @@ found:
 
 	/* Check if we are dealing with 64 bit arch.
 	If not then make sure that offset fits in 32 bits. */
-	if (sizeof(aio_offset) == 8) {
-		aio_offset = offset_high;
-		aio_offset <<= 32;
-		aio_offset += offset;
-	} else {
-		ut_a(offset_high == 0);
-		aio_offset = offset;
-	}
+	aio_offset = (off_t) offset;
+
+	ut_a(sizeof(aio_offset) >= sizeof(offset)
+	     || ((os_offset_t) aio_offset) == offset);
 
 	iocb = &slot->control;
 
@@ -3869,11 +3842,10 @@ found:
 		io_prep_pwrite(iocb, file, buf, len, aio_offset);
 	}
 
-	iocb->data = (void*)slot;
+	iocb->data = (void*) slot;
 	slot->n_bytes = 0;
 	slot->ret = 0;
 	/*fprintf(stderr, "Filled up Linux native iocb.\n");*/
-	
 
 skip_native_aio:
 #endif /* LINUX_NATIVE_AIO */
@@ -4067,7 +4039,7 @@ os_aio_linux_dispatch(
 	fprintf(stderr,
 		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
 		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
-		array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
+		array->aio_ctx[io_ctx_index], (ulong) io_ctx_index);
 #endif
 
 	/* io_submit returns number of successfully
@@ -4109,10 +4081,7 @@ os_aio_func(
 	os_file_t	file,	/*!< in: handle to a file */
 	void*		buf,	/*!< in: buffer where to read or from which
 				to write */
-	ulint		offset,	/*!< in: least significant 32 bits of file
-				offset where to read or write */
-	ulint		offset_high, /*!< in: most significant 32 bits of
-				offset */
+	os_offset_t	offset,	/*!< in: file offset where to read or write */
 	ulint		n,	/*!< in: number of bytes to read or write */
 	fil_node_t*	message1,/*!< in: message for the aio handler
 				(can be used to identify a completed
@@ -4168,14 +4137,12 @@ os_aio_func(
 		and os_file_write_func() */
 
 		if (type == OS_FILE_READ) {
-			return(os_file_read_func(file, buf, offset,
-					    offset_high, n));
+			return(os_file_read_func(file, buf, offset, n));
 		}
 
 		ut_a(type == OS_FILE_WRITE);
 
-		return(os_file_write_func(name, file, buf, offset,
-					  offset_high, n));
+		return(os_file_write_func(name, file, buf, offset, n));
 	}
 
 try_again:
@@ -4211,13 +4178,13 @@ try_again:
 	}
 
 	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
-					 name, buf, offset, offset_high, n);
+					 name, buf, offset, n);
 	if (type == OS_FILE_READ) {
 		if (srv_use_native_aio) {
 			os_n_file_reads++;
 			os_bytes_read_since_printout += n;
 #ifdef WIN_ASYNC_IO
-			ret = ReadFile(file, buf, (DWORD)n, &len,
+			ret = ReadFile(file, buf, (DWORD) n, &len,
 				       &(slot->control));
 
 #elif defined(LINUX_NATIVE_AIO)
@@ -4236,7 +4203,7 @@ try_again:
 		if (srv_use_native_aio) {
 			os_n_file_writes++;
 #ifdef WIN_ASYNC_IO
-			ret = WriteFile(file, buf, (DWORD)n, &len,
+			ret = WriteFile(file, buf, (DWORD) n, &len,
 					&(slot->control));
 
 #elif defined(LINUX_NATIVE_AIO)
@@ -4399,16 +4366,8 @@ os_aio_windows_handle(
 	*type = slot->type;
 
 	if (ret && len == slot->len) {
-		ret_val = TRUE;
 
-#ifdef UNIV_DO_FLUSH
-		if (slot->type == OS_FILE_WRITE
-		    && !os_do_not_call_flush_at_each_write) {
-			if (!os_file_flush(slot->file)) {
-				ut_error;
-			}
-		}
-#endif /* UNIV_DO_FLUSH */
+		ret_val = TRUE;
 	} else if (os_file_handle_error(slot->name, "Windows aio")) {
 
 		retry = TRUE;
@@ -4539,10 +4498,10 @@ retry:
 			os_aio_slot_t*	slot;
 			struct iocb*	control;
 
-			control = (struct iocb *)events[i].obj;
+			control = (struct iocb*) events[i].obj;
 			ut_a(control != NULL);
 
-			slot = (os_aio_slot_t *) control->data;
+			slot = (os_aio_slot_t*) control->data;
 
 			/* Some sanity checks. */
 			ut_a(slot != NULL);
@@ -4597,7 +4556,7 @@ retry:
 	/* All other errors should cause a trap for now. */
 	ut_print_timestamp(stderr);
 	fprintf(stderr,
-		"  InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
+		" InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
 		ret);
 	ut_error;
 }
@@ -4699,16 +4658,9 @@ found:
 
 	*type = slot->type;
 
-	if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
-		ret = TRUE;
+	if ((slot->ret == 0) && (slot->n_bytes == (long) slot->len)) {
 
-#ifdef UNIV_DO_FLUSH
-		if (slot->type == OS_FILE_WRITE
-		    && !os_do_not_call_flush_at_each_write)
-		    && !os_file_flush(slot->file) {
-			ut_error;
-		}
-#endif /* UNIV_DO_FLUSH */
+		ret = TRUE;
 	} else {
 		errno = -slot->ret;
 
@@ -4761,7 +4713,7 @@ os_aio_simulated_handle(
 	ulint		n_consecutive;
 	ulint		total_len;
 	ulint		offs;
-	ulint		lowest_offset;
+	os_offset_t	lowest_offset;
 	ulint		biggest_age;
 	ulint		age;
 	byte*		combined_buf;
@@ -4848,14 +4800,14 @@ restart:
 	then pick the one at the lowest offset. */
 
 	biggest_age = 0;
-	lowest_offset = ULINT_MAX;
+	lowest_offset = IB_UINT64_MAX;
 
 	for (i = 0; i < n; i++) {
 		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 
 		if (slot->reserved) {
-			age = (ulint)difftime(time(NULL),
-					      slot->reservation_time);
+			age = (ulint) difftime(time(NULL),
+					       slot->reservation_time);
 
 			if ((age >= 2 && age > biggest_age)
 			    || (age >= 2 && age == biggest_age
@@ -4877,7 +4829,7 @@ restart:
 		lowest offset in the array (we ignore the high 32 bits of the
 		offset in these heuristics) */
 
-		lowest_offset = ULINT_MAX;
+		lowest_offset = IB_UINT64_MAX;
 
 		for (i = 0; i < n; i++) {
 			slot = os_aio_array_get_nth_slot(array,
@@ -4917,9 +4869,6 @@ consecutive_loop:
 
 		if (slot2->reserved && slot2 != slot
 		    && slot2->offset == slot->offset + slot->len
-		    /* check that sum does not wrap over */
-		    && slot->offset + slot->len > slot->offset
-		    && slot2->offset_high == slot->offset_high
 		    && slot2->type == slot->type
 		    && slot2->file == slot->file) {
 
@@ -4957,11 +4906,13 @@ consecutive_loop:
 		combined_buf = slot->buf;
 		combined_buf2 = NULL;
 	} else {
-		combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
+		combined_buf2 = static_cast<byte*>(
+			ut_malloc(total_len + UNIV_PAGE_SIZE));
 
 		ut_a(combined_buf2);
 
-		combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE);
+		combined_buf = static_cast<byte*>(
+			ut_align(combined_buf2, UNIV_PAGE_SIZE));
 	}
 
 	/* We release the array mutex for the time of the i/o: NOTE that
@@ -4986,20 +4937,18 @@ consecutive_loop:
 
 	if (os_aio_print_debug) {
 		fprintf(stderr,
-			"InnoDB: doing i/o of type %lu at offset %lu %lu,"
-			" length %lu\n",
-			(ulong) slot->type, (ulong) slot->offset_high,
-			(ulong) slot->offset, (ulong) total_len);
+			"InnoDB: doing i/o of type %lu at offset " UINT64PF
+			", length %lu\n",
+			(ulong) slot->type, slot->offset, (ulong) total_len);
 	}
 
 	/* Do the i/o with ordinary, synchronous i/o functions: */
 	if (slot->type == OS_FILE_WRITE) {
 		ret = os_file_write(slot->name, slot->file, combined_buf,
-				    slot->offset, slot->offset_high,
-				    total_len);
+				    slot->offset, total_len);
 	} else {
 		ret = os_file_read(slot->file, combined_buf,
-				   slot->offset, slot->offset_high, total_len);
+				   slot->offset, total_len);
 	}
 
 	ut_a(ret);
@@ -5298,7 +5247,7 @@ loop:
 		" %.2f writes/s, %.2f fsyncs/s\n",
 		(os_n_file_reads - os_n_file_reads_old)
 		/ time_elapsed,
-		(ulong)avg_bytes_read,
+		(ulong) avg_bytes_read,
 		(os_n_file_writes - os_n_file_writes_old)
 		/ time_elapsed,
 		(os_n_fsyncs - os_n_fsyncs_old)
diff --git a/storage/innobase/os/os0proc.c b/storage/innobase/os/os0proc.cc
index 68321e1aaf9..ff6d65e4ae6 100644
--- a/storage/innobase/os/os0proc.c
+++ b/storage/innobase/os/os0proc.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file os/os0proc.c
+@file os/os0proc.cc
 The interface to the operating system
 process control primitives
 
@@ -58,7 +58,7 @@ os_proc_get_number(void)
 #ifdef __WIN__
 	return((ulint)GetCurrentProcessId());
 #else
-	return((ulint)getpid());
+	return((ulint) getpid());
 #endif
 }
 
@@ -86,14 +86,14 @@ os_mem_alloc_large(
 	size = ut_2pow_round(*n + (os_large_page_size - 1),
 			     os_large_page_size);
 
-	shmid = shmget(IPC_PRIVATE, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W);
+	shmid = shmget(IPC_PRIVATE, (size_t) size, SHM_HUGETLB | SHM_R | SHM_W);
 	if (shmid < 0) {
 		fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to allocate"
 			" %lu bytes. errno %d\n", size, errno);
 		ptr = NULL;
 	} else {
 		ptr = shmat(shmid, NULL, 0);
-		if (ptr == (void *)-1) {
+		if (ptr == (void*)-1) {
 			fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to"
 				" attach shared memory segment, errno %d\n",
 				errno);
@@ -213,7 +213,11 @@ os_mem_free_large(
 #elif !defined OS_MAP_ANON
 	ut_free(ptr);
 #else
+# if defined(UNIV_SOLARIS)
+	if (munmap(static_cast<caddr_t>(ptr), size)) {
+# else
 	if (munmap(ptr, size)) {
+# endif /* UNIV_SOLARIS */
 		fprintf(stderr, "InnoDB: munmap(%p, %lu) failed;"
 			" errno %lu\n",
 			ptr, (ulong) size, (ulong) errno);
diff --git a/storage/innobase/os/os0sync.c b/storage/innobase/os/os0sync.cc
index 41a19843812..c2e2e7e477f 100644
--- a/storage/innobase/os/os0sync.c
+++ b/storage/innobase/os/os0sync.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file os/os0sync.c
+@file os/os0sync.cc
 The interface to the operating system
 synchronization primitives.
 
@@ -39,7 +39,7 @@ Created 9/6/1995 Heikki Tuuri
 
 /* Type definition for an operating system mutex struct */
 struct os_mutex_struct{
-	os_event_t	event;	/*!< Used by sync0arr.c for queing threads */
+	os_event_t	event;	/*!< Used by sync0arr.cc for queing threads */
 	void*		handle;	/*!< OS handle to mutex */
 	ulint		count;	/*!< we use this counter to check
 				that the same thread does not
@@ -75,6 +75,11 @@ UNIV_INTERN ulint	os_fast_mutex_count	= 0;
 /* The number of microsecnds in a second. */
 static const ulint MICROSECS_IN_A_SECOND = 1000000;
 
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t	event_os_mutex_key;
+UNIV_INTERN mysql_pfs_key_t	os_mutex_key;
+#endif
+
 /* Because a mutex is embedded inside an event and there is an
 event embedded inside a mutex, on free, this generates a recursive call.
 This version of the free event function doesn't acquire the global lock */
@@ -132,7 +137,7 @@ ibool
 os_cond_wait_timed(
 /*===============*/
 	os_cond_t*		cond,		/*!< in: condition variable. */
-	os_fast_mutex_t*	mutex,		/*!< in: fast mutex */
+	os_fast_mutex_t*	fast_mutex,	/*!< in: fast mutex */
 #ifndef __WIN__
 	const struct timespec*	abstime		/*!< in: timeout */
 #else
@@ -141,6 +146,7 @@ os_cond_wait_timed(
 #endif /* !__WIN__ */
 )
 {
+	fast_mutex_t*	mutex = &fast_mutex->mutex;
 #ifdef __WIN__
 	BOOL	ret;
 	DWORD	err;
@@ -195,8 +201,9 @@ void
 os_cond_wait(
 /*=========*/
 	os_cond_t*		cond,	/*!< in: condition variable. */
-	os_fast_mutex_t*	mutex)	/*!< in: fast mutex */
+	os_fast_mutex_t*	fast_mutex)/*!< in: fast mutex */
 {
+	fast_mutex_t*	mutex = &fast_mutex->mutex;
 	ut_a(cond);
 	ut_a(mutex);
 
@@ -368,7 +375,7 @@ os_event_create(
 #ifdef __WIN__
 	if(!srv_use_native_conditions) {
 
-		event = ut_malloc(sizeof(struct os_event_struct));
+		event = static_cast<os_event_t>(ut_malloc(sizeof(*event)));
 
 		event->handle = CreateEvent(NULL,
 					    TRUE,
@@ -382,13 +389,17 @@ os_event_create(
 		}
 	} else /* Windows with condition variables */
 #endif
-
 	{
 		UT_NOT_USED(name);
 
-		event = ut_malloc(sizeof(struct os_event_struct));
+		event = static_cast<os_event_struct_t*>(
+			ut_malloc(sizeof(struct os_event_struct)));
 
-		os_fast_mutex_init(&(event->os_mutex));
+#ifndef PFS_SKIP_EVENT_MUTEX
+		os_fast_mutex_init(event_os_mutex_key, &event->os_mutex);
+#else
+		os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &event->os_mutex);
+#endif
 
 		os_cond_init(&(event->cond_var));
 
@@ -440,8 +451,6 @@ os_event_set(
 	}
 #endif
 
-	ut_a(event);
-
 	os_fast_mutex_lock(&(event->os_mutex));
 
 	if (event->is_set) {
@@ -631,7 +640,6 @@ os_event_wait_time_low(
 	ib_int64_t	reset_sig_count)	/*!< in: zero or the value
 						returned by previous call of
 						os_event_reset(). */
-
 {
 	ibool		timed_out = FALSE;
 
@@ -741,10 +749,13 @@ os_mutex_create(void)
 	os_fast_mutex_t*	mutex;
 	os_mutex_t		mutex_str;
 
-	mutex = ut_malloc(sizeof(os_fast_mutex_t));
+	mutex = static_cast<os_fast_mutex_t*>(
+		ut_malloc(sizeof(os_fast_mutex_t)));
+
+	os_fast_mutex_init(os_mutex_key, mutex);
 
-	os_fast_mutex_init(mutex);
-	mutex_str = ut_malloc(sizeof(os_mutex_str_t));
+	mutex_str = static_cast<os_mutex_t>(
+		ut_malloc(sizeof(os_mutex_str_t)));
 
 	mutex_str->handle = mutex;
 	mutex_str->count = 0;
@@ -774,7 +785,7 @@ os_mutex_enter(
 /*===========*/
 	os_mutex_t	mutex)	/*!< in: mutex to acquire */
 {
-	os_fast_mutex_lock(mutex->handle);
+	os_fast_mutex_lock(static_cast<os_fast_mutex_t*>(mutex->handle));
 
 	(mutex->count)++;
 
@@ -794,7 +805,7 @@ os_mutex_exit(
 	ut_a(mutex->count == 1);
 
 	(mutex->count)--;
-	os_fast_mutex_unlock(mutex->handle);
+	os_fast_mutex_unlock(static_cast<os_fast_mutex_t*>(mutex->handle));
 }
 
 /**********************************************************//**
@@ -823,7 +834,7 @@ os_mutex_free(
 		os_mutex_exit(os_sync_mutex);
 	}
 
-	os_fast_mutex_free(mutex->handle);
+	os_fast_mutex_free(static_cast<os_fast_mutex_t*>(mutex->handle));
 	ut_free(mutex->handle);
 	ut_free(mutex);
 }
@@ -832,9 +843,9 @@ os_mutex_free(
 Initializes an operating system fast mutex semaphore. */
 UNIV_INTERN
 void
-os_fast_mutex_init(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex)	/*!< in: fast mutex */
+os_fast_mutex_init_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex)	/*!< in: fast mutex */
 {
 #ifdef __WIN__
 	ut_a(fast_mutex);
@@ -861,9 +872,9 @@ os_fast_mutex_init(
 Acquires ownership of a fast mutex. */
 UNIV_INTERN
 void
-os_fast_mutex_lock(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to acquire */
+os_fast_mutex_lock_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex)	/*!< in: mutex to acquire */
 {
 #ifdef __WIN__
 	EnterCriticalSection((LPCRITICAL_SECTION) fast_mutex);
@@ -876,9 +887,9 @@ os_fast_mutex_lock(
 Releases ownership of a fast mutex. */
 UNIV_INTERN
 void
-os_fast_mutex_unlock(
-/*=================*/
-	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to release */
+os_fast_mutex_unlock_func(
+/*======================*/
+	fast_mutex_t*		fast_mutex)	/*!< in: mutex to release */
 {
 #ifdef __WIN__
 	LeaveCriticalSection(fast_mutex);
@@ -891,9 +902,9 @@ os_fast_mutex_unlock(
 Frees a mutex object. */
 UNIV_INTERN
 void
-os_fast_mutex_free(
-/*===============*/
-	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to free */
+os_fast_mutex_free_func(
+/*====================*/
+	fast_mutex_t*		fast_mutex)	/*!< in: mutex to free */
 {
 #ifdef __WIN__
 	ut_a(fast_mutex);
@@ -908,7 +919,7 @@ os_fast_mutex_free(
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
 			"  InnoDB: error: return value %lu when calling\n"
-			"InnoDB: pthread_mutex_destroy().\n", (ulint)ret);
+			"InnoDB: pthread_mutex_destroy().\n", (ulint) ret);
 		fprintf(stderr,
 			"InnoDB: Byte contents of the pthread mutex at %p:\n",
 			(void*) fast_mutex);
diff --git a/storage/innobase/os/os0thread.c b/storage/innobase/os/os0thread.cc
index b19b5378fcd..48ee61e9402 100644
--- a/storage/innobase/os/os0thread.c
+++ b/storage/innobase/os/os0thread.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file os/os0thread.c
+@file os/os0thread.cc
 The interface to the operating system thread control primitives
 
 Created 9/8/1995 Heikki Tuuri
@@ -77,7 +77,7 @@ os_thread_pf(
 
 	return((ulint)(a.field1));
 #else
-	return((ulint)a);
+	return((ulint) a);
 #endif
 }
 
@@ -105,14 +105,10 @@ and returns an ulint.
 @return	handle to the thread */
 UNIV_INTERN
 os_thread_t
-os_thread_create(
-/*=============*/
-#ifndef __WIN__
-	os_posix_f_t		start_f,
-#else
-	ulint (*start_f)(void*),		/*!< in: pointer to function
+os_thread_create_func(
+/*==================*/
+	os_thread_func_t	func,		/*!< in: pointer to function
 						from which to start */
-#endif
 	void*			arg,		/*!< in: argument to start
 						function */
 	os_thread_id_t*		thread_id)	/*!< out: id of the created
@@ -128,7 +124,7 @@ os_thread_create(
 
 	thread = CreateThread(NULL,	/* no security attributes */
 			      0,	/* default size stack */
-			      (LPTHREAD_START_ROUTINE)start_f,
+			      func,
 			      arg,
 			      0,	/* thread runs immediately */
 			      &win_thread_id);
@@ -136,10 +132,8 @@ os_thread_create(
 	if (thread_id) {
 		*thread_id = win_thread_id;
 	}
-	if (thread) {
-		CloseHandle(thread);
-	}
-	return((os_thread_t)win_thread_id);
+
+	return(thread);
 #else
 	int		ret;
 	os_thread_t	pthread;
@@ -170,9 +164,9 @@ os_thread_create(
 	os_mutex_exit(os_sync_mutex);
 
 #ifdef UNIV_HPUX10
-	ret = pthread_create(&pthread, pthread_attr_default, start_f, arg);
+	ret = pthread_create(&pthread, pthread_attr_default, func, arg);
 #else
-	ret = pthread_create(&pthread, &attr, start_f, arg);
+	ret = pthread_create(&pthread, &attr, func, arg);
 #endif
 	if (ret) {
 		fprintf(stderr,
@@ -214,7 +208,7 @@ os_thread_exit(
 	os_mutex_exit(os_sync_mutex);
 
 #ifdef __WIN__
-	ExitThread((DWORD)exit_value);
+	ExitThread((DWORD) exit_value);
 #else
 	pthread_detach(pthread_self());
 	pthread_exit(exit_value);
diff --git a/storage/innobase/page/page0cur.c b/storage/innobase/page/page0cur.cc
index d49b121afab..3ae063dedec 100644
--- a/storage/innobase/page/page0cur.c
+++ b/storage/innobase/page/page0cur.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /********************************************************************//**
-@file page/page0cur.c
+@file page/page0cur.cc
 The page cursor
 
 Created 10/4/1994 Heikki Tuuri
@@ -879,7 +879,8 @@ page_cur_parse_insert_rec(
 	if (mismatch_index + end_seg_len < sizeof buf1) {
 		buf = buf1;
 	} else {
-		buf = mem_alloc(mismatch_index + end_seg_len);
+		buf = static_cast<byte*>(
+			mem_alloc(mismatch_index + end_seg_len));
 	}
 
 	/* Build the inserted record to buf */
@@ -1173,7 +1174,7 @@ page_cur_insert_rec_zip_reorg(
 	ulint		pos;
 
 	/* Recompress or reorganize and recompress the page. */
-	if (UNIV_LIKELY(page_zip_compress(page_zip, page, index, mtr))) {
+	if (page_zip_compress(page_zip, page, index, mtr)) {
 		return(rec);
 	}
 
diff --git a/storage/innobase/page/page0page.c b/storage/innobase/page/page0page.cc
index 5f0380cb55f..c9089255c39 100644
--- a/storage/innobase/page/page0page.c
+++ b/storage/innobase/page/page0page.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file page/page0page.c
+@file page/page0page.cc
 Index page routines
 
 Created 2/2/1994 Heikki Tuuri
@@ -222,7 +222,7 @@ page_set_max_trx_id(
 	during a database recovery we assume that the max trx id of every
 	page is the maximum trx id assigned before the crash. */
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
 		page_zip_write_header(page_zip,
 				      page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
@@ -635,7 +635,7 @@ page_copy_rec_list_end(
 	/* Here, "ret" may be pointing to a user record or the
 	predefined supremum record. */
 
-	if (UNIV_LIKELY_NULL(new_page_zip)) {
+	if (new_page_zip) {
 		log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
 	}
 
@@ -655,11 +655,10 @@ page_copy_rec_list_end(
 				       page_get_max_trx_id(page), mtr);
 	}
 
-	if (UNIV_LIKELY_NULL(new_page_zip)) {
+	if (new_page_zip) {
 		mtr_set_log_mode(mtr, log_mode);
 
-		if (UNIV_UNLIKELY
-		    (!page_zip_compress(new_page_zip, new_page, index, mtr))) {
+		if (!page_zip_compress(new_page_zip, new_page, index, mtr)) {
 			/* Before trying to reorganize the page,
 			store the number of preceding records on the page. */
 			ulint	ret_pos
@@ -671,14 +670,12 @@ page_copy_rec_list_end(
 			that is smaller than "ret"). */
 			ut_a(ret_pos > 0);
 
-			if (UNIV_UNLIKELY
-			    (!page_zip_reorganize(new_block, index, mtr))) {
+			if (!page_zip_reorganize(new_block, index, mtr)) {
 
 				btr_blob_dbg_remove(new_page, index,
 						    "copy_end_reorg_fail");
-				if (UNIV_UNLIKELY
-				    (!page_zip_decompress(new_page_zip,
-							  new_page, FALSE))) {
+				if (!page_zip_decompress(new_page_zip,
+							 new_page, FALSE)) {
 					ut_error;
 				}
 				ut_ad(page_validate(new_page, index));
@@ -742,7 +739,7 @@ page_copy_rec_list_start(
 		return(ret);
 	}
 
-	if (UNIV_LIKELY_NULL(new_page_zip)) {
+	if (new_page_zip) {
 		log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
 	}
 
@@ -778,7 +775,7 @@ page_copy_rec_list_start(
 				       mtr);
 	}
 
-	if (UNIV_LIKELY_NULL(new_page_zip)) {
+	if (new_page_zip) {
 		mtr_set_log_mode(mtr, log_mode);
 
 		if (UNIV_UNLIKELY
@@ -971,7 +968,7 @@ page_delete_rec_list_end(
 				       ? MLOG_COMP_LIST_END_DELETE
 				       : MLOG_LIST_END_DELETE, mtr);
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		ulint		log_mode;
 
 		ut_a(page_is_comp(page));
@@ -1563,7 +1560,7 @@ page_rec_get_n_recs_before(
 	n--;
 
 	ut_ad(n >= 0);
-	ut_ad(n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1));
+	ut_ad((ulong) n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1));
 
 	return((ulint) n);
 }
@@ -2318,7 +2315,7 @@ page_validate(
 	/* The following buffer is used to check that the
 	records in the page record heap do not overlap */
 
-	buf = mem_heap_zalloc(heap, UNIV_PAGE_SIZE);
+	buf = static_cast<byte*>(mem_heap_zalloc(heap, UNIV_PAGE_SIZE));
 
 	/* Check first that the record heap and the directory do not
 	overlap. */
@@ -2328,7 +2325,7 @@ page_validate(
 	if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP)
 			    <= page_dir_get_nth_slot(page, n_slots - 1)))) {
 
-		fprintf(stderr, 
+		fprintf(stderr,
 			"InnoDB: Record heap and dir overlap"
 			" on space %lu page %lu index %s, %p, %p\n",
 			(ulong) page_get_space_id(page),
@@ -2371,7 +2368,7 @@ page_validate(
 			if (UNIV_UNLIKELY
 			    (1 != cmp_rec_rec(rec, old_rec,
 					      offsets, old_offsets, index))) {
-				fprintf(stderr, 
+				fprintf(stderr,
 					"InnoDB: Records in wrong order"
 					" on space %lu page %lu index %s\n",
 					(ulong) page_get_space_id(page),
@@ -2542,7 +2539,7 @@ func_exit:
 
 	if (UNIV_UNLIKELY(ret == FALSE)) {
 func_exit2:
-		fprintf(stderr, 
+		fprintf(stderr,
 			"InnoDB: Apparent corruption"
 			" in space %lu page %lu index %s\n",
 			(ulong) page_get_space_id(page),
diff --git a/storage/innobase/page/page0zip.c b/storage/innobase/page/page0zip.cc
index fb618beac7e..ce75d4e15fc 100644
--- a/storage/innobase/page/page0zip.c
+++ b/storage/innobase/page/page0zip.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file page/page0zip.c
+@file page/page0zip.cc
 Compressed page interface
 
 Created June 2005 by Marko Makela
@@ -38,18 +38,23 @@ Created June 2005 by Marko Makela
 #include "log0recv.h"
 #include "zlib.h"
 #ifndef UNIV_HOTBACKUP
+# include "buf0buf.h"
 # include "buf0lru.h"
 # include "btr0sea.h"
 # include "dict0boot.h"
 # include "lock0lock.h"
+# include "srv0mon.h"
+# include "srv0srv.h"
+# include "ut0crc32.h"
 #else /* !UNIV_HOTBACKUP */
+# include "buf0checksum.h"
 # define lock_move_reorganize_page(block, temp_block)	((void) 0)
 # define buf_LRU_stat_inc_unzip()			((void) 0)
 #endif /* !UNIV_HOTBACKUP */
 
 #ifndef UNIV_HOTBACKUP
 /** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
-UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE - 1];
+UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX];
 #endif /* !UNIV_HOTBACKUP */
 
 /* Please refer to ../include/page0zip.ic for a description of the
@@ -640,7 +645,7 @@ page_zip_dir_encode(
 		}
 
 		info_bits = rec_get_info_bits(rec, TRUE);
-		if (UNIV_UNLIKELY(info_bits & REC_INFO_DELETED_FLAG)) {
+		if (info_bits & REC_INFO_DELETED_FLAG) {
 			info_bits &= ~REC_INFO_DELETED_FLAG;
 			offs |= PAGE_ZIP_DIR_SLOT_DEL;
 		}
@@ -691,6 +696,8 @@ page_zip_dir_encode(
 	ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap);
 }
 
+extern "C" {
+
 /**********************************************************************//**
 Allocate memory for zlib. */
 static
@@ -701,7 +708,7 @@ page_zip_zalloc(
 	uInt	items,	/*!< in: number of items to allocate */
 	uInt	size)	/*!< in: size of an item in bytes */
 {
-	return(mem_heap_zalloc(opaque, items * size));
+	return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size));
 }
 
 /**********************************************************************//**
@@ -715,6 +722,8 @@ page_zip_free(
 {
 }
 
+} /* extern "C" */
+
 /**********************************************************************//**
 Configure the zlib allocator to use the given memory heap. */
 UNIV_INTERN
@@ -724,7 +733,7 @@ page_zip_set_alloc(
 	void*		stream,		/*!< in/out: zlib stream */
 	mem_heap_t*	heap)		/*!< in: memory heap to use */
 {
-	z_stream*	strm = stream;
+	z_stream*	strm = static_cast<z_stream*>(stream);
 
 	strm->zalloc = page_zip_zalloc;
 	strm->zfree = page_zip_free;
@@ -1088,7 +1097,7 @@ page_zip_compress_clust(
 		/* Check if there are any externally stored columns.
 		For each externally stored column, store the
 		BTR_EXTERN_FIELD_REF separately. */
-		if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+		if (rec_offs_any_extern(offsets)) {
 			ut_ad(dict_index_is_clust(index));
 
 			err = page_zip_compress_clust_ext(
@@ -1264,6 +1273,8 @@ page_zip_compress(
 		goto err_exit;
 	}
 
+	MONITOR_INC(MONITOR_PAGE_COMPRESS);
+
 	heap = mem_heap_create(page_zip_get_size(page_zip)
 			       + n_fields * (2 + sizeof *offsets)
 			       + n_dense * ((sizeof *recs)
@@ -1271,11 +1282,14 @@ page_zip_compress(
 			       + UNIV_PAGE_SIZE * 4
 			       + (512 << MAX_MEM_LEVEL));
 
-	recs = mem_heap_zalloc(heap, n_dense * sizeof *recs);
+	recs = static_cast<const rec_t**>(
+		mem_heap_zalloc(heap, n_dense * sizeof *recs));
+
+	fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2));
 
-	fields = mem_heap_alloc(heap, (n_fields + 1) * 2);
+	buf = static_cast<byte*>(
+		mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA));
 
-	buf = mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA);
 	buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA;
 
 	/* Compress the data payload. */
@@ -1555,7 +1569,7 @@ page_zip_fields_decode(
 	}
 
 	table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n,
-				      DICT_TF_COMPACT);
+				      DICT_TF_COMPACT, 0);
 	index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY",
 				      DICT_HDR_SPACE, 0, n);
 	index->table = table;
@@ -1747,7 +1761,7 @@ page_zip_set_extra_bytes(
 	for (i = 0; i < n; i++) {
 		offs = page_zip_dir_get(page_zip, i);
 
-		if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_DEL)) {
+		if (offs & PAGE_ZIP_DIR_SLOT_DEL) {
 			info_bits |= REC_INFO_DELETED_FLAG;
 		}
 		if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) {
@@ -2629,7 +2643,7 @@ page_zip_decompress_clust(
 		For each externally stored column, restore the
 		BTR_EXTERN_FIELD_REF separately. */
 
-		if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+		if (rec_offs_any_extern(offsets)) {
 			if (UNIV_UNLIKELY
 			    (!page_zip_decompress_clust_ext(
 				    d_stream, rec, offsets, trx_id_col))) {
@@ -2894,7 +2908,9 @@ page_zip_decompress(
 	}
 
 	heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE);
-	recs = mem_heap_alloc(heap, n_dense * (2 * sizeof *recs));
+
+	recs = static_cast<rec_t**>(
+		mem_heap_alloc(heap, n_dense * (2 * sizeof *recs)));
 
 	if (all) {
 		/* Copy the page header. */
@@ -2996,7 +3012,10 @@ zlib_error:
 		/* Pre-allocate the offsets for rec_get_offsets_reverse(). */
 		ulint	n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE
 			+ dict_index_get_n_fields(index);
-		offsets = mem_heap_alloc(heap, n * sizeof(ulint));
+
+		offsets = static_cast<ulint*>(
+			mem_heap_alloc(heap, n * sizeof(ulint)));
+
 		*offsets = n;
 	}
 
@@ -3067,6 +3086,8 @@ err_exit:
 	/* Update the stat counter for LRU policy. */
 	buf_LRU_stat_inc_unzip();
 
+	MONITOR_INC(MONITOR_PAGE_DECOMPRESS);
+
 	return(TRUE);
 }
 
@@ -3081,7 +3102,7 @@ page_zip_hexdump_func(
 	const void*	buf,	/*!< in: data */
 	ulint		size)	/*!< in: length of the data, in bytes */
 {
-	const byte*	s	= buf;
+	const byte*	s	= static_cast<const byte*>(buf);
 	ulint		addr;
 	const ulint	width	= 32; /* bytes per line */
 
@@ -3147,8 +3168,8 @@ page_zip_validate_low(
 
 	/* page_zip_decompress() expects the uncompressed page to be
 	UNIV_PAGE_SIZE aligned. */
-	temp_page_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
-	temp_page = ut_align(temp_page_buf, UNIV_PAGE_SIZE);
+	temp_page_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+	temp_page = static_cast<byte*>(ut_align(temp_page_buf, UNIV_PAGE_SIZE));
 
 #ifdef UNIV_DEBUG_VALGRIND
 	/* Get detailed information on the valid bits in case the
@@ -4287,7 +4308,7 @@ page_zip_dir_add_slot(
 	if (!page_is_leaf(page_zip->data)) {
 		ut_ad(!page_zip->n_blobs);
 		stored = dir - n_dense * REC_NODE_PTR_SIZE;
-	} else if (UNIV_UNLIKELY(is_clustered)) {
+	} else if (is_clustered) {
 		/* Move the BLOB pointer array backwards to make space for the
 		roll_ptr and trx_id columns and the dense directory slot. */
 		byte*	externs;
@@ -4433,7 +4454,9 @@ page_zip_reorganize(
 	dict_index_t*	index,	/*!< in: index of the B-tree node */
 	mtr_t*		mtr)	/*!< in: mini-transaction */
 {
+#ifndef UNIV_HOTBACKUP
 	buf_pool_t*	buf_pool	= buf_pool_from_block(block);
+#endif /* !UNIV_HOTBACKUP */
 	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
 	page_t*		page		= buf_block_get_frame(block);
 	buf_block_t*	temp_block;
@@ -4487,7 +4510,7 @@ page_zip_reorganize(
 	/* Restore logging. */
 	mtr_set_log_mode(mtr, log_mode);
 
-	if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) {
+	if (!page_zip_compress(page_zip, page, index, mtr)) {
 
 #ifndef UNIV_HOTBACKUP
 		buf_block_free(temp_block);
@@ -4668,21 +4691,113 @@ ulint
 page_zip_calc_checksum(
 /*===================*/
 	const void*	data,	/*!< in: compressed page */
-	ulint		size)	/*!< in: size of compressed page */
+	ulint		size,	/*!< in: size of compressed page */
+	srv_checksum_algorithm_t algo) /*!< in: algorithm to use */
 {
+	uLong		adler;
+	ib_uint32_t	crc32;
+	const Bytef*	s = static_cast<const byte*>(data);
+
 	/* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
 	and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
 
-	const Bytef*	s	= data;
-	uLong		adler;
+	switch (algo) {
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+
+		ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+		crc32 = ut_crc32(s + FIL_PAGE_OFFSET,
+				 FIL_PAGE_LSN - FIL_PAGE_OFFSET)
+			^ ut_crc32(s + FIL_PAGE_TYPE, 2)
+			^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+				   size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+		return((ulint) crc32);
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+		ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+		adler = adler32(0L, s + FIL_PAGE_OFFSET,
+				FIL_PAGE_LSN - FIL_PAGE_OFFSET);
+		adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
+		adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+				size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+		return((ulint) adler);
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		return(BUF_NO_CHECKSUM_MAGIC);
+	/* no default so the compiler will emit a warning if new enum
+	is added and not handled here */
+	}
 
-	ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	ut_error;
+	return(0);
+}
+
+/**********************************************************************//**
+Verify a compressed page's checksum.
+@return	TRUE if the stored checksum is valid according to the value of
+innodb_checksum_algorithm */
+UNIV_INTERN
+ibool
+page_zip_verify_checksum(
+/*=====================*/
+	const void*	data,	/*!< in: compressed page */
+	ulint		size)	/*!< in: size of compressed page */
+{
+	ib_uint32_t	stored;
+	ib_uint32_t	calc;
+	ib_uint32_t	crc32 = 0 /* silence bogus warning */;
+	ib_uint32_t	innodb = 0 /* silence bogus warning */;
+
+	stored = mach_read_from_4(
+		(const unsigned char*) data + FIL_PAGE_SPACE_OR_CHKSUM);
+
+	/* declare empty pages non-corrupted */
+	if (stored == 0) {
+		/* make sure that the page is really empty */
+		ut_d(ulint i; for (i = 0; i < size; i++) {
+		     ut_a(*((const char*) data + i) == 0); });
+
+		return(TRUE);
+	}
+
+	calc = page_zip_calc_checksum(
+		data, size, static_cast<srv_checksum_algorithm_t>(
+			srv_checksum_algorithm));
+
+	if (stored == calc) {
+		return(TRUE);
+	}
 
-	adler = adler32(0L, s + FIL_PAGE_OFFSET,
-			FIL_PAGE_LSN - FIL_PAGE_OFFSET);
-	adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
-	adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
-			size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		return(stored == calc);
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+		if (stored == BUF_NO_CHECKSUM_MAGIC) {
+			return(TRUE);
+		}
+		crc32 = calc;
+		innodb = page_zip_calc_checksum(
+			data, size, SRV_CHECKSUM_ALGORITHM_INNODB);
+		break;
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+		if (stored == BUF_NO_CHECKSUM_MAGIC) {
+			return(TRUE);
+		}
+		crc32 = page_zip_calc_checksum(
+			data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
+		innodb = calc;
+		break;
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+		return(TRUE);
+	/* no default so the compiler will emit a warning if new enum
+	is added and not handled here */
+	}
 
-	return((ulint) adler);
+	return(stored == crc32 || stored == innodb);
 }
diff --git a/storage/innobase/pars/lexyy.c b/storage/innobase/pars/lexyy.cc
index 815395ea316..9de8ea51efd 100644
--- a/storage/innobase/pars/lexyy.c
+++ b/storage/innobase/pars/lexyy.cc
@@ -1,25 +1,7 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
-
 #include "univ.i"
-#line 2 "lexyy.c"
+#line 2 "lexyy.cc"
 
-#line 4 "lexyy.c"
+#line 4 "lexyy.cc"
 
 #define  YY_INT_ALIGNED short int
 
@@ -28,7 +10,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA
 #define FLEX_SCANNER
 #define YY_FLEX_MAJOR_VERSION 2
 #define YY_FLEX_MINOR_VERSION 5
-#define YY_FLEX_SUBMINOR_VERSION 31
+#define YY_FLEX_SUBMINOR_VERSION 35
 #if YY_FLEX_SUBMINOR_VERSION > 0
 #define FLEX_BETA
 #endif
@@ -50,7 +32,15 @@ Place, Suite 330, Boston, MA 02111-1307 USA
 
 /* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
 
-#if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
 #include <inttypes.h>
 typedef int8_t flex_int8_t;
 typedef uint8_t flex_uint8_t;
@@ -62,7 +52,7 @@ typedef uint32_t flex_uint32_t;
 typedef signed char flex_int8_t;
 typedef short int flex_int16_t;
 typedef int flex_int32_t;
-typedef unsigned char flex_uint8_t; 
+typedef unsigned char flex_uint8_t;
 typedef unsigned short int flex_uint16_t;
 typedef unsigned int flex_uint32_t;
 #endif /* ! C99 */
@@ -105,11 +95,12 @@ typedef unsigned int flex_uint32_t;
 
 #else	/* ! __cplusplus */
 
-#if __STDC__
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
 
 #define YY_USE_CONST
 
-#endif	/* __STDC__ */
+#endif	/* defined (__STDC__) */
 #endif	/* ! __cplusplus */
 
 #ifdef YY_USE_CONST
@@ -154,21 +145,30 @@ typedef unsigned int flex_uint32_t;
 #define YY_BUF_SIZE 16384
 #endif
 
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE   ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
 #ifndef YY_TYPEDEF_YY_BUFFER_STATE
 #define YY_TYPEDEF_YY_BUFFER_STATE
 typedef struct yy_buffer_state *YY_BUFFER_STATE;
 #endif
 
-static int yyleng;
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+extern yy_size_t yyleng;
 
-static FILE *yyin, *yyout;
+extern FILE *yyin, *yyout;
 
 #define EOB_ACT_CONTINUE_SCAN 0
 #define EOB_ACT_END_OF_FILE 1
 #define EOB_ACT_LAST_MATCH 2
 
     #define YY_LESS_LINENO(n)
-    
+
 /* Return all but the first "n" matched characters back to the input stream. */
 #define yyless(n) \
 	do \
@@ -185,16 +185,6 @@ static FILE *yyin, *yyout;
 
 #define unput(c) yyunput( c, (yytext_ptr)  )
 
-/* The following is because we cannot portably get our hands on size_t
- * (without autoconf's help, which isn't available because we want
- * flex-generated scanners to compile on their own).
- */
-
-#ifndef YY_TYPEDEF_YY_SIZE_T
-#define YY_TYPEDEF_YY_SIZE_T
-typedef unsigned int yy_size_t;
-#endif
-
 #ifndef YY_STRUCT_YY_BUFFER_STATE
 #define YY_STRUCT_YY_BUFFER_STATE
 struct yy_buffer_state
@@ -212,7 +202,7 @@ struct yy_buffer_state
 	/* Number of characters read into yy_ch_buf, not including EOB
 	 * characters.
 	 */
-	int yy_n_chars;
+	yy_size_t yy_n_chars;
 
 	/* Whether we "own" the buffer - i.e., we know we created it,
 	 * and can realloc() it to grow it, and should free() it to
@@ -235,7 +225,7 @@ struct yy_buffer_state
 
     int yy_bs_lineno; /**< The line count. */
     int yy_bs_column; /**< The column count. */
-    
+
 	/* Whether to try to fill the input buffer when we reach the
 	 * end of it.
 	 */
@@ -282,12 +272,12 @@ static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */
 
 /* yy_hold_char holds the character lost when yytext is formed. */
 static char yy_hold_char;
-static int yy_n_chars;		/* number of characters read into yy_ch_buf */
-static int yyleng;
+static yy_size_t yy_n_chars;		/* number of characters read into yy_ch_buf */
+yy_size_t yyleng;
 
 /* Points to current character in buffer. */
-static char *yy_c_buf_p = (char *) 0;
-static int yy_init = 1;		/* whether we need to initialize */
+static char *yy_c_buf_p = (char*) 0;
+static int yy_init = 0;		/* whether we need to initialize */
 static int yy_start = 0;	/* start state number */
 
 /* Flag which is used to allow yywrap()'s to do buffer switches
@@ -295,13 +285,13 @@ static int yy_start = 0;	/* start state number */
  */
 static int yy_did_buffer_switch_on_eof;
 
-static void yyrestart (FILE *input_file  );
+void yyrestart (FILE *input_file  );
 __attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer  );
 static YY_BUFFER_STATE yy_create_buffer (FILE *file,int size  );
-static void yy_delete_buffer (YY_BUFFER_STATE b  );
-static void yy_flush_buffer (YY_BUFFER_STATE b  );
-__attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buffer  );
-__attribute__((unused)) static void yypop_buffer_state (void );
+void yy_delete_buffer (YY_BUFFER_STATE b  );
+void yy_flush_buffer (YY_BUFFER_STATE b  );
+void yypush_buffer_state (YY_BUFFER_STATE new_buffer  );
+void yypop_buffer_state (void );
 
 static void yyensure_buffer_stack (void );
 static void yy_load_buffer_state (void );
@@ -311,11 +301,11 @@ static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file  );
 
 YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size  );
 YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str  );
-YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,int len  );
+YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,yy_size_t len  );
 
-static void *yyalloc (yy_size_t  );
-static void *yyrealloc (void *,yy_size_t  );
-static void yyfree (void *  );
+void *yyalloc (yy_size_t  );
+void *yyrealloc (void *,yy_size_t  );
+void yyfree (void *  );
 
 #define yy_new_buffer yy_create_buffer
 
@@ -348,15 +338,15 @@ static void yyfree (void *  );
 
 typedef unsigned char YY_CHAR;
 
-static FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0;
+FILE *yyin = (FILE*) 0, *yyout = (FILE*) 0;
 
 typedef int yy_state_type;
 
-static int yylineno;
+extern int yylineno;
 
-static int yylineno = 1;
+int yylineno = 1;
 
-static char *yytext;
+extern char *yytext;
 #define yytext_ptr yytext
 
 static yy_state_type yy_get_previous_state (void );
@@ -374,8 +364,8 @@ static void yy_fatal_error (yyconst char msg[]  );
 	*yy_cp = '\0'; \
 	(yy_c_buf_p) = yy_cp;
 
-#define YY_NUM_RULES 119
-#define YY_END_OF_BUFFER 120
+#define YY_NUM_RULES 124
+#define YY_END_OF_BUFFER 125
 /* This struct is not used in this scanner,
    but its presence is necessary. */
 struct yy_trans_info
@@ -383,52 +373,55 @@ struct yy_trans_info
 	flex_int32_t yy_verify;
 	flex_int32_t yy_nxt;
 	};
-static yyconst flex_int16_t yy_accept[399] =
+static yyconst flex_int16_t yy_accept[424] =
     {   0,
-        0,    0,  114,  114,    0,    0,    0,    0,  120,  118,
-      117,  117,    8,  118,  109,    5,   98,  104,  107,  105,
-      102,  106,  118,  108,    1,  118,  103,  101,   99,  100,
-      112,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-      110,  111,  114,  115,    6,    7,    9,   10,  117,    4,
-       93,  113,    2,    1,    3,   94,   95,   97,   96,   92,
-       92,   92,   92,   92,   92,   44,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-       92,   92,   28,   17,   25,   92,   92,   92,   92,   92,
-
-       54,   61,   92,   14,   92,   92,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-       92,   92,  114,  115,  115,  116,    6,    7,    9,   10,
-        2,   13,   45,   92,   92,   92,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-       92,   27,   92,   92,   92,   41,   92,   92,   92,   92,
-       21,   92,   92,   92,   92,   15,   92,   92,   92,   18,
-       92,   92,   92,   92,   92,   80,   92,   92,   92,   51,
-       92,   12,   92,   36,   92,   92,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   20,   24,
-
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
-       46,   92,   92,   30,   92,   87,   92,   92,   39,   92,
-       92,   92,   92,   92,   48,   92,   89,   32,   91,   92,
-       11,   64,   92,   92,   92,   42,   92,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   29,   92,   92,   92,
-       92,   92,   92,   92,   92,   92,   85,   92,   26,   92,
-       66,   92,   92,   92,   37,   92,   92,   92,   92,   92,
-       92,   92,   31,   65,   23,   92,   57,   92,   75,   92,
-       92,   92,   43,   92,   92,   92,   92,   92,   92,   92,
-       92,   90,   92,   92,   56,   92,   92,   92,   92,   92,
-
-       92,   92,   40,   33,   79,   19,   92,   83,   74,   55,
-       92,   63,   92,   52,   92,   92,   92,   47,   92,   76,
-       92,   78,   92,   92,   34,   92,   92,   92,   35,   72,
-       92,   92,   92,   92,   58,   92,   50,   49,   92,   92,
-       53,   62,   92,   92,   92,   22,   92,   92,   73,   81,
-       92,   92,   77,   92,   68,   92,   92,   92,   92,   38,
-       92,   88,   67,   92,   84,   92,   92,   92,   86,   92,
-       59,   92,   16,   92,   70,   69,   92,   92,   82,   92,
-       92,   92,   92,   92,   92,   92,   92,   92,   92,   71,
-       92,   92,   92,   92,   92,   92,   60,    0
-
+        0,    0,  119,  119,    0,    0,    0,    0,  125,  123,
+      122,  122,    8,  123,  114,    5,  103,  109,  112,  110,
+      107,  111,  123,  113,    1,  123,  108,  106,  104,  105,
+      117,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+      115,  116,  119,  120,    6,    7,    9,   10,  122,    4,
+       98,  118,    2,    1,    3,   99,  100,  102,  101,    0,
+       96,   96,   96,   96,   96,   96,   44,   96,   96,   96,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+       96,   96,   96,   28,   17,   25,   96,   96,   96,   96,
+
+       96,   96,   54,   63,   96,   14,   96,   96,   96,   96,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+       96,   96,   96,   96,  119,  120,  120,  121,    6,    7,
+        9,   10,    2,    0,   97,   13,   45,   96,   96,   96,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+       96,   96,   96,   96,   96,   96,   27,   96,   96,   96,
+       41,   96,   96,   96,   96,   21,   96,   96,   96,   96,
+       96,   15,   96,   96,   96,   18,   96,   96,   96,   96,
+       96,   82,   96,   96,   96,   51,   96,   12,   96,   36,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+
+       96,    0,   97,   96,   96,   96,   96,   20,   96,   24,
+       96,   96,   96,   96,   96,   96,   96,   96,   96,   96,
+       96,   46,   96,   96,   30,   96,   89,   96,   96,   39,
+       96,   96,   96,   96,   96,   48,   96,   94,   91,   32,
+       93,   96,   11,   66,   96,   96,   96,   42,   96,   96,
+       96,   96,   96,   96,   96,   96,   96,   96,   29,   96,
+       96,   96,   96,   96,   96,   96,   96,   96,   87,    0,
+       96,   26,   96,   96,   96,   68,   96,   96,   96,   96,
+       37,   96,   96,   96,   96,   96,   96,   96,   31,   67,
+       23,   96,   59,   96,   77,   96,   96,   96,   43,   96,
+
+       96,   96,   96,   96,   96,   96,   96,   92,   96,   96,
+       56,   96,   96,   96,   96,   96,   96,   96,   40,   33,
+        0,   81,   95,   19,   96,   96,   85,   96,   76,   55,
+       96,   65,   96,   52,   96,   96,   96,   47,   96,   78,
+       96,   80,   96,   96,   34,   96,   96,   96,   35,   74,
+       96,   96,   96,   96,   60,   96,   50,   49,   96,   96,
+       96,   57,   53,   64,   96,   96,   96,   22,   96,   96,
+       75,   83,   96,   96,   79,   96,   70,   96,   96,   96,
+       96,   96,   38,   96,   90,   69,   96,   86,   96,   96,
+       96,   88,   96,   96,   61,   96,   16,   96,   72,   71,
+
+       96,   58,   96,   84,   96,   96,   96,   96,   96,   96,
+       96,   96,   96,   96,   73,   96,   96,   96,   96,   96,
+       96,   62,    0
     } ;
 
 static yyconst flex_int32_t yy_ec[256] =
@@ -436,17 +429,17 @@ static yyconst flex_int32_t yy_ec[256] =
         1,    1,    1,    1,    1,    1,    1,    1,    2,    3,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-        1,    2,    1,    4,    1,    5,    6,    1,    7,    8,
-        9,   10,   11,   12,   13,   14,   15,   16,   16,   16,
-       16,   16,   16,   16,   16,   16,   16,   17,   18,   19,
-       20,   21,   22,    1,   23,   24,   25,   26,   27,   28,
-       29,   30,   31,   32,   33,   34,   35,   36,   37,   38,
-       39,   40,   41,   42,   43,   44,   45,   46,   47,   32,
-        1,    1,    1,    1,   48,    1,   32,   32,   32,   32,
-
-       32,   32,   32,   32,   32,   32,   32,   32,   32,   32,
-       32,   32,   32,   32,   32,   32,   32,   32,   32,   32,
-       32,   32,   49,    1,   50,    1,    1,    1,    1,    1,
+        1,    2,    1,    4,    5,    6,    7,    1,    8,    9,
+       10,   11,   12,   13,   14,   15,   16,   17,   17,   17,
+       17,   17,   17,   17,   17,   17,   17,   18,   19,   20,
+       21,   22,   23,    1,   24,   25,   26,   27,   28,   29,
+       30,   31,   32,   33,   34,   35,   36,   37,   38,   39,
+       40,   41,   42,   43,   44,   45,   46,   47,   48,   49,
+        1,    1,    1,    1,   50,    1,   33,   33,   33,   33,
+
+       33,   33,   33,   33,   33,   33,   33,   51,   33,   33,
+       33,   33,   52,   33,   53,   33,   33,   33,   33,   33,
+       33,   33,   54,    1,   55,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
@@ -463,236 +456,335 @@ static yyconst flex_int32_t yy_ec[256] =
         1,    1,    1,    1,    1
     } ;
 
-static yyconst flex_int32_t yy_meta[51] =
+static yyconst flex_int32_t yy_meta[56] =
     {   0,
-        1,    1,    1,    2,    1,    1,    3,    1,    1,    4,
-        1,    1,    1,    1,    1,    5,    1,    1,    1,    6,
-        1,    1,    5,    5,    5,    5,    5,    5,    5,    5,
-        5,    5,    5,    5,    5,    5,    5,    5,    5,    5,
-        5,    5,    5,    5,    5,    5,    5,    5,    1,    1
+        1,    1,    1,    2,    3,    1,    1,    4,    1,    1,
+        5,    1,    1,    1,    1,    6,    7,    1,    1,    1,
+        8,    1,    1,    9,    9,    9,    9,    9,    9,    9,
+        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+        9,    9,    9,    1,    1
     } ;
 
-static yyconst flex_int16_t yy_base[409] =
+static yyconst flex_int16_t yy_base[436] =
     {   0,
-        0,    0,  437,  436,  438,  437,  439,  438,  441,  448,
-       49,   51,  448,    0,  448,  448,  448,  448,  448,  448,
-      448,  448,  426,  429,   41,  418,  448,   38,  448,  417,
-      448,   20,   33,   32,   46,   40,   44,    0,   54,   52,
-      399,   48,   60,  395,   65,   67,   81,   27,  411,   75,
-      448,  448,    0,   98,    0,  426,    0,  428,  113,    0,
-      448,  448,  415,   54,  410,  448,  448,  448,  448,    0,
-      403,   68,  399,  391,  389,    0,  402,   80,   84,  397,
-      383,   96,  381,  394,  379,  393,  387,  375,  379,  375,
-      377,  377,    0,   98,    0,  376,   97,  385,  368,  375,
-
-        0,    0,  381,  381,  364,   94,  103,  379,   98,   65,
-      381,  369,  109,  361,  377,  373,  351,   97,  372,  363,
-      115,  356,    0,  137,  138,  448,    0,  388,    0,  390,
-      377,    0,    0,  365,  360,  367,  365,  348,  346,  345,
-      350,  359,  347,  359,   95,  347,  353,  354,  336,  336,
-      123,    0,  334,  350,  351,    0,  338,  347,  344,  122,
-      124,  341,  336,  330,  340,  338,  331,  328,  336,    0,
-      326,  336,  334,  325,  315,  309,  322,  307,  327,    0,
-      313,    0,  311,    0,  325,  316,  313,  131,  309,  316,
-      323,  302,  304,  309,  309,  301,  304,  299,    0,    0,
-
-      311,  295,  305,  312,  292,  291,  305,  294,  307,  287,
-        0,  297,  279,    0,  298,    0,  295,  282,    0,  281,
-      276,  281,  280,  290,    0,  276,    0,    0,    0,  280,
-        0,    0,  276,  273,  287,    0,  272,  272,  270,  286,
-      271,  283,  280,  264,  282,  277,    0,  272,  272,  258,
-      257,  270,  256,  270,  269,  268,    0,  252,    0,  246,
-        0,  265,  249,  248,    0,  262,  252,  247,  246,  258,
-      248,  247,    0,    0,    0,  251,    0,  239,    0,  253,
-      249,  235,    0,  249,  250,  233,  238,  231,  249,  231,
-      228,    0,  229,  226,    0,  231,  243,  230,  237,  227,
-
-      235,  220,    0,    0,    0,  212,  219,    0,    0,    0,
-      216,    0,  230,    0,  231,  218,  217,    0,  213,    0,
-      216,    0,  208,  210,    0,  209,  223,  216,    0,    0,
-      219,  222,  204,  219,    0,  215,    0,    0,  199,  213,
-        0,    0,  197,  196,  201,    0,  210,  195,    0,    0,
-      201,  197,    0,  192,    0,  204,  204,  192,  202,    0,
-      179,    0,    0,  199,    0,  183,  177,  183,    0,  174,
-        0,  193,    0,  192,    0,    0,  183,  187,    0,  174,
-      174,  180,  166,  189,  181,  180,  166,  151,  118,    0,
-      130,  136,  127,  123,  119,  111,    0,  448,  167,  173,
-
-      179,  152,  181,  124,  187,  193,  199,  205
+        0,    0,  849,  848,  850,  849,  852,  851,  854,  861,
+       54,   56,  861,    0,  861,  861,  861,  861,  861,  861,
+      861,  861,  838,  841,   45,  830,  861,   42,  861,  829,
+      861,   45,   49,   54,   58,   56,   72,  833,   83,   86,
+       63,   67,   90,   53,  105,  107,  106,  120,   51,  101,
+      861,  861,    0,   55,    0,  840,    0,  843,  106,    0,
+      861,  861,  829,   61,  824,  861,  861,  861,  861,  839,
+      827,   88,  124,  130,  132,  125,  826,  129,  133,  136,
+       52,  138,  148,  140,  142,  145,  149,  152,  151,  159,
+      162,  169,  165,  825,  172,  824,  173,  170,  175,  179,
+
+      176,  177,  823,  822,  180,  182,  184,  200,  201,  195,
+      189,  202,  204,  207,  205,  210,  218,  220,  213,  215,
+      223,  230,  238,  217,    0,  240,  244,  861,    0,  829,
+        0,  832,  818,  781,    0,  817,  816,  233,  237,  243,
+      248,  251,  246,  252,  255,  257,  258,  262,  264,  263,
+      265,  267,  266,  269,  273,  270,  815,  274,  275,  287,
+      814,  290,  292,  291,  293,  294,  297,  300,  304,  298,
+      307,  313,  308,  309,  317,  813,  314,  315,  323,  318,
+      324,  328,  331,  332,  333,  812,  336,  811,  338,  810,
+      340,  339,  342,  344,  343,  341,  347,  346,  348,  349,
+
+      359,  773,    0,  356,  369,  370,  360,  808,  371,  807,
+      372,  375,  376,  378,  379,  380,  382,  383,  388,  393,
+      394,  806,  396,  397,  805,  398,  804,  399,  400,  803,
+      403,  404,  408,  413,  405,  802,  415,  801,  800,  799,
+      798,  406,  797,  796,  416,  417,  420,  795,  422,  418,
+      423,  425,  424,  426,  439,  429,  437,  440,  794,  446,
+      450,  453,  454,  455,  457,  458,  459,  460,  793,  757,
+      461,  791,  463,  464,  466,  790,  467,  468,  473,  474,
+      789,  475,  476,  477,  478,  480,  485,  486,  788,  787,
+      786,  489,  785,  491,  784,  498,  493,  494,  783,  499,
+
+      504,  509,  511,  513,  516,  514,  517,  782,  520,  519,
+      781,  521,  523,  527,  525,  528,  526,  529,  780,  779,
+      780,  776,  773,  530,  533,  535,  772,  534,  771,  770,
+      541,  769,  550,  760,  543,  548,  551,  753,  552,  736,
+      554,  730,  556,  557,  723,  558,  566,  563,  693,  692,
+      569,  572,  565,  578,  691,  574,  690,  689,  567,  585,
+      588,  688,  687,  685,  571,  589,  591,  683,  592,  593,
+      681,  680,  595,  596,  679,  597,  678,  599,  604,  602,
+      605,  608,  676,  606,  675,  674,  609,  673,  607,  610,
+      614,  670,  620,  623,  668,  628,  667,  630,  665,  664,
+
+      625,  663,  629,  112,  627,  626,  631,  632,  647,  633,
+      636,  637,  644,  650,  110,  652,  659,  657,  660,  661,
+      662,   57,  861,  710,  719,  728,  731,  734,  738,  747,
+      756,  765,  774,  781,  784
     } ;
 
-static yyconst flex_int16_t yy_def[409] =
+static yyconst flex_int16_t yy_def[436] =
     {   0,
-      398,    1,  399,  399,  400,  400,  401,  401,  398,  398,
-      398,  398,  398,  402,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  403,  398,  398,  398,  398,
-      398,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      398,  398,  405,  406,  407,  398,  408,  398,  398,  402,
-      398,  398,  398,  398,  403,  398,  398,  398,  398,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  405,  406,  406,  398,  407,  398,  408,  398,
-      398,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
-      404,  404,  404,  404,  404,  404,  404,    0,  398,  398,
-
-      398,  398,  398,  398,  398,  398,  398,  398
+      423,    1,  424,  424,  425,  425,  426,  426,  423,  423,
+      423,  423,  423,  427,  423,  423,  423,  423,  423,  423,
+      423,  423,  423,  423,  423,  428,  423,  423,  423,  423,
+      423,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      423,  423,  430,  431,  432,  423,  433,  423,  423,  427,
+      423,  423,  423,  423,  428,  423,  423,  423,  423,  434,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  430,  431,  431,  423,  432,  423,
+      433,  423,  423,  423,  435,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+
+      429,  423,  435,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  423,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      423,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,  429,  429,  429,  429,  429,  429,  429,  429,
+      429,  429,    0,  423,  423,  423,  423,  423,  423,  423,
+      423,  423,  423,  423,  423
     } ;
 
-static yyconst flex_int16_t yy_nxt[499] =
+static yyconst flex_int16_t yy_nxt[917] =
     {   0,
-       10,   11,   12,   13,   14,   15,   16,   17,   18,   19,
-       20,   21,   22,   23,   24,   25,   26,   27,   28,   29,
-       30,   31,   32,   33,   34,   35,   36,   37,   38,   38,
-       39,   38,   38,   40,   41,   42,   43,   44,   38,   45,
-       46,   47,   48,   49,   50,   38,   38,   38,   51,   52,
-       59,   59,   59,   59,   63,   71,   64,   67,   68,   73,
-       72,   77,  118,   74,  119,   78,   75,   63,   79,   64,
-       88,   80,   82,   85,   81,   86,   83,   89,   96,   76,
-       90,   93,   84,   91,   99,   87,   92,  101,   97,   94,
-      100,  107,  133,  110,   95,  102,  111,  103,  179,  104,
-
-      108,  109,  105,  115,  121,  112,  180,  125,  134,  113,
-      116,  122,  126,  114,   59,   59,  139,  117,  141,  142,
-      146,  163,  140,  159,  171,  173,  143,  189,   70,  147,
-      172,  177,  183,  164,  207,  208,  148,  190,  160,  161,
-      174,  193,  178,  184,  175,  194,  398,  125,  222,  214,
-      224,  398,  126,  215,  248,  249,   60,  397,  396,  395,
-      225,  394,  393,  223,  392,  391,  250,   53,   53,   53,
-       53,   53,   53,   55,   55,   55,   55,   55,   55,   57,
-       57,   57,   57,   57,   57,   65,   65,  123,  123,  123,
-      390,  123,  123,  124,  124,  124,  124,  124,  124,  127,
-
-      127,  389,  127,  127,  127,  129,  388,  129,  129,  129,
-      129,  387,  386,  385,  384,  383,  382,  381,  380,  379,
-      378,  377,  376,  375,  374,  373,  372,  371,  370,  369,
-      368,  367,  366,  365,  364,  363,  362,  361,  360,  359,
-      358,  357,  356,  355,  354,  353,  352,  351,  350,  349,
-      348,  347,  346,  345,  344,  343,  342,  341,  340,  339,
-      338,  337,  336,  335,  334,  333,  332,  331,  330,  329,
-      328,  327,  326,  325,  324,  323,  322,  321,  320,  319,
-      318,  317,  316,  315,  314,  313,  312,  311,  310,  309,
-      308,  307,  306,  305,  304,  303,  302,  301,  300,  299,
-
-      298,  297,  296,  295,  294,  293,  292,  291,  290,  289,
-      288,  287,  286,  285,  284,  283,  282,  281,  280,  279,
-      278,  277,  276,  275,  274,  273,  272,  271,  270,  269,
-      268,  267,  266,  265,  264,  263,  262,  261,  260,  259,
-      258,  257,  256,  255,  254,  253,  252,  251,  247,  246,
-      245,  244,  243,  242,  241,  240,  239,  238,  237,  236,
-      235,  234,  233,  232,  231,  230,  229,  228,  227,  226,
-      221,  220,  219,  218,  217,  216,  213,  212,  211,  210,
-      209,  206,  205,  204,  203,  202,  201,  200,  199,  198,
-      197,  196,  131,  130,  128,  195,  192,  191,  188,  187,
-
-      186,  185,  182,  181,  176,  170,  169,  168,  167,  166,
-      165,  162,  158,  157,  156,  155,  154,  153,  152,  151,
-      150,  149,  145,  144,  138,  137,  136,  135,  132,  398,
-      131,  130,  128,  120,  106,   98,   69,   66,   62,   61,
-      398,   58,   58,   56,   56,   54,   54,    9,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398
-
+       10,   11,   12,   13,   10,   14,   15,   16,   17,   18,
+       19,   20,   21,   22,   23,   24,   25,   26,   27,   28,
+       29,   30,   31,   32,   33,   34,   35,   36,   37,   38,
+       38,   39,   38,   38,   40,   41,   42,   43,   44,   38,
+       45,   46,   47,   48,   49,   50,   38,   38,   38,   38,
+       38,   38,   38,   51,   52,   59,   59,   59,   59,   63,
+       70,   64,   67,   68,   70,  127,   70,   70,   70,   70,
+      128,   70,   70,   70,  122,   63,   74,   64,   70,  149,
+       75,   72,   70,   76,   78,   83,   73,   70,   79,   84,
+       86,   80,   87,  108,   81,   85,   77,   82,   70,   89,
+
+      100,   70,   88,   70,  101,   70,   90,   59,   59,   91,
+      102,   94,   92,   97,  136,   93,   70,   98,  103,   95,
+       70,   70,   70,   99,   96,   70,  104,   70,  105,  117,
+      106,  123,  109,  107,  112,   70,  118,  113,  124,   70,
+       70,  110,  111,  119,   70,   70,  114,   70,   70,  137,
+      115,   70,  143,   70,  116,   70,  120,   70,  121,  139,
+       70,  140,  142,   70,   70,  138,   70,   70,  141,  155,
+      144,  146,  147,  151,   70,  157,  145,   70,  150,  148,
+       70,  154,  152,  158,   70,   70,  156,   70,   70,  153,
+       70,   70,   70,  159,   70,   70,  160,   70,  164,   70,
+
+      169,  163,  161,  168,   70,  171,  162,  174,  175,  167,
+       70,  173,  170,  165,  166,   70,   70,   70,  172,   70,
+       70,  182,   70,  183,  179,   70,  176,  187,   70,  189,
+       70,  177,   70,   70,  184,   70,  185,  178,   70,  180,
+      190,  188,  192,  181,  186,   70,  195,  193,   70,  197,
+      423,  191,   70,   70,  127,  423,  196,  201,   70,  128,
+      204,   70,  194,   70,  198,  199,   70,   70,  205,  200,
+       70,  207,   70,   70,  206,  208,  209,   70,   70,   70,
+       70,   70,   70,  215,   70,   70,  210,  217,   70,   70,
+       70,  222,  213,  211,  221,  214,  212,  225,  216,  220,
+
+      228,  226,   70,  218,  219,   70,   70,   70,   70,   70,
+      229,  223,   70,   70,  224,   70,  227,  231,  232,   70,
+      233,  235,   70,   70,   70,  230,  237,  238,   70,   70,
+       70,  236,   70,   70,  241,  234,  240,  239,   70,   70,
+      247,  242,  243,   70,  245,  244,   70,   70,   70,  248,
+      246,   70,  249,   70,   70,   70,   70,   70,   70,   70,
+      254,   70,   70,   70,   70,  252,  257,  250,  260,  261,
+      265,   70,  264,  258,   70,   70,  255,  251,  259,  256,
+      262,  253,  263,  268,   70,   70,   70,   70,  267,  266,
+       70,   70,  269,   70,   70,   70,  271,   70,   70,  276,
+
+      274,  279,  280,   70,  275,  272,  273,  278,   70,   70,
+      283,   70,   70,   70,   70,   70,  285,  277,   70,   70,
+       70,   70,  281,   70,  282,  284,  289,  287,   70,  290,
+       70,   70,   70,   70,  296,   70,  286,   70,   70,   70,
+       70,   70,  291,  298,   70,  292,  288,  301,  294,  305,
+      293,  307,   70,  295,   70,   70,  299,  297,  303,  300,
+      310,   70,  306,  302,  304,   70,  308,  311,   70,   70,
+       70,  309,   70,   70,   70,   70,   70,  312,   70,   70,
+      313,   70,   70,   70,  316,  318,  319,  320,   70,   70,
+       70,   70,   70,   70,  326,   70,  314,  315,  328,  317,
+
+       70,   70,  330,  322,   70,  323,   70,  334,   70,   70,
+      327,  324,  331,   70,   70,  325,  329,  332,  333,   70,
+      337,  335,  336,  340,   70,  339,   70,  342,   70,   70,
+      343,   70,   70,  338,   70,   70,   70,  341,   70,  347,
+       70,   70,   70,   70,   70,   70,  353,  345,   70,   70,
+       70,  344,  355,  357,  348,  346,   70,  352,   70,  349,
+      350,  351,  354,   70,  356,   70,   70,   70,  365,   70,
+      358,   70,   70,   70,  360,  361,  362,  364,   70,  359,
+       70,   70,   70,  363,   70,  366,   70,   70,  367,   70,
+      369,  373,  368,   70,  374,  376,  375,  371,  372,  370,
+
+       70,  379,  378,   70,   70,  377,   70,   70,   70,  380,
+       70,   70,   70,  383,   70,  382,  381,   70,  386,   70,
+       70,   70,   70,   70,   70,   70,  391,  385,  388,   70,
+      392,  384,  389,  387,  395,   70,  397,  390,   70,  393,
+       70,   70,   70,   70,   70,   70,   70,   70,   70,  398,
+      402,   70,   70,  394,  400,  396,  403,  399,  404,   70,
+      406,  405,   70,  413,  412,   70,  409,   70,  408,  401,
+      407,  411,   70,  414,   70,   70,   70,   70,   70,   70,
+       70,  410,   70,   70,  415,   70,  418,  417,   70,   70,
+       70,   70,  419,   70,   70,   70,   70,  420,   70,  416,
+
+       70,  421,   70,   70,   70,   70,   70,   70,   70,  422,
+       53,   53,   53,   53,   53,   53,   53,   53,   53,   55,
+       55,   55,   55,   55,   55,   55,   55,   55,   57,   57,
+       57,   57,   57,   57,   57,   57,   57,   60,   70,   60,
+       65,   65,   65,   71,   71,   70,   71,  125,  125,  125,
+      125,   70,  125,  125,  125,  125,  126,  126,  126,  126,
+      126,  126,  126,  126,  126,  129,  129,  129,   70,  129,
+      129,  129,  129,  129,  131,   70,  131,  131,  131,  131,
+      131,  131,  131,  135,   70,   70,   70,   70,   70,  135,
+      203,   70,  203,  135,   70,   70,   70,   70,   70,   70,
+
+       70,   70,   70,   70,   70,   70,   70,  321,   70,   70,
+       70,   70,   70,   70,   70,   70,   70,   70,   70,   70,
+       70,   70,   70,   70,  270,   70,   70,   70,   70,   70,
+       70,   70,   70,  202,  133,  132,  130,   70,   70,   70,
+       70,   70,   70,  134,  423,  133,  132,  130,   70,   69,
+       66,   62,   61,  423,   58,   58,   56,   56,   54,   54,
+        9,  423,  423,  423,  423,  423,  423,  423,  423,  423,
+      423,  423,  423,  423,  423,  423,  423,  423,  423,  423,
+      423,  423,  423,  423,  423,  423,  423,  423,  423,  423,
+      423,  423,  423,  423,  423,  423,  423,  423,  423,  423,
+
+      423,  423,  423,  423,  423,  423,  423,  423,  423,  423,
+      423,  423,  423,  423,  423,  423
     } ;
 
-static yyconst flex_int16_t yy_chk[499] =
+static yyconst flex_int16_t yy_chk[917] =
     {   0,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-       11,   11,   12,   12,   25,   32,   25,   28,   28,   33,
-       32,   34,   48,   33,   48,   34,   33,   64,   34,   64,
-       37,   34,   35,   36,   34,   36,   35,   37,   40,   33,
-       37,   39,   35,   37,   42,   36,   37,   43,   40,   39,
-       42,   45,   72,   46,   39,   43,   46,   43,  110,   43,
-
-       45,   45,   43,   47,   50,   46,  110,   54,   72,   46,
-       47,   50,   54,   46,   59,   59,   78,   47,   79,   79,
-       82,   97,   78,   94,  106,  107,   79,  118,  404,   82,
-      106,  109,  113,   97,  145,  145,   82,  118,   94,   94,
-      107,  121,  109,  113,  107,  121,  124,  125,  160,  151,
-      161,  124,  125,  151,  188,  188,  402,  396,  395,  394,
-      161,  393,  392,  160,  391,  389,  188,  399,  399,  399,
-      399,  399,  399,  400,  400,  400,  400,  400,  400,  401,
-      401,  401,  401,  401,  401,  403,  403,  405,  405,  405,
-      388,  405,  405,  406,  406,  406,  406,  406,  406,  407,
-
-      407,  387,  407,  407,  407,  408,  386,  408,  408,  408,
-      408,  385,  384,  383,  382,  381,  380,  378,  377,  374,
-      372,  370,  368,  367,  366,  364,  361,  359,  358,  357,
-      356,  354,  352,  351,  348,  347,  345,  344,  343,  340,
-      339,  336,  334,  333,  332,  331,  328,  327,  326,  324,
-      323,  321,  319,  317,  316,  315,  313,  311,  307,  306,
-      302,  301,  300,  299,  298,  297,  296,  294,  293,  291,
-      290,  289,  288,  287,  286,  285,  284,  282,  281,  280,
-      278,  276,  272,  271,  270,  269,  268,  267,  266,  264,
-      263,  262,  260,  258,  256,  255,  254,  253,  252,  251,
-
-      250,  249,  248,  246,  245,  244,  243,  242,  241,  240,
-      239,  238,  237,  235,  234,  233,  230,  226,  224,  223,
-      222,  221,  220,  218,  217,  215,  213,  212,  210,  209,
-      208,  207,  206,  205,  204,  203,  202,  201,  198,  197,
-      196,  195,  194,  193,  192,  191,  190,  189,  187,  186,
-      185,  183,  181,  179,  178,  177,  176,  175,  174,  173,
-      172,  171,  169,  168,  167,  166,  165,  164,  163,  162,
-      159,  158,  157,  155,  154,  153,  150,  149,  148,  147,
-      146,  144,  143,  142,  141,  140,  139,  138,  137,  136,
-      135,  134,  131,  130,  128,  122,  120,  119,  117,  116,
-
-      115,  114,  112,  111,  108,  105,  104,  103,  100,   99,
-       98,   96,   92,   91,   90,   89,   88,   87,   86,   85,
-       84,   83,   81,   80,   77,   75,   74,   73,   71,   65,
-       63,   58,   56,   49,   44,   41,   30,   26,   24,   23,
-        9,    8,    7,    6,    5,    4,    3,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
-      398,  398,  398,  398,  398,  398,  398,  398
-
+        1,    1,    1,    1,    1,   11,   11,   12,   12,   25,
+       32,   25,   28,   28,   33,   54,   49,   81,   44,   34,
+       54,   36,  422,   35,   49,   64,   33,   64,   41,   81,
+       33,   32,   42,   33,   34,   35,   32,   37,   34,   35,
+       36,   34,   36,   44,   34,   35,   33,   34,   39,   37,
+
+       41,   40,   36,   72,   42,   43,   37,   59,   59,   37,
+       42,   39,   37,   40,   72,   37,   50,   40,   43,   39,
+       45,   47,   46,   40,   39,  415,   43,  404,   43,   47,
+       43,   50,   45,   43,   46,   48,   47,   46,   50,   73,
+       76,   45,   45,   47,   78,   74,   46,   75,   79,   73,
+       46,   80,   78,   82,   46,   84,   48,   85,   48,   74,
+       86,   75,   76,   83,   87,   73,   89,   88,   75,   85,
+       79,   80,   80,   83,   90,   87,   79,   91,   82,   80,
+       93,   84,   83,   88,   92,   98,   86,   95,   97,   83,
+       99,  101,  102,   89,  100,  105,   90,  106,   95,  107,
+
+       99,   93,   91,   98,  111,  100,   92,  105,  106,   97,
+      110,  102,   99,   95,   95,  108,  109,  112,  101,  113,
+      115,  110,  114,  111,  109,  116,  107,  113,  119,  115,
+      120,  108,  124,  117,  111,  118,  112,  108,  121,  109,
+      115,  114,  117,  109,  112,  122,  120,  118,  138,  121,
+      126,  116,  139,  123,  127,  126,  120,  124,  140,  127,
+      138,  143,  119,  141,  122,  123,  142,  144,  139,  123,
+      145,  141,  146,  147,  140,  142,  142,  148,  150,  149,
+      151,  153,  152,  147,  154,  156,  143,  149,  155,  158,
+      159,  153,  146,  144,  152,  146,  145,  156,  148,  151,
+
+      159,  156,  160,  150,  150,  162,  164,  163,  165,  166,
+      160,  154,  167,  170,  155,  168,  158,  163,  164,  169,
+      165,  166,  171,  173,  174,  162,  167,  168,  172,  177,
+      178,  166,  175,  180,  171,  165,  170,  169,  179,  181,
+      178,  172,  173,  182,  175,  174,  183,  184,  185,  179,
+      177,  187,  180,  189,  192,  191,  196,  193,  195,  194,
+      185,  198,  197,  199,  200,  183,  191,  181,  194,  194,
+      197,  204,  196,  192,  201,  207,  187,  182,  193,  189,
+      194,  184,  195,  200,  205,  206,  209,  211,  199,  198,
+      212,  213,  201,  214,  215,  216,  204,  217,  218,  211,
+
+      207,  214,  215,  219,  209,  205,  206,  213,  220,  221,
+      218,  223,  224,  226,  228,  229,  220,  212,  231,  232,
+      235,  242,  216,  233,  217,  219,  226,  223,  234,  228,
+      237,  245,  246,  250,  235,  247,  221,  249,  251,  253,
+      252,  254,  229,  242,  256,  231,  224,  247,  233,  252,
+      232,  254,  257,  234,  255,  258,  245,  237,  250,  246,
+      257,  260,  253,  249,  251,  261,  255,  258,  262,  263,
+      264,  256,  265,  266,  267,  268,  271,  260,  273,  274,
+      261,  275,  277,  278,  264,  266,  267,  268,  279,  280,
+      282,  283,  284,  285,  277,  286,  262,  263,  279,  265,
+
+      287,  288,  282,  271,  292,  273,  294,  286,  297,  298,
+      278,  274,  283,  296,  300,  275,  280,  284,  285,  301,
+      292,  287,  288,  297,  302,  296,  303,  300,  304,  306,
+      301,  305,  307,  294,  310,  309,  312,  298,  313,  305,
+      315,  317,  314,  316,  318,  324,  313,  303,  325,  328,
+      326,  302,  315,  317,  306,  304,  331,  312,  335,  307,
+      309,  310,  314,  336,  316,  333,  337,  339,  335,  341,
+      318,  343,  344,  346,  325,  326,  328,  333,  348,  324,
+      353,  347,  359,  331,  351,  336,  365,  352,  337,  356,
+      341,  347,  339,  354,  348,  352,  351,  344,  346,  343,
+
+      360,  356,  354,  361,  366,  353,  367,  369,  370,  359,
+      373,  374,  376,  365,  378,  361,  360,  380,  369,  379,
+      381,  384,  389,  382,  387,  390,  378,  367,  373,  391,
+      379,  366,  374,  370,  382,  393,  387,  376,  394,  380,
+      401,  406,  405,  396,  403,  398,  407,  408,  410,  389,
+      394,  411,  412,  381,  391,  384,  396,  390,  398,  413,
+      403,  401,  409,  411,  410,  414,  407,  416,  406,  393,
+      405,  409,  418,  412,  417,  419,  420,  421,  402,  400,
+      399,  408,  397,  395,  413,  392,  417,  416,  388,  386,
+      385,  383,  418,  377,  375,  372,  371,  419,  368,  414,
+
+      364,  420,  363,  362,  358,  357,  355,  350,  349,  421,
+      424,  424,  424,  424,  424,  424,  424,  424,  424,  425,
+      425,  425,  425,  425,  425,  425,  425,  425,  426,  426,
+      426,  426,  426,  426,  426,  426,  426,  427,  345,  427,
+      428,  428,  428,  429,  429,  342,  429,  430,  430,  430,
+      430,  340,  430,  430,  430,  430,  431,  431,  431,  431,
+      431,  431,  431,  431,  431,  432,  432,  432,  338,  432,
+      432,  432,  432,  432,  433,  334,  433,  433,  433,  433,
+      433,  433,  433,  434,  332,  330,  329,  327,  323,  434,
+      435,  322,  435,  321,  320,  319,  311,  308,  299,  295,
+
+      293,  291,  290,  289,  281,  276,  272,  270,  269,  259,
+      248,  244,  243,  241,  240,  239,  238,  236,  230,  227,
+      225,  222,  210,  208,  202,  190,  188,  186,  176,  161,
+      157,  137,  136,  134,  133,  132,  130,  104,  103,   96,
+       94,   77,   71,   70,   65,   63,   58,   56,   38,   30,
+       26,   24,   23,    9,    8,    7,    6,    5,    4,    3,
+      423,  423,  423,  423,  423,  423,  423,  423,  423,  423,
+      423,  423,  423,  423,  423,  423,  423,  423,  423,  423,
+      423,  423,  423,  423,  423,  423,  423,  423,  423,  423,
+      423,  423,  423,  423,  423,  423,  423,  423,  423,  423,
+
+      423,  423,  423,  423,  423,  423,  423,  423,  423,  423,
+      423,  423,  423,  423,  423,  423
     } ;
 
 static yy_state_type yy_last_accepting_state;
 static char *yy_last_accepting_cpos;
 
-static int yy_flex_debug;
-static int yy_flex_debug = 0;
+extern int yy_flex_debug;
+int yy_flex_debug = 0;
 
 /* The intent behind this definition is that it'll catch
  * any uses of REJECT which flex missed.
@@ -701,15 +793,27 @@ static int yy_flex_debug = 0;
 #define yymore() yymore_used_but_not_detected
 #define YY_MORE_ADJ 0
 #define YY_RESTORE_YY_MORE_OFFSET
-static char *yytext;
+char *yytext;
 #line 1 "pars0lex.l"
-/**************************************************//**
-SQL parser lexical analyzer: input file for the GNU Flex lexer generator
+/*****************************************************************************
 
-(c) 1997 Innobase Oy
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
-Created 12/14/1997 Heikki Tuuri
-Published under the GPL version 2
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/******************************************************
+SQL parser lexical analyzer: input file for the GNU Flex lexer generator
 
 The InnoDB parser is frozen because MySQL takes care of SQL parsing.
 Therefore we normally keep the InnoDB parser C files as they are, and do
@@ -723,10 +827,12 @@ How to make the InnoDB parser and lexer C files:
 
 These instructions seem to work at least with bison-1.875d and flex-2.5.31 on
 Linux.
+
+Created 12/14/1997 Heikki Tuuri
 *******************************************************/
 #define YY_NO_INPUT 1
 #define YY_NO_UNISTD_H 1
-#line 38 "pars0lex.l"
+#line 53 "pars0lex.l"
 #define YYSTYPE que_node_t*
 
 #include "univ.i"
@@ -741,7 +847,8 @@ Linux.
 #define realloc(P, A)	ut_realloc(P, A)
 #define exit(A) 	ut_error
 
-#define YY_INPUT(buf, result, max_size) pars_get_lex_chars(buf, &result, max_size)
+#define YY_INPUT(buf, result, max_size) \
+	(result = pars_get_lex_chars(buf, max_size))
 
 /* String buffer for removing quotes */
 static ulint	stringbuf_len_alloc = 0; /* Allocated length */
@@ -756,7 +863,7 @@ string_append(
 	ulint		len)	/*!< in: length of the string */
 {
 	if (stringbuf == NULL) {
-		stringbuf = malloc(1);
+		stringbuf = static_cast<char*>(malloc(1));
 		stringbuf_len_alloc = 1;
 	}
 
@@ -764,7 +871,9 @@ string_append(
 		while (stringbuf_len + len > stringbuf_len_alloc) {
 			stringbuf_len_alloc <<= 1;
 		}
-		stringbuf = realloc(stringbuf, stringbuf_len_alloc);
+
+		stringbuf = static_cast<char*>(
+			realloc(stringbuf, stringbuf_len_alloc));
 	}
 
 	memcpy(stringbuf + stringbuf_len, str, len);
@@ -774,7 +883,7 @@ string_append(
 
 
 
-#line 759 "lexyy.c"
+#line 887 "lexyy.cc"
 
 #define INITIAL 0
 #define comment 1
@@ -793,6 +902,37 @@ string_append(
 #define YY_EXTRA_TYPE void *
 #endif
 
+static int yy_init_globals (void );
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+__attribute__((unused)) static int yylex_destroy (void );
+
+int yyget_debug (void );
+
+void yyset_debug (int debug_flag  );
+
+YY_EXTRA_TYPE yyget_extra (void );
+
+void yyset_extra (YY_EXTRA_TYPE user_defined  );
+
+FILE *yyget_in (void );
+
+void yyset_in  (FILE * in_str  );
+
+FILE *yyget_out (void );
+
+void yyset_out  (FILE * out_str  );
+
+yy_size_t yyget_leng (void );
+
+char *yyget_text (void );
+
+int yyget_lineno (void );
+
+void yyset_lineno (int line_number  );
+
 /* Macros after this point can all be overridden by user definitions in
  * section 1.
  */
@@ -833,7 +973,7 @@ static int input (void );
 /* This used to be an fputs(), but since the string might contain NUL's,
  * we now use fwrite().
  */
-#define ECHO (void) fwrite( yytext, yyleng, 1, yyout )
+#define ECHO fwrite( yytext, yyleng, 1, yyout )
 #endif
 
 /* Gets input and stuffs it into "buf".  number of characters read, or YY_NULL,
@@ -844,7 +984,7 @@ static int input (void );
 	if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
 		{ \
 		int c = '*'; \
-		size_t n; \
+		yy_size_t n; \
 		for ( n = 0; n < max_size && \
 			     (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
 			buf[n] = (char) c; \
@@ -898,9 +1038,9 @@ static int input (void );
 #ifndef YY_DECL
 #define YY_DECL_IS_OURS 1
 
-UNIV_INTERN int yylex (void);
+extern int yylex (void);
 
-#define YY_DECL UNIV_INTERN int yylex (void)
+#define YY_DECL int yylex (void)
 #endif /* !YY_DECL */
 
 /* Code executed at the beginning of each rule, after yytext and yyleng
@@ -925,15 +1065,15 @@ YY_DECL
 	register yy_state_type yy_current_state;
 	register char *yy_cp, *yy_bp;
 	register int yy_act;
-    
-#line 92 "pars0lex.l"
+
+#line 112 "pars0lex.l"
 
 
-#line 914 "lexyy.c"
+#line 1073 "lexyy.cc"
 
-	if ( (yy_init) )
+	if ( !(yy_init) )
 		{
-		(yy_init) = 0;
+		(yy_init) = 1;
 
 #ifdef YY_USER_INIT
 		YY_USER_INIT;
@@ -982,13 +1122,13 @@ yy_match:
 			while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
 				{
 				yy_current_state = (int) yy_def[yy_current_state];
-				if ( yy_current_state >= 399 )
+				if ( yy_current_state >= 424 )
 					yy_c = yy_meta[(unsigned int) yy_c];
 				}
 			yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
 			++yy_cp;
 			}
-		while ( yy_current_state != 398 );
+		while ( yy_current_state != 423 );
 		yy_cp = (yy_last_accepting_cpos);
 		yy_current_state = (yy_last_accepting_state);
 
@@ -1010,7 +1150,7 @@ do_action:	/* This label is used only to access EOF actions. */
 
 case 1:
 YY_RULE_SETUP
-#line 94 "pars0lex.l"
+#line 114 "pars0lex.l"
 {
 			yylval = sym_tab_add_int_lit(pars_sym_tab_global,
 								atoi(yytext));
@@ -1019,7 +1159,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 2:
 YY_RULE_SETUP
-#line 100 "pars0lex.l"
+#line 120 "pars0lex.l"
 {
 			ut_error;	/* not implemented */
 
@@ -1028,7 +1168,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 3:
 YY_RULE_SETUP
-#line 106 "pars0lex.l"
+#line 126 "pars0lex.l"
 {
 			ulint	type;
 
@@ -1040,7 +1180,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 4:
 YY_RULE_SETUP
-#line 115 "pars0lex.l"
+#line 135 "pars0lex.l"
 {
 			yylval = sym_tab_add_bound_id(pars_sym_tab_global,
 				yytext + 1);
@@ -1050,7 +1190,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 5:
 YY_RULE_SETUP
-#line 122 "pars0lex.l"
+#line 142 "pars0lex.l"
 {
 /* Quoted character string literals are handled in an explicit
 start state 'quoted'.  This state is entered and the buffer for
@@ -1064,7 +1204,7 @@ In the state 'quoted', only two actions are possible (defined below). */
 case 6:
 /* rule 6 can match eol */
 YY_RULE_SETUP
-#line 131 "pars0lex.l"
+#line 151 "pars0lex.l"
 {
 			/* Got a sequence of characters other than "'":
 			append to string buffer */
@@ -1073,7 +1213,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 7:
 YY_RULE_SETUP
-#line 136 "pars0lex.l"
+#line 156 "pars0lex.l"
 {
 			/* Got a sequence of "'" characters:
 			append half of them to string buffer,
@@ -1100,7 +1240,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 8:
 YY_RULE_SETUP
-#line 160 "pars0lex.l"
+#line 180 "pars0lex.l"
 {
 /* Quoted identifiers are handled in an explicit start state 'id'.
 This state is entered and the buffer for the scanned string is emptied
@@ -1114,7 +1254,7 @@ In the state 'id', only two actions are possible (defined below). */
 case 9:
 /* rule 9 can match eol */
 YY_RULE_SETUP
-#line 169 "pars0lex.l"
+#line 189 "pars0lex.l"
 {
 			/* Got a sequence of characters other than '"':
 			append to string buffer */
@@ -1123,7 +1263,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 10:
 YY_RULE_SETUP
-#line 174 "pars0lex.l"
+#line 194 "pars0lex.l"
 {
 			/* Got a sequence of '"' characters:
 			append half of them to string buffer,
@@ -1151,7 +1291,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 11:
 YY_RULE_SETUP
-#line 199 "pars0lex.l"
+#line 219 "pars0lex.l"
 {
 			yylval = sym_tab_add_null_lit(pars_sym_tab_global);
 
@@ -1160,7 +1300,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 12:
 YY_RULE_SETUP
-#line 205 "pars0lex.l"
+#line 225 "pars0lex.l"
 {
 			/* Implicit cursor name */
 			yylval = sym_tab_add_str_lit(pars_sym_tab_global,
@@ -1170,645 +1310,643 @@ YY_RULE_SETUP
 	YY_BREAK
 case 13:
 YY_RULE_SETUP
-#line 212 "pars0lex.l"
+#line 232 "pars0lex.l"
 {
 			return(PARS_AND_TOKEN);
 }
 	YY_BREAK
 case 14:
 YY_RULE_SETUP
-#line 216 "pars0lex.l"
+#line 236 "pars0lex.l"
 {
 			return(PARS_OR_TOKEN);
 }
 	YY_BREAK
 case 15:
 YY_RULE_SETUP
-#line 220 "pars0lex.l"
+#line 240 "pars0lex.l"
 {
 			return(PARS_NOT_TOKEN);
 }
 	YY_BREAK
 case 16:
 YY_RULE_SETUP
-#line 224 "pars0lex.l"
+#line 244 "pars0lex.l"
 {
 			return(PARS_PROCEDURE_TOKEN);
 }
 	YY_BREAK
 case 17:
 YY_RULE_SETUP
-#line 228 "pars0lex.l"
+#line 248 "pars0lex.l"
 {
 			return(PARS_IN_TOKEN);
 }
 	YY_BREAK
 case 18:
 YY_RULE_SETUP
-#line 232 "pars0lex.l"
+#line 252 "pars0lex.l"
 {
 			return(PARS_OUT_TOKEN);
 }
 	YY_BREAK
 case 19:
 YY_RULE_SETUP
-#line 236 "pars0lex.l"
+#line 256 "pars0lex.l"
 {
-	 		return(PARS_BINARY_TOKEN);
+			return(PARS_BINARY_TOKEN);
 }
 	YY_BREAK
 case 20:
 YY_RULE_SETUP
-#line 240 "pars0lex.l"
+#line 260 "pars0lex.l"
 {
-	 		return(PARS_BLOB_TOKEN);
+			return(PARS_BLOB_TOKEN);
 }
 	YY_BREAK
 case 21:
 YY_RULE_SETUP
-#line 244 "pars0lex.l"
+#line 264 "pars0lex.l"
 {
-	 		return(PARS_INT_TOKEN);
+			return(PARS_INT_TOKEN);
 }
 	YY_BREAK
 case 22:
 YY_RULE_SETUP
-#line 248 "pars0lex.l"
+#line 268 "pars0lex.l"
 {
-	 		return(PARS_INT_TOKEN);
+			return(PARS_INT_TOKEN);
 }
 	YY_BREAK
 case 23:
 YY_RULE_SETUP
-#line 252 "pars0lex.l"
+#line 272 "pars0lex.l"
 {
-	 		return(PARS_FLOAT_TOKEN);
+			return(PARS_FLOAT_TOKEN);
 }
 	YY_BREAK
 case 24:
 YY_RULE_SETUP
-#line 256 "pars0lex.l"
+#line 276 "pars0lex.l"
 {
-	 		return(PARS_CHAR_TOKEN);
+			return(PARS_CHAR_TOKEN);
 }
 	YY_BREAK
 case 25:
 YY_RULE_SETUP
-#line 260 "pars0lex.l"
+#line 280 "pars0lex.l"
 {
 			return(PARS_IS_TOKEN);
 }
 	YY_BREAK
 case 26:
 YY_RULE_SETUP
-#line 264 "pars0lex.l"
+#line 284 "pars0lex.l"
 {
 			return(PARS_BEGIN_TOKEN);
 }
 	YY_BREAK
 case 27:
 YY_RULE_SETUP
-#line 268 "pars0lex.l"
+#line 288 "pars0lex.l"
 {
 			return(PARS_END_TOKEN);
 }
 	YY_BREAK
 case 28:
 YY_RULE_SETUP
-#line 272 "pars0lex.l"
+#line 292 "pars0lex.l"
 {
 			return(PARS_IF_TOKEN);
 }
 	YY_BREAK
 case 29:
 YY_RULE_SETUP
-#line 276 "pars0lex.l"
+#line 296 "pars0lex.l"
 {
 			return(PARS_THEN_TOKEN);
 }
 	YY_BREAK
 case 30:
 YY_RULE_SETUP
-#line 280 "pars0lex.l"
+#line 300 "pars0lex.l"
 {
 			return(PARS_ELSE_TOKEN);
 }
 	YY_BREAK
 case 31:
 YY_RULE_SETUP
-#line 284 "pars0lex.l"
+#line 304 "pars0lex.l"
 {
 			return(PARS_ELSIF_TOKEN);
 }
 	YY_BREAK
 case 32:
 YY_RULE_SETUP
-#line 288 "pars0lex.l"
+#line 308 "pars0lex.l"
 {
 			return(PARS_LOOP_TOKEN);
 }
 	YY_BREAK
 case 33:
 YY_RULE_SETUP
-#line 292 "pars0lex.l"
+#line 312 "pars0lex.l"
 {
 			return(PARS_WHILE_TOKEN);
 }
 	YY_BREAK
 case 34:
 YY_RULE_SETUP
-#line 296 "pars0lex.l"
+#line 316 "pars0lex.l"
 {
 			return(PARS_RETURN_TOKEN);
 }
 	YY_BREAK
 case 35:
 YY_RULE_SETUP
-#line 300 "pars0lex.l"
+#line 320 "pars0lex.l"
 {
 			return(PARS_SELECT_TOKEN);
 }
 	YY_BREAK
 case 36:
 YY_RULE_SETUP
-#line 304 "pars0lex.l"
+#line 324 "pars0lex.l"
 {
 			return(PARS_SUM_TOKEN);
 }
 	YY_BREAK
 case 37:
 YY_RULE_SETUP
-#line 308 "pars0lex.l"
+#line 328 "pars0lex.l"
 {
 			return(PARS_COUNT_TOKEN);
 }
 	YY_BREAK
 case 38:
 YY_RULE_SETUP
-#line 312 "pars0lex.l"
+#line 332 "pars0lex.l"
 {
 			return(PARS_DISTINCT_TOKEN);
 }
 	YY_BREAK
 case 39:
 YY_RULE_SETUP
-#line 316 "pars0lex.l"
+#line 336 "pars0lex.l"
 {
 			return(PARS_FROM_TOKEN);
 }
 	YY_BREAK
 case 40:
 YY_RULE_SETUP
-#line 320 "pars0lex.l"
+#line 340 "pars0lex.l"
 {
 			return(PARS_WHERE_TOKEN);
 }
 	YY_BREAK
 case 41:
 YY_RULE_SETUP
-#line 324 "pars0lex.l"
+#line 344 "pars0lex.l"
 {
 			return(PARS_FOR_TOKEN);
 }
 	YY_BREAK
 case 42:
 YY_RULE_SETUP
-#line 328 "pars0lex.l"
+#line 348 "pars0lex.l"
 {
 			return(PARS_READ_TOKEN);
 }
 	YY_BREAK
 case 43:
 YY_RULE_SETUP
-#line 332 "pars0lex.l"
+#line 352 "pars0lex.l"
 {
 			return(PARS_ORDER_TOKEN);
 }
 	YY_BREAK
 case 44:
 YY_RULE_SETUP
-#line 336 "pars0lex.l"
+#line 356 "pars0lex.l"
 {
 			return(PARS_BY_TOKEN);
 }
 	YY_BREAK
 case 45:
 YY_RULE_SETUP
-#line 340 "pars0lex.l"
+#line 360 "pars0lex.l"
 {
 			return(PARS_ASC_TOKEN);
 }
 	YY_BREAK
 case 46:
 YY_RULE_SETUP
-#line 344 "pars0lex.l"
+#line 364 "pars0lex.l"
 {
 			return(PARS_DESC_TOKEN);
 }
 	YY_BREAK
 case 47:
 YY_RULE_SETUP
-#line 348 "pars0lex.l"
+#line 368 "pars0lex.l"
 {
 			return(PARS_INSERT_TOKEN);
 }
 	YY_BREAK
 case 48:
 YY_RULE_SETUP
-#line 352 "pars0lex.l"
+#line 372 "pars0lex.l"
 {
 			return(PARS_INTO_TOKEN);
 }
 	YY_BREAK
 case 49:
 YY_RULE_SETUP
-#line 356 "pars0lex.l"
+#line 376 "pars0lex.l"
 {
 			return(PARS_VALUES_TOKEN);
 }
 	YY_BREAK
 case 50:
 YY_RULE_SETUP
-#line 360 "pars0lex.l"
+#line 380 "pars0lex.l"
 {
 			return(PARS_UPDATE_TOKEN);
 }
 	YY_BREAK
 case 51:
 YY_RULE_SETUP
-#line 364 "pars0lex.l"
+#line 384 "pars0lex.l"
 {
 			return(PARS_SET_TOKEN);
 }
 	YY_BREAK
 case 52:
 YY_RULE_SETUP
-#line 368 "pars0lex.l"
+#line 388 "pars0lex.l"
 {
 			return(PARS_DELETE_TOKEN);
 }
 	YY_BREAK
 case 53:
 YY_RULE_SETUP
-#line 372 "pars0lex.l"
+#line 392 "pars0lex.l"
 {
 			return(PARS_CURRENT_TOKEN);
 }
 	YY_BREAK
 case 54:
 YY_RULE_SETUP
-#line 376 "pars0lex.l"
+#line 396 "pars0lex.l"
 {
 			return(PARS_OF_TOKEN);
 }
 	YY_BREAK
 case 55:
 YY_RULE_SETUP
-#line 380 "pars0lex.l"
+#line 400 "pars0lex.l"
 {
 			return(PARS_CREATE_TOKEN);
 }
 	YY_BREAK
 case 56:
 YY_RULE_SETUP
-#line 384 "pars0lex.l"
+#line 404 "pars0lex.l"
 {
 			return(PARS_TABLE_TOKEN);
 }
 	YY_BREAK
 case 57:
 YY_RULE_SETUP
-#line 388 "pars0lex.l"
+#line 408 "pars0lex.l"
 {
-	 		return(PARS_INDEX_TOKEN);
+			return(PARS_COMPACT_TOKEN);
 }
 	YY_BREAK
 case 58:
 YY_RULE_SETUP
-#line 392 "pars0lex.l"
+#line 412 "pars0lex.l"
 {
-	 		return(PARS_UNIQUE_TOKEN);
+			return(PARS_BLOCK_SIZE_TOKEN);
 }
 	YY_BREAK
 case 59:
 YY_RULE_SETUP
-#line 396 "pars0lex.l"
+#line 416 "pars0lex.l"
 {
-	 		return(PARS_CLUSTERED_TOKEN);
+			return(PARS_INDEX_TOKEN);
 }
 	YY_BREAK
 case 60:
 YY_RULE_SETUP
-#line 400 "pars0lex.l"
+#line 420 "pars0lex.l"
 {
-			return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN);
+			return(PARS_UNIQUE_TOKEN);
 }
 	YY_BREAK
 case 61:
 YY_RULE_SETUP
-#line 404 "pars0lex.l"
+#line 424 "pars0lex.l"
 {
-	 		return(PARS_ON_TOKEN);
+			return(PARS_CLUSTERED_TOKEN);
 }
 	YY_BREAK
 case 62:
 YY_RULE_SETUP
-#line 408 "pars0lex.l"
+#line 428 "pars0lex.l"
 {
-			return(PARS_DECLARE_TOKEN);
+			return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN);
 }
 	YY_BREAK
 case 63:
 YY_RULE_SETUP
-#line 412 "pars0lex.l"
+#line 432 "pars0lex.l"
 {
-			return(PARS_CURSOR_TOKEN);
+			return(PARS_ON_TOKEN);
 }
 	YY_BREAK
 case 64:
 YY_RULE_SETUP
-#line 416 "pars0lex.l"
+#line 436 "pars0lex.l"
 {
-			return(PARS_OPEN_TOKEN);
+			return(PARS_DECLARE_TOKEN);
 }
 	YY_BREAK
 case 65:
 YY_RULE_SETUP
-#line 420 "pars0lex.l"
+#line 440 "pars0lex.l"
 {
-			return(PARS_FETCH_TOKEN);
+			return(PARS_CURSOR_TOKEN);
 }
 	YY_BREAK
 case 66:
 YY_RULE_SETUP
-#line 424 "pars0lex.l"
+#line 444 "pars0lex.l"
 {
-			return(PARS_CLOSE_TOKEN);
+			return(PARS_OPEN_TOKEN);
 }
 	YY_BREAK
 case 67:
 YY_RULE_SETUP
-#line 428 "pars0lex.l"
+#line 448 "pars0lex.l"
 {
-			return(PARS_NOTFOUND_TOKEN);
+			return(PARS_FETCH_TOKEN);
 }
 	YY_BREAK
 case 68:
 YY_RULE_SETUP
-#line 432 "pars0lex.l"
+#line 452 "pars0lex.l"
 {
-			return(PARS_TO_CHAR_TOKEN);
+			return(PARS_CLOSE_TOKEN);
 }
 	YY_BREAK
 case 69:
 YY_RULE_SETUP
-#line 436 "pars0lex.l"
+#line 456 "pars0lex.l"
 {
-			return(PARS_TO_NUMBER_TOKEN);
+			return(PARS_NOTFOUND_TOKEN);
 }
 	YY_BREAK
 case 70:
 YY_RULE_SETUP
-#line 440 "pars0lex.l"
+#line 460 "pars0lex.l"
 {
-			return(PARS_TO_BINARY_TOKEN);
+			return(PARS_TO_CHAR_TOKEN);
 }
 	YY_BREAK
 case 71:
 YY_RULE_SETUP
-#line 444 "pars0lex.l"
+#line 464 "pars0lex.l"
 {
-			return(PARS_BINARY_TO_NUMBER_TOKEN);
+			return(PARS_TO_NUMBER_TOKEN);
 }
 	YY_BREAK
 case 72:
 YY_RULE_SETUP
-#line 448 "pars0lex.l"
+#line 468 "pars0lex.l"
 {
-			return(PARS_SUBSTR_TOKEN);
+			return(PARS_TO_BINARY_TOKEN);
 }
 	YY_BREAK
 case 73:
 YY_RULE_SETUP
-#line 452 "pars0lex.l"
+#line 472 "pars0lex.l"
 {
-			return(PARS_REPLSTR_TOKEN);
+			return(PARS_BINARY_TO_NUMBER_TOKEN);
 }
 	YY_BREAK
 case 74:
 YY_RULE_SETUP
-#line 456 "pars0lex.l"
+#line 476 "pars0lex.l"
 {
-			return(PARS_CONCAT_TOKEN);
+			return(PARS_SUBSTR_TOKEN);
 }
 	YY_BREAK
 case 75:
 YY_RULE_SETUP
-#line 460 "pars0lex.l"
+#line 480 "pars0lex.l"
 {
-			return(PARS_INSTR_TOKEN);
+			return(PARS_REPLSTR_TOKEN);
 }
 	YY_BREAK
 case 76:
 YY_RULE_SETUP
-#line 464 "pars0lex.l"
+#line 484 "pars0lex.l"
 {
-			return(PARS_LENGTH_TOKEN);
+			return(PARS_CONCAT_TOKEN);
 }
 	YY_BREAK
 case 77:
 YY_RULE_SETUP
-#line 468 "pars0lex.l"
+#line 488 "pars0lex.l"
 {
-			return(PARS_SYSDATE_TOKEN);
+			return(PARS_INSTR_TOKEN);
 }
 	YY_BREAK
 case 78:
 YY_RULE_SETUP
-#line 472 "pars0lex.l"
+#line 492 "pars0lex.l"
 {
-			return(PARS_PRINTF_TOKEN);
+			return(PARS_LENGTH_TOKEN);
 }
 	YY_BREAK
 case 79:
 YY_RULE_SETUP
-#line 476 "pars0lex.l"
+#line 496 "pars0lex.l"
 {
-			return(PARS_ASSERT_TOKEN);
+			return(PARS_SYSDATE_TOKEN);
 }
 	YY_BREAK
 case 80:
 YY_RULE_SETUP
-#line 480 "pars0lex.l"
+#line 500 "pars0lex.l"
 {
-			return(PARS_RND_TOKEN);
+			return(PARS_PRINTF_TOKEN);
 }
 	YY_BREAK
 case 81:
 YY_RULE_SETUP
-#line 484 "pars0lex.l"
+#line 504 "pars0lex.l"
 {
-			return(PARS_RND_STR_TOKEN);
+			return(PARS_ASSERT_TOKEN);
 }
 	YY_BREAK
 case 82:
 YY_RULE_SETUP
-#line 488 "pars0lex.l"
+#line 508 "pars0lex.l"
 {
-			return(PARS_ROW_PRINTF_TOKEN);
+			return(PARS_RND_TOKEN);
 }
 	YY_BREAK
 case 83:
 YY_RULE_SETUP
-#line 492 "pars0lex.l"
+#line 512 "pars0lex.l"
 {
-			return(PARS_COMMIT_TOKEN);
+			return(PARS_RND_STR_TOKEN);
 }
 	YY_BREAK
 case 84:
 YY_RULE_SETUP
-#line 496 "pars0lex.l"
+#line 516 "pars0lex.l"
 {
-			return(PARS_ROLLBACK_TOKEN);
+			return(PARS_ROW_PRINTF_TOKEN);
 }
 	YY_BREAK
 case 85:
 YY_RULE_SETUP
-#line 500 "pars0lex.l"
+#line 520 "pars0lex.l"
 {
-			return(PARS_WORK_TOKEN);
+			return(PARS_COMMIT_TOKEN);
 }
 	YY_BREAK
 case 86:
 YY_RULE_SETUP
-#line 504 "pars0lex.l"
+#line 524 "pars0lex.l"
 {
-			return(PARS_UNSIGNED_TOKEN);
+			return(PARS_ROLLBACK_TOKEN);
 }
 	YY_BREAK
 case 87:
 YY_RULE_SETUP
-#line 508 "pars0lex.l"
+#line 528 "pars0lex.l"
 {
-			return(PARS_EXIT_TOKEN);
+			return(PARS_WORK_TOKEN);
 }
 	YY_BREAK
 case 88:
 YY_RULE_SETUP
-#line 512 "pars0lex.l"
+#line 532 "pars0lex.l"
 {
-			return(PARS_FUNCTION_TOKEN);
+			return(PARS_UNSIGNED_TOKEN);
 }
 	YY_BREAK
 case 89:
 YY_RULE_SETUP
-#line 516 "pars0lex.l"
+#line 536 "pars0lex.l"
 {
-			return(PARS_LOCK_TOKEN);
+			return(PARS_EXIT_TOKEN);
 }
 	YY_BREAK
 case 90:
 YY_RULE_SETUP
-#line 520 "pars0lex.l"
+#line 540 "pars0lex.l"
 {
-			return(PARS_SHARE_TOKEN);
+			return(PARS_FUNCTION_TOKEN);
 }
 	YY_BREAK
 case 91:
 YY_RULE_SETUP
-#line 524 "pars0lex.l"
+#line 544 "pars0lex.l"
 {
-			return(PARS_MODE_TOKEN);
+			return(PARS_LOCK_TOKEN);
 }
 	YY_BREAK
 case 92:
 YY_RULE_SETUP
-#line 528 "pars0lex.l"
+#line 548 "pars0lex.l"
 {
-			yylval = sym_tab_add_id(pars_sym_tab_global,
-							(byte*)yytext,
-							ut_strlen(yytext));
-			return(PARS_ID_TOKEN);
+			return(PARS_SHARE_TOKEN);
 }
 	YY_BREAK
 case 93:
 YY_RULE_SETUP
-#line 535 "pars0lex.l"
+#line 552 "pars0lex.l"
 {
-			return(PARS_DDOT_TOKEN);
+			return(PARS_MODE_TOKEN);
 }
 	YY_BREAK
 case 94:
 YY_RULE_SETUP
-#line 539 "pars0lex.l"
+#line 556 "pars0lex.l"
 {
-			return(PARS_ASSIGN_TOKEN);
+                        return(PARS_LIKE_TOKEN);
 }
 	YY_BREAK
 case 95:
 YY_RULE_SETUP
-#line 543 "pars0lex.l"
+#line 560 "pars0lex.l"
 {
-			return(PARS_LE_TOKEN);
+			return(PARS_BIGINT_TOKEN);
 }
 	YY_BREAK
 case 96:
 YY_RULE_SETUP
-#line 547 "pars0lex.l"
+#line 564 "pars0lex.l"
 {
-			return(PARS_GE_TOKEN);
+			yylval = sym_tab_add_id(pars_sym_tab_global,
+							(byte*) yytext,
+							ut_strlen(yytext));
+			return(PARS_ID_TOKEN);
 }
 	YY_BREAK
 case 97:
 YY_RULE_SETUP
-#line 551 "pars0lex.l"
+#line 571 "pars0lex.l"
 {
-			return(PARS_NE_TOKEN);
+			yylval = sym_tab_add_id(pars_sym_tab_global,
+							(byte*) yytext,
+							ut_strlen(yytext));
+			return(PARS_TABLE_NAME_TOKEN);
 }
 	YY_BREAK
 case 98:
 YY_RULE_SETUP
-#line 555 "pars0lex.l"
+#line 578 "pars0lex.l"
 {
-
-			return((int)(*yytext));
+			return(PARS_DDOT_TOKEN);
 }
 	YY_BREAK
 case 99:
 YY_RULE_SETUP
-#line 560 "pars0lex.l"
+#line 582 "pars0lex.l"
 {
-
-			return((int)(*yytext));
+			return(PARS_ASSIGN_TOKEN);
 }
 	YY_BREAK
 case 100:
 YY_RULE_SETUP
-#line 565 "pars0lex.l"
+#line 586 "pars0lex.l"
 {
-
-			return((int)(*yytext));
+			return(PARS_LE_TOKEN);
 }
 	YY_BREAK
 case 101:
 YY_RULE_SETUP
-#line 570 "pars0lex.l"
+#line 590 "pars0lex.l"
 {
-
-			return((int)(*yytext));
+			return(PARS_GE_TOKEN);
 }
 	YY_BREAK
 case 102:
 YY_RULE_SETUP
-#line 575 "pars0lex.l"
+#line 594 "pars0lex.l"
 {
-
-			return((int)(*yytext));
+			return(PARS_NE_TOKEN);
 }
 	YY_BREAK
 case 103:
 YY_RULE_SETUP
-#line 580 "pars0lex.l"
+#line 598 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1816,7 +1954,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 104:
 YY_RULE_SETUP
-#line 585 "pars0lex.l"
+#line 603 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1824,7 +1962,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 105:
 YY_RULE_SETUP
-#line 590 "pars0lex.l"
+#line 608 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1832,7 +1970,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 106:
 YY_RULE_SETUP
-#line 595 "pars0lex.l"
+#line 613 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1840,7 +1978,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 107:
 YY_RULE_SETUP
-#line 600 "pars0lex.l"
+#line 618 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1848,7 +1986,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 108:
 YY_RULE_SETUP
-#line 605 "pars0lex.l"
+#line 623 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1856,7 +1994,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 109:
 YY_RULE_SETUP
-#line 610 "pars0lex.l"
+#line 628 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1864,7 +2002,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 110:
 YY_RULE_SETUP
-#line 615 "pars0lex.l"
+#line 633 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1872,7 +2010,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 111:
 YY_RULE_SETUP
-#line 620 "pars0lex.l"
+#line 638 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1880,7 +2018,7 @@ YY_RULE_SETUP
 	YY_BREAK
 case 112:
 YY_RULE_SETUP
-#line 625 "pars0lex.l"
+#line 643 "pars0lex.l"
 {
 
 			return((int)(*yytext));
@@ -1888,35 +2026,75 @@ YY_RULE_SETUP
 	YY_BREAK
 case 113:
 YY_RULE_SETUP
-#line 630 "pars0lex.l"
-BEGIN(comment); /* eat up comment */
+#line 648 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
 	YY_BREAK
 case 114:
-/* rule 114 can match eol */
 YY_RULE_SETUP
-#line 632 "pars0lex.l"
+#line 653 "pars0lex.l"
+{
 
+			return((int)(*yytext));
+}
 	YY_BREAK
 case 115:
-/* rule 115 can match eol */
 YY_RULE_SETUP
-#line 633 "pars0lex.l"
+#line 658 "pars0lex.l"
+{
 
+			return((int)(*yytext));
+}
 	YY_BREAK
 case 116:
 YY_RULE_SETUP
-#line 634 "pars0lex.l"
-BEGIN(INITIAL);
+#line 663 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
 	YY_BREAK
 case 117:
-/* rule 117 can match eol */
 YY_RULE_SETUP
-#line 636 "pars0lex.l"
-/* eat up whitespace */
+#line 668 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
 	YY_BREAK
 case 118:
 YY_RULE_SETUP
-#line 639 "pars0lex.l"
+#line 673 "pars0lex.l"
+BEGIN(comment); /* eat up comment */
+	YY_BREAK
+case 119:
+/* rule 119 can match eol */
+YY_RULE_SETUP
+#line 675 "pars0lex.l"
+
+	YY_BREAK
+case 120:
+/* rule 120 can match eol */
+YY_RULE_SETUP
+#line 676 "pars0lex.l"
+
+	YY_BREAK
+case 121:
+YY_RULE_SETUP
+#line 677 "pars0lex.l"
+BEGIN(INITIAL);
+	YY_BREAK
+case 122:
+/* rule 122 can match eol */
+YY_RULE_SETUP
+#line 679 "pars0lex.l"
+/* eat up whitespace */
+	YY_BREAK
+case 123:
+YY_RULE_SETUP
+#line 682 "pars0lex.l"
 {
 			fprintf(stderr,"Unrecognized character: %02x\n",
 				*yytext);
@@ -1926,12 +2104,12 @@ YY_RULE_SETUP
 			return(0);
 }
 	YY_BREAK
-case 119:
+case 124:
 YY_RULE_SETUP
-#line 648 "pars0lex.l"
+#line 691 "pars0lex.l"
 YY_FATAL_ERROR( "flex scanner jammed" );
 	YY_BREAK
-#line 1916 "lexyy.c"
+#line 2113 "lexyy.cc"
 case YY_STATE_EOF(INITIAL):
 case YY_STATE_EOF(comment):
 case YY_STATE_EOF(quoted):
@@ -2121,7 +2299,7 @@ static int yy_get_next_buffer (void)
 
 	else
 		{
-			size_t num_to_read =
+			yy_size_t num_to_read =
 			YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
 
 		while ( num_to_read <= 0 )
@@ -2135,16 +2313,16 @@ static int yy_get_next_buffer (void)
 
 			if ( b->yy_is_our_buffer )
 				{
-				int new_size = b->yy_buf_size * 2;
+				yy_size_t new_size = b->yy_buf_size * 2;
 
 				if ( new_size <= 0 )
 					b->yy_buf_size += b->yy_buf_size / 8;
 				else
 					b->yy_buf_size *= 2;
 
-				b->yy_ch_buf = (char *)
+				b->yy_ch_buf = (char*)
 					/* Include room in for 2 EOB chars. */
-					yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2  );
+					yyrealloc((void*) b->yy_ch_buf,b->yy_buf_size + 2  );
 				}
 			else
 				/* Can't grow it, we don't own it. */
@@ -2190,6 +2368,14 @@ static int yy_get_next_buffer (void)
 	else
 		ret_val = EOB_ACT_CONTINUE_SCAN;
 
+	if ((yy_size_t) ((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+		/* Extend the array by 50%, plus the number we really need. */
+		yy_size_t new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1);
+		YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char*) yyrealloc((void*) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size  );
+		if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+			YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+	}
+
 	(yy_n_chars) += number_to_move;
 	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR;
 	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR;
@@ -2205,7 +2391,7 @@ static int yy_get_next_buffer (void)
 {
 	register yy_state_type yy_current_state;
 	register char *yy_cp;
-    
+
 	yy_current_state = (yy_start);
 
 	for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp )
@@ -2219,7 +2405,7 @@ static int yy_get_next_buffer (void)
 		while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
 			{
 			yy_current_state = (int) yy_def[yy_current_state];
-			if ( yy_current_state >= 399 )
+			if ( yy_current_state >= 424 )
 				yy_c = yy_meta[(unsigned int) yy_c];
 			}
 		yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
@@ -2247,11 +2433,11 @@ static int yy_get_next_buffer (void)
 	while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
 		{
 		yy_current_state = (int) yy_def[yy_current_state];
-		if ( yy_current_state >= 399 )
+		if ( yy_current_state >= 424 )
 			yy_c = yy_meta[(unsigned int) yy_c];
 		}
 	yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
-	yy_is_jam = (yy_current_state == 398);
+	yy_is_jam = (yy_current_state == 423);
 
 	return yy_is_jam ? 0 : yy_current_state;
 }
@@ -2265,7 +2451,7 @@ static int yy_get_next_buffer (void)
 
 {
 	int c;
-    
+
 	*(yy_c_buf_p) = (yy_hold_char);
 
 	if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR )
@@ -2280,7 +2466,7 @@ static int yy_get_next_buffer (void)
 
 		else
 			{ /* need more input */
-			int offset = (int)((yy_c_buf_p) - (yytext_ptr));
+			yy_size_t offset = (yy_c_buf_p) - (yytext_ptr);
 			++(yy_c_buf_p);
 
 			switch ( yy_get_next_buffer(  ) )
@@ -2304,7 +2490,7 @@ static int yy_get_next_buffer (void)
 				case EOB_ACT_END_OF_FILE:
 					{
 					if ( yywrap( ) )
-						return EOF;
+						return 0;
 
 					if ( ! (yy_did_buffer_switch_on_eof) )
 						YY_NEW_FILE;
@@ -2322,7 +2508,7 @@ static int yy_get_next_buffer (void)
 			}
 		}
 
-	c = *(unsigned char *) (yy_c_buf_p);	/* cast for 8-bit char's */
+	c = *(unsigned char*) (yy_c_buf_p);	/* cast for 8-bit char's */
 	*(yy_c_buf_p) = '\0';	/* preserve yytext */
 	(yy_hold_char) = *++(yy_c_buf_p);
 
@@ -2332,12 +2518,12 @@ static int yy_get_next_buffer (void)
 
 /** Immediately switch to a different input stream.
  * @param input_file A readable stream.
- * 
+ *
  * @note This function does not reset the start condition to @c INITIAL .
  */
-    static void yyrestart  (FILE * input_file )
+    void yyrestart  (FILE * input_file )
 {
-    
+
 	if ( ! YY_CURRENT_BUFFER ){
         yyensure_buffer_stack ();
 		YY_CURRENT_BUFFER_LVALUE =
@@ -2350,11 +2536,11 @@ static int yy_get_next_buffer (void)
 
 /** Switch to a different input buffer.
  * @param new_buffer The new input buffer.
- * 
+ *
  */
     __attribute__((unused)) static void yy_switch_to_buffer  (YY_BUFFER_STATE  new_buffer )
 {
-    
+
 	/* TODO. We should be able to replace this entire function body
 	 * with
 	 *		yypop_buffer_state();
@@ -2394,13 +2580,13 @@ static void yy_load_buffer_state  (void)
 /** Allocate and initialize an input buffer state.
  * @param file A readable stream.
  * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
- * 
+ *
  * @return the allocated buffer state.
  */
     static YY_BUFFER_STATE yy_create_buffer  (FILE * file, int  size )
 {
 	YY_BUFFER_STATE b;
-    
+
 	b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state )  );
 	if ( ! b )
 		YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
@@ -2410,7 +2596,7 @@ static void yy_load_buffer_state  (void)
 	/* yy_ch_buf has to be 2 characters longer than the size given because
 	 * we need to put in 2 end-of-buffer characters.
 	 */
-	b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2  );
+	b->yy_ch_buf = (char*) yyalloc(b->yy_buf_size + 2  );
 	if ( ! b->yy_ch_buf )
 		YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
 
@@ -2423,11 +2609,11 @@ static void yy_load_buffer_state  (void)
 
 /** Destroy the buffer.
  * @param b a buffer created with yy_create_buffer()
- * 
+ *
  */
-    static void yy_delete_buffer (YY_BUFFER_STATE  b )
+    void yy_delete_buffer (YY_BUFFER_STATE  b )
 {
-    
+
 	if ( ! b )
 		return;
 
@@ -2435,9 +2621,9 @@ static void yy_load_buffer_state  (void)
 		YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
 
 	if ( b->yy_is_our_buffer )
-		yyfree((void *) b->yy_ch_buf  );
+		yyfree((void*) b->yy_ch_buf  );
 
-	yyfree((void *) b  );
+	yyfree((void*) b  );
 }
 
 /* Initializes or reinitializes a buffer.
@@ -2448,7 +2634,7 @@ static void yy_load_buffer_state  (void)
 
 {
 	int oerrno = errno;
-    
+
 	yy_flush_buffer(b );
 
 	b->yy_input_file = file;
@@ -2464,15 +2650,15 @@ static void yy_load_buffer_state  (void)
     }
 
         b->yy_is_interactive = 0;
-    
+
 	errno = oerrno;
 }
 
 /** Discard all buffered characters. On the next scan, YY_INPUT will be called.
  * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
- * 
+ *
  */
-    static void yy_flush_buffer (YY_BUFFER_STATE  b )
+    void yy_flush_buffer (YY_BUFFER_STATE  b )
 {
     	if ( ! b )
 		return;
@@ -2499,9 +2685,9 @@ static void yy_load_buffer_state  (void)
  *  the current state. This function will allocate the stack
  *  if necessary.
  *  @param new_buffer The new state.
- *  
+ *
  */
-__attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buffer )
+void yypush_buffer_state (YY_BUFFER_STATE new_buffer )
 {
     	if (new_buffer == NULL)
 		return;
@@ -2529,9 +2715,9 @@ __attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buf
 
 /** Removes and deletes the top of the stack, if present.
  *  The next element becomes the new top.
- *  
+ *
  */
-__attribute__((unused)) static void yypop_buffer_state (void)
+void yypop_buffer_state (void)
 {
     	if (!YY_CURRENT_BUFFER)
 		return;
@@ -2552,8 +2738,8 @@ __attribute__((unused)) static void yypop_buffer_state (void)
  */
 static void yyensure_buffer_stack (void)
 {
-	int num_to_alloc;
-    
+	yy_size_t num_to_alloc;
+
 	if (!(yy_buffer_stack)) {
 
 		/* First allocation is just for 2 elements, since we don't know if this
@@ -2561,12 +2747,14 @@ static void yyensure_buffer_stack (void)
 		 * immediate realloc on the next call.
          */
 		num_to_alloc = 1;
-		(yy_buffer_stack) = (struct yy_buffer_state**)yyalloc
+		(yy_buffer_stack) = (struct yy_buffer_state**) yyalloc
 								(num_to_alloc * sizeof(struct yy_buffer_state*)
 								);
-		
+		if ( ! (yy_buffer_stack) )
+			YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
 		memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*));
-				
+
 		(yy_buffer_stack_max) = num_to_alloc;
 		(yy_buffer_stack_top) = 0;
 		return;
@@ -2578,10 +2766,12 @@ static void yyensure_buffer_stack (void)
 		int grow_size = 8 /* arbitrary grow size */;
 
 		num_to_alloc = (yy_buffer_stack_max) + grow_size;
-		(yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc
+		(yy_buffer_stack) = (struct yy_buffer_state**) yyrealloc
 								((yy_buffer_stack),
 								num_to_alloc * sizeof(struct yy_buffer_state*)
 								);
+		if ( ! (yy_buffer_stack) )
+			YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
 
 		/* zero only the new slots.*/
 		memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*));
@@ -2619,87 +2809,115 @@ static void yy_fatal_error (yyconst char* msg )
 /* Accessor  methods (get/set functions) to struct members. */
 
 /** Get the current line number.
- * 
+ *
  */
-__attribute__((unused)) static int yyget_lineno  (void)
+int yyget_lineno  (void)
 {
-        
+
     return yylineno;
 }
 
 /** Get the input stream.
- * 
+ *
  */
-__attribute__((unused)) static FILE *yyget_in  (void)
+FILE *yyget_in  (void)
 {
         return yyin;
 }
 
 /** Get the output stream.
- * 
+ *
  */
-__attribute__((unused)) static FILE *yyget_out  (void)
+FILE *yyget_out  (void)
 {
         return yyout;
 }
 
 /** Get the length of the current token.
- * 
+ *
  */
-__attribute__((unused)) static int yyget_leng  (void)
+yy_size_t yyget_leng  (void)
 {
         return yyleng;
 }
 
 /** Get the current token.
- * 
+ *
  */
 
-__attribute__((unused)) static char *yyget_text  (void)
+char *yyget_text  (void)
 {
         return yytext;
 }
 
 /** Set the current line number.
  * @param line_number
- * 
+ *
  */
-__attribute__((unused)) static void yyset_lineno (int  line_number )
+void yyset_lineno (int  line_number )
 {
-    
+
     yylineno = line_number;
 }
 
 /** Set the input stream. This does not discard the current
  * input buffer.
  * @param in_str A readable stream.
- * 
+ *
  * @see yy_switch_to_buffer
  */
-__attribute__((unused)) static void yyset_in (FILE *  in_str )
+void yyset_in (FILE *  in_str )
 {
         yyin = in_str ;
 }
 
-__attribute__((unused)) static void yyset_out (FILE *  out_str )
+void yyset_out (FILE *  out_str )
 {
         yyout = out_str ;
 }
 
-__attribute__((unused)) static int yyget_debug  (void)
+int yyget_debug  (void)
 {
         return yy_flex_debug;
 }
 
-__attribute__((unused)) static void yyset_debug (int  bdebug )
+void yyset_debug (int  bdebug )
 {
         yy_flex_debug = bdebug ;
 }
 
+static int yy_init_globals (void)
+{
+        /* Initialization is the same as for the non-reentrant scanner.
+     * This function is called from yylex_destroy(), so don't allocate here.
+     */
+
+    (yy_buffer_stack) = 0;
+    (yy_buffer_stack_top) = 0;
+    (yy_buffer_stack_max) = 0;
+    (yy_c_buf_p) = (char*) 0;
+    (yy_init) = 0;
+    (yy_start) = 0;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+    yyin = stdin;
+    yyout = stdout;
+#else
+    yyin = (FILE*) 0;
+    yyout = (FILE*) 0;
+#endif
+
+    /* For future reference: Set errno on error, since we are called by
+     * yylex_init()
+     */
+    return 0;
+}
+
 /* yylex_destroy is for both reentrant and non-reentrant scanners. */
 __attribute__((unused)) static int yylex_destroy  (void)
 {
-    
+
     /* Pop the buffer stack, destroying each element. */
 	while(YY_CURRENT_BUFFER){
 		yy_delete_buffer(YY_CURRENT_BUFFER  );
@@ -2711,6 +2929,10 @@ __attribute__((unused)) static int yylex_destroy  (void)
 	yyfree((yy_buffer_stack) );
 	(yy_buffer_stack) = NULL;
 
+    /* Reset the globals. This is important in a non-reentrant scanner so the next time
+     * yylex() is called, initialization will occur. */
+    yy_init_globals( );
+
     return 0;
 }
 
@@ -2722,7 +2944,7 @@ __attribute__((unused)) static int yylex_destroy  (void)
 static void yy_flex_strncpy (char* s1, yyconst char * s2, int n )
 {
 	register int i;
-    	for ( i = 0; i < n; ++i )
+	for ( i = 0; i < n; ++i )
 		s1[i] = s2[i];
 }
 #endif
@@ -2731,51 +2953,38 @@ static void yy_flex_strncpy (char* s1, yyconst char * s2, int n )
 static int yy_flex_strlen (yyconst char * s )
 {
 	register int n;
-    	for ( n = 0; s[n]; ++n )
+	for ( n = 0; s[n]; ++n )
 		;
 
 	return n;
 }
 #endif
 
-static void *yyalloc (yy_size_t  size )
+void *yyalloc (yy_size_t  size )
 {
-	return (void *) malloc( size );
+	return (void*) malloc( size );
 }
 
-static void *yyrealloc  (void * ptr, yy_size_t  size )
+void *yyrealloc  (void * ptr, yy_size_t  size )
 {
-	/* The cast to (char *) in the following accommodates both
+	/* The cast to (char*) in the following accommodates both
 	 * implementations that use char* generic pointers, and those
 	 * that use void* generic pointers.  It works with the latter
 	 * because both ANSI C and C++ allow castless assignment from
 	 * any pointer type to void*, and deal with argument conversions
 	 * as though doing an assignment.
 	 */
-	return (void *) realloc( (char *) ptr, size );
+	return (void*) realloc( (char*) ptr, size );
 }
 
-static void yyfree (void * ptr )
+void yyfree (void * ptr )
 {
-	free( (char *) ptr );	/* see yyrealloc() for (char *) cast */
+	free( (char*) ptr );	/* see yyrealloc() for (char*) cast */
 }
 
 #define YYTABLES_NAME "yytables"
 
-#undef YY_NEW_FILE
-#undef YY_FLUSH_BUFFER
-#undef yy_set_bol
-#undef yy_new_buffer
-#undef yy_set_interactive
-#undef yytext_ptr
-#undef YY_DO_BEFORE_ACTION
-
-#ifdef YY_DECL_IS_OURS
-#undef YY_DECL_IS_OURS
-#undef YY_DECL
-#endif
-#line 648 "pars0lex.l"
-
+#line 691 "pars0lex.l"
 
 
 
@@ -2791,3 +3000,4 @@ pars_lexer_close(void)
 	stringbuf = NULL;
 	stringbuf_len_alloc = stringbuf_len = 0;
 }
+
diff --git a/storage/innobase/pars/make_bison.sh b/storage/innobase/pars/make_bison.sh
index 09bb86e3106..2618be102bc 100755
--- a/storage/innobase/pars/make_bison.sh
+++ b/storage/innobase/pars/make_bison.sh
@@ -1,24 +1,24 @@
 #!/bin/bash
 #
-# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
-# 
+# Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+#
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation; version 2 of the License.
-# 
+#
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-# Place, Suite 330, Boston, MA 02111-1307 USA
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 #
 # generate parser files from bison input files.
 
 set -eu
 TMPFILE=pars0grm.tab.c
-OUTFILE=pars0grm.c
+OUTFILE=pars0grm.cc
 
 bison -d pars0grm.y
 mv pars0grm.tab.h ../include/pars0grm.h
diff --git a/storage/innobase/pars/make_flex.sh b/storage/innobase/pars/make_flex.sh
index 89308a6636f..581fc2342aa 100755
--- a/storage/innobase/pars/make_flex.sh
+++ b/storage/innobase/pars/make_flex.sh
@@ -1,25 +1,25 @@
 #!/bin/bash
 #
-# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
-# 
+# Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+#
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation; version 2 of the License.
-# 
+#
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-# Place, Suite 330, Boston, MA 02111-1307 USA
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 #
 # generate lexer files from flex input files.
 
 set -eu
 
-TMPFILE=_flex_tmp.c
-OUTFILE=lexyy.c
+TMPFILE=_flex_tmp.cc
+OUTFILE=lexyy.cc
 
 flex -o $TMPFILE pars0lex.l
 
diff --git a/storage/innobase/pars/pars0grm.c b/storage/innobase/pars/pars0grm.c
deleted file mode 100644
index d667970735e..00000000000
--- a/storage/innobase/pars/pars0grm.c
+++ /dev/null
@@ -1,2601 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
-Copyright (c) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004 Free Software
-Foundation, Inc.
-
-As a special exception, when this file is copied by Bison into a
-Bison output file, you may use that output file without restriction.
-This special exception was added by the Free Software Foundation
-in version 1.24 of Bison.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
-
-/* A Bison parser, made by GNU Bison 2.0.  */
-
-/* Written by Richard Stallman by simplifying the original so called
-   ``semantic'' parser.  */
-
-/* All symbols defined below should begin with yy or YY, to avoid
-   infringing on user name space.  This should be done even for local
-   variables, as they might otherwise be expanded by user macros.
-   There are some unavoidable exceptions within include files to
-   define necessary library symbols; they are noted "INFRINGES ON
-   USER NAME SPACE" below.  */
-
-/* Identify Bison output.  */
-#define YYBISON 1
-
-/* Skeleton name.  */
-#define YYSKELETON_NAME "yacc.c"
-
-/* Pure parsers.  */
-#define YYPURE 0
-
-/* Using locations.  */
-#define YYLSP_NEEDED 0
-
-
-
-/* Tokens.  */
-#ifndef YYTOKENTYPE
-# define YYTOKENTYPE
-   /* Put the tokens into the symbol table, so that GDB and other debuggers
-      know about them.  */
-   enum yytokentype {
-     PARS_INT_LIT = 258,
-     PARS_FLOAT_LIT = 259,
-     PARS_STR_LIT = 260,
-     PARS_FIXBINARY_LIT = 261,
-     PARS_BLOB_LIT = 262,
-     PARS_NULL_LIT = 263,
-     PARS_ID_TOKEN = 264,
-     PARS_AND_TOKEN = 265,
-     PARS_OR_TOKEN = 266,
-     PARS_NOT_TOKEN = 267,
-     PARS_GE_TOKEN = 268,
-     PARS_LE_TOKEN = 269,
-     PARS_NE_TOKEN = 270,
-     PARS_PROCEDURE_TOKEN = 271,
-     PARS_IN_TOKEN = 272,
-     PARS_OUT_TOKEN = 273,
-     PARS_BINARY_TOKEN = 274,
-     PARS_BLOB_TOKEN = 275,
-     PARS_INT_TOKEN = 276,
-     PARS_INTEGER_TOKEN = 277,
-     PARS_FLOAT_TOKEN = 278,
-     PARS_CHAR_TOKEN = 279,
-     PARS_IS_TOKEN = 280,
-     PARS_BEGIN_TOKEN = 281,
-     PARS_END_TOKEN = 282,
-     PARS_IF_TOKEN = 283,
-     PARS_THEN_TOKEN = 284,
-     PARS_ELSE_TOKEN = 285,
-     PARS_ELSIF_TOKEN = 286,
-     PARS_LOOP_TOKEN = 287,
-     PARS_WHILE_TOKEN = 288,
-     PARS_RETURN_TOKEN = 289,
-     PARS_SELECT_TOKEN = 290,
-     PARS_SUM_TOKEN = 291,
-     PARS_COUNT_TOKEN = 292,
-     PARS_DISTINCT_TOKEN = 293,
-     PARS_FROM_TOKEN = 294,
-     PARS_WHERE_TOKEN = 295,
-     PARS_FOR_TOKEN = 296,
-     PARS_DDOT_TOKEN = 297,
-     PARS_READ_TOKEN = 298,
-     PARS_ORDER_TOKEN = 299,
-     PARS_BY_TOKEN = 300,
-     PARS_ASC_TOKEN = 301,
-     PARS_DESC_TOKEN = 302,
-     PARS_INSERT_TOKEN = 303,
-     PARS_INTO_TOKEN = 304,
-     PARS_VALUES_TOKEN = 305,
-     PARS_UPDATE_TOKEN = 306,
-     PARS_SET_TOKEN = 307,
-     PARS_DELETE_TOKEN = 308,
-     PARS_CURRENT_TOKEN = 309,
-     PARS_OF_TOKEN = 310,
-     PARS_CREATE_TOKEN = 311,
-     PARS_TABLE_TOKEN = 312,
-     PARS_INDEX_TOKEN = 313,
-     PARS_UNIQUE_TOKEN = 314,
-     PARS_CLUSTERED_TOKEN = 315,
-     PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316,
-     PARS_ON_TOKEN = 317,
-     PARS_ASSIGN_TOKEN = 318,
-     PARS_DECLARE_TOKEN = 319,
-     PARS_CURSOR_TOKEN = 320,
-     PARS_SQL_TOKEN = 321,
-     PARS_OPEN_TOKEN = 322,
-     PARS_FETCH_TOKEN = 323,
-     PARS_CLOSE_TOKEN = 324,
-     PARS_NOTFOUND_TOKEN = 325,
-     PARS_TO_CHAR_TOKEN = 326,
-     PARS_TO_NUMBER_TOKEN = 327,
-     PARS_TO_BINARY_TOKEN = 328,
-     PARS_BINARY_TO_NUMBER_TOKEN = 329,
-     PARS_SUBSTR_TOKEN = 330,
-     PARS_REPLSTR_TOKEN = 331,
-     PARS_CONCAT_TOKEN = 332,
-     PARS_INSTR_TOKEN = 333,
-     PARS_LENGTH_TOKEN = 334,
-     PARS_SYSDATE_TOKEN = 335,
-     PARS_PRINTF_TOKEN = 336,
-     PARS_ASSERT_TOKEN = 337,
-     PARS_RND_TOKEN = 338,
-     PARS_RND_STR_TOKEN = 339,
-     PARS_ROW_PRINTF_TOKEN = 340,
-     PARS_COMMIT_TOKEN = 341,
-     PARS_ROLLBACK_TOKEN = 342,
-     PARS_WORK_TOKEN = 343,
-     PARS_UNSIGNED_TOKEN = 344,
-     PARS_EXIT_TOKEN = 345,
-     PARS_FUNCTION_TOKEN = 346,
-     PARS_LOCK_TOKEN = 347,
-     PARS_SHARE_TOKEN = 348,
-     PARS_MODE_TOKEN = 349,
-     NEG = 350
-   };
-#endif
-#define PARS_INT_LIT 258
-#define PARS_FLOAT_LIT 259
-#define PARS_STR_LIT 260
-#define PARS_FIXBINARY_LIT 261
-#define PARS_BLOB_LIT 262
-#define PARS_NULL_LIT 263
-#define PARS_ID_TOKEN 264
-#define PARS_AND_TOKEN 265
-#define PARS_OR_TOKEN 266
-#define PARS_NOT_TOKEN 267
-#define PARS_GE_TOKEN 268
-#define PARS_LE_TOKEN 269
-#define PARS_NE_TOKEN 270
-#define PARS_PROCEDURE_TOKEN 271
-#define PARS_IN_TOKEN 272
-#define PARS_OUT_TOKEN 273
-#define PARS_BINARY_TOKEN 274
-#define PARS_BLOB_TOKEN 275
-#define PARS_INT_TOKEN 276
-#define PARS_INTEGER_TOKEN 277
-#define PARS_FLOAT_TOKEN 278
-#define PARS_CHAR_TOKEN 279
-#define PARS_IS_TOKEN 280
-#define PARS_BEGIN_TOKEN 281
-#define PARS_END_TOKEN 282
-#define PARS_IF_TOKEN 283
-#define PARS_THEN_TOKEN 284
-#define PARS_ELSE_TOKEN 285
-#define PARS_ELSIF_TOKEN 286
-#define PARS_LOOP_TOKEN 287
-#define PARS_WHILE_TOKEN 288
-#define PARS_RETURN_TOKEN 289
-#define PARS_SELECT_TOKEN 290
-#define PARS_SUM_TOKEN 291
-#define PARS_COUNT_TOKEN 292
-#define PARS_DISTINCT_TOKEN 293
-#define PARS_FROM_TOKEN 294
-#define PARS_WHERE_TOKEN 295
-#define PARS_FOR_TOKEN 296
-#define PARS_DDOT_TOKEN 297
-#define PARS_READ_TOKEN 298
-#define PARS_ORDER_TOKEN 299
-#define PARS_BY_TOKEN 300
-#define PARS_ASC_TOKEN 301
-#define PARS_DESC_TOKEN 302
-#define PARS_INSERT_TOKEN 303
-#define PARS_INTO_TOKEN 304
-#define PARS_VALUES_TOKEN 305
-#define PARS_UPDATE_TOKEN 306
-#define PARS_SET_TOKEN 307
-#define PARS_DELETE_TOKEN 308
-#define PARS_CURRENT_TOKEN 309
-#define PARS_OF_TOKEN 310
-#define PARS_CREATE_TOKEN 311
-#define PARS_TABLE_TOKEN 312
-#define PARS_INDEX_TOKEN 313
-#define PARS_UNIQUE_TOKEN 314
-#define PARS_CLUSTERED_TOKEN 315
-#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
-#define PARS_ON_TOKEN 317
-#define PARS_ASSIGN_TOKEN 318
-#define PARS_DECLARE_TOKEN 319
-#define PARS_CURSOR_TOKEN 320
-#define PARS_SQL_TOKEN 321
-#define PARS_OPEN_TOKEN 322
-#define PARS_FETCH_TOKEN 323
-#define PARS_CLOSE_TOKEN 324
-#define PARS_NOTFOUND_TOKEN 325
-#define PARS_TO_CHAR_TOKEN 326
-#define PARS_TO_NUMBER_TOKEN 327
-#define PARS_TO_BINARY_TOKEN 328
-#define PARS_BINARY_TO_NUMBER_TOKEN 329
-#define PARS_SUBSTR_TOKEN 330
-#define PARS_REPLSTR_TOKEN 331
-#define PARS_CONCAT_TOKEN 332
-#define PARS_INSTR_TOKEN 333
-#define PARS_LENGTH_TOKEN 334
-#define PARS_SYSDATE_TOKEN 335
-#define PARS_PRINTF_TOKEN 336
-#define PARS_ASSERT_TOKEN 337
-#define PARS_RND_TOKEN 338
-#define PARS_RND_STR_TOKEN 339
-#define PARS_ROW_PRINTF_TOKEN 340
-#define PARS_COMMIT_TOKEN 341
-#define PARS_ROLLBACK_TOKEN 342
-#define PARS_WORK_TOKEN 343
-#define PARS_UNSIGNED_TOKEN 344
-#define PARS_EXIT_TOKEN 345
-#define PARS_FUNCTION_TOKEN 346
-#define PARS_LOCK_TOKEN 347
-#define PARS_SHARE_TOKEN 348
-#define PARS_MODE_TOKEN 349
-#define NEG 350
-
-
-
-
-/* Copy the first part of user declarations.  */
-#line 13 "pars0grm.y"
-
-/* The value of the semantic attribute is a pointer to a query tree node
-que_node_t */
-
-#include "univ.i"
-#include <math.h>				/* Can't be before univ.i */
-#include "pars0pars.h"
-#include "mem0mem.h"
-#include "que0types.h"
-#include "que0que.h"
-#include "row0sel.h"
-
-#define YYSTYPE que_node_t*
-
-/* #define __STDC__ */
-
-int
-yylex(void);
-
-
-/* Enabling traces.  */
-#ifndef YYDEBUG
-# define YYDEBUG 0
-#endif
-
-/* Enabling verbose error messages.  */
-#ifdef YYERROR_VERBOSE
-# undef YYERROR_VERBOSE
-# define YYERROR_VERBOSE 1
-#else
-# define YYERROR_VERBOSE 0
-#endif
-
-#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
-typedef int YYSTYPE;
-# define yystype YYSTYPE /* obsolescent; will be withdrawn */
-# define YYSTYPE_IS_DECLARED 1
-# define YYSTYPE_IS_TRIVIAL 1
-#endif
-
-
-
-/* Copy the second part of user declarations.  */
-
-
-/* Line 213 of yacc.c.  */
-#line 297 "pars0grm.c"
-
-#if ! defined (yyoverflow) || YYERROR_VERBOSE
-
-# ifndef YYFREE
-#  define YYFREE free
-# endif
-# ifndef YYMALLOC
-#  define YYMALLOC malloc
-# endif
-
-/* The parser invokes alloca or malloc; define the necessary symbols.  */
-
-# ifdef YYSTACK_USE_ALLOCA
-#  if YYSTACK_USE_ALLOCA
-#   ifdef __GNUC__
-#    define YYSTACK_ALLOC __builtin_alloca
-#   else
-#    define YYSTACK_ALLOC alloca
-#   endif
-#  endif
-# endif
-
-# ifdef YYSTACK_ALLOC
-   /* Pacify GCC's `empty if-body' warning. */
-#  define YYSTACK_FREE(Ptr) do { /* empty */; } while (0)
-# else
-#  if defined (__STDC__) || defined (__cplusplus)
-#   include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
-#   define YYSIZE_T size_t
-#  endif
-#  define YYSTACK_ALLOC YYMALLOC
-#  define YYSTACK_FREE YYFREE
-# endif
-#endif /* ! defined (yyoverflow) || YYERROR_VERBOSE */
-
-
-#if (! defined (yyoverflow) \
-     && (! defined (__cplusplus) \
-	 || (defined (YYSTYPE_IS_TRIVIAL) && YYSTYPE_IS_TRIVIAL)))
-
-/* A type that is properly aligned for any stack member.  */
-union yyalloc
-{
-  short int yyss;
-  YYSTYPE yyvs;
-  };
-
-/* The size of the maximum gap between one aligned stack and the next.  */
-# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
-
-/* The size of an array large to enough to hold all stacks, each with
-   N elements.  */
-# define YYSTACK_BYTES(N) \
-     ((N) * (sizeof (short int) + sizeof (YYSTYPE))			\
-      + YYSTACK_GAP_MAXIMUM)
-
-/* Copy COUNT objects from FROM to TO.  The source and destination do
-   not overlap.  */
-# ifndef YYCOPY
-#  if defined (__GNUC__) && 1 < __GNUC__
-#   define YYCOPY(To, From, Count) \
-      __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
-#  else
-#   define YYCOPY(To, From, Count)		\
-      do					\
-	{					\
-	  register YYSIZE_T yyi;		\
-	  for (yyi = 0; yyi < (Count); yyi++)	\
-	    (To)[yyi] = (From)[yyi];		\
-	}					\
-      while (0)
-#  endif
-# endif
-
-/* Relocate STACK from its old location to the new one.  The
-   local variables YYSIZE and YYSTACKSIZE give the old and new number of
-   elements in the stack, and YYPTR gives the new location of the
-   stack.  Advance YYPTR to a properly aligned location for the next
-   stack.  */
-# define YYSTACK_RELOCATE(Stack)					\
-    do									\
-      {									\
-	YYSIZE_T yynewbytes;						\
-	YYCOPY (&yyptr->Stack, Stack, yysize);				\
-	Stack = &yyptr->Stack;						\
-	yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
-	yyptr += yynewbytes / sizeof (*yyptr);				\
-      }									\
-    while (0)
-
-#endif
-
-#if defined (__STDC__) || defined (__cplusplus)
-   typedef signed char yysigned_char;
-#else
-   typedef short int yysigned_char;
-#endif
-
-/* YYFINAL -- State number of the termination state. */
-#define YYFINAL  5
-/* YYLAST -- Last index in YYTABLE.  */
-#define YYLAST   752
-
-/* YYNTOKENS -- Number of terminals. */
-#define YYNTOKENS  111
-/* YYNNTS -- Number of nonterminals. */
-#define YYNNTS  70
-/* YYNRULES -- Number of rules. */
-#define YYNRULES  175
-/* YYNRULES -- Number of states. */
-#define YYNSTATES  339
-
-/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX.  */
-#define YYUNDEFTOK  2
-#define YYMAXUTOK   350
-
-#define YYTRANSLATE(YYX) 						\
-  ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
-
-/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX.  */
-static const unsigned char yytranslate[] =
-{
-       0,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,   103,     2,     2,
-     105,   106,   100,    99,   108,    98,     2,   101,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,   104,
-      96,    95,    97,   107,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,   109,     2,   110,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     1,     2,     3,     4,
-       5,     6,     7,     8,     9,    10,    11,    12,    13,    14,
-      15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
-      25,    26,    27,    28,    29,    30,    31,    32,    33,    34,
-      35,    36,    37,    38,    39,    40,    41,    42,    43,    44,
-      45,    46,    47,    48,    49,    50,    51,    52,    53,    54,
-      55,    56,    57,    58,    59,    60,    61,    62,    63,    64,
-      65,    66,    67,    68,    69,    70,    71,    72,    73,    74,
-      75,    76,    77,    78,    79,    80,    81,    82,    83,    84,
-      85,    86,    87,    88,    89,    90,    91,    92,    93,    94,
-     102
-};
-
-#if YYDEBUG
-/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
-   YYRHS.  */
-static const unsigned short int yyprhs[] =
-{
-       0,     0,     3,     6,     8,    11,    14,    17,    20,    23,
-      26,    29,    32,    35,    38,    41,    44,    47,    50,    53,
-      56,    59,    62,    65,    68,    71,    73,    76,    78,    83,
-      85,    87,    89,    91,    93,    95,    97,   101,   105,   109,
-     113,   116,   120,   124,   128,   132,   136,   140,   144,   148,
-     152,   155,   159,   163,   165,   167,   169,   171,   173,   175,
-     177,   179,   181,   183,   185,   186,   188,   192,   199,   204,
-     206,   208,   210,   214,   216,   220,   221,   223,   227,   228,
-     230,   234,   236,   241,   247,   252,   253,   255,   259,   261,
-     265,   267,   268,   271,   272,   275,   276,   281,   282,   284,
-     286,   287,   292,   301,   305,   311,   314,   318,   320,   324,
-     329,   334,   337,   340,   344,   347,   350,   353,   357,   362,
-     364,   367,   368,   371,   373,   381,   388,   399,   401,   403,
-     406,   409,   414,   419,   425,   427,   431,   432,   436,   437,
-     439,   440,   443,   444,   446,   454,   456,   460,   461,   463,
-     464,   466,   477,   480,   483,   485,   487,   489,   491,   493,
-     497,   501,   502,   504,   508,   512,   513,   515,   518,   525,
-     530,   532,   534,   535,   537,   540
-};
-
-/* YYRHS -- A `-1'-separated list of the rules' RHS. */
-static const short int yyrhs[] =
-{
-     112,     0,    -1,   180,   104,    -1,   118,    -1,   119,   104,
-      -1,   151,   104,    -1,   152,   104,    -1,   153,   104,    -1,
-     150,   104,    -1,   154,   104,    -1,   146,   104,    -1,   133,
-     104,    -1,   135,   104,    -1,   145,   104,    -1,   143,   104,
-      -1,   144,   104,    -1,   140,   104,    -1,   141,   104,    -1,
-     155,   104,    -1,   157,   104,    -1,   156,   104,    -1,   169,
-     104,    -1,   170,   104,    -1,   164,   104,    -1,   168,   104,
-      -1,   113,    -1,   114,   113,    -1,     9,    -1,   116,   105,
-     124,   106,    -1,     3,    -1,     4,    -1,     5,    -1,     6,
-      -1,     7,    -1,     8,    -1,    66,    -1,   115,    99,   115,
-      -1,   115,    98,   115,    -1,   115,   100,   115,    -1,   115,
-     101,   115,    -1,    98,   115,    -1,   105,   115,   106,    -1,
-     115,    95,   115,    -1,   115,    96,   115,    -1,   115,    97,
-     115,    -1,   115,    13,   115,    -1,   115,    14,   115,    -1,
-     115,    15,   115,    -1,   115,    10,   115,    -1,   115,    11,
-     115,    -1,    12,   115,    -1,     9,   103,    70,    -1,    66,
-     103,    70,    -1,    71,    -1,    72,    -1,    73,    -1,    74,
-      -1,    75,    -1,    77,    -1,    78,    -1,    79,    -1,    80,
-      -1,    83,    -1,    84,    -1,    -1,   107,    -1,   117,   108,
-     107,    -1,   109,     9,   105,   117,   106,   110,    -1,   120,
-     105,   124,   106,    -1,    76,    -1,    81,    -1,    82,    -1,
-       9,   105,   106,    -1,     9,    -1,   122,   108,     9,    -1,
-      -1,     9,    -1,   123,   108,     9,    -1,    -1,   115,    -1,
-     124,   108,   115,    -1,   115,    -1,    37,   105,   100,   106,
-      -1,    37,   105,    38,     9,   106,    -1,    36,   105,   115,
-     106,    -1,    -1,   125,    -1,   126,   108,   125,    -1,   100,
-      -1,   126,    49,   123,    -1,   126,    -1,    -1,    40,   115,
-      -1,    -1,    41,    51,    -1,    -1,    92,    17,    93,    94,
-      -1,    -1,    46,    -1,    47,    -1,    -1,    44,    45,     9,
-     131,    -1,    35,   127,    39,   122,   128,   129,   130,   132,
-      -1,    48,    49,     9,    -1,   134,    50,   105,   124,   106,
-      -1,   134,   133,    -1,     9,    95,   115,    -1,   136,    -1,
-     137,   108,   136,    -1,    40,    54,    55,     9,    -1,    51,
-       9,    52,   137,    -1,   139,   128,    -1,   139,   138,    -1,
-      53,    39,     9,    -1,   142,   128,    -1,   142,   138,    -1,
-      85,   133,    -1,     9,    63,   115,    -1,    31,   115,    29,
-     114,    -1,   147,    -1,   148,   147,    -1,    -1,    30,   114,
-      -1,   148,    -1,    28,   115,    29,   114,   149,    27,    28,
-      -1,    33,   115,    32,   114,    27,    32,    -1,    41,     9,
-      17,   115,    42,   115,    32,   114,    27,    32,    -1,    90,
-      -1,    34,    -1,    67,     9,    -1,    69,     9,    -1,    68,
-       9,    49,   123,    -1,    68,     9,    49,   121,    -1,     9,
-     171,   160,   161,   162,    -1,   158,    -1,   159,   108,   158,
-      -1,    -1,   105,     3,   106,    -1,    -1,    89,    -1,    -1,
-      12,     8,    -1,    -1,    61,    -1,    56,    57,     9,   105,
-     159,   106,   163,    -1,     9,    -1,   165,   108,     9,    -1,
-      -1,    59,    -1,    -1,    60,    -1,    56,   166,   167,    58,
-       9,    62,     9,   105,   165,   106,    -1,    86,    88,    -1,
-      87,    88,    -1,    21,    -1,    22,    -1,    24,    -1,    19,
-      -1,    20,    -1,     9,    17,   171,    -1,     9,    18,   171,
-      -1,    -1,   172,    -1,   173,   108,   172,    -1,     9,   171,
-     104,    -1,    -1,   174,    -1,   175,   174,    -1,    64,    65,
-       9,    25,   133,   104,    -1,    64,    91,     9,   104,    -1,
-     176,    -1,   177,    -1,    -1,   178,    -1,   179,   178,    -1,
-      16,     9,   105,   173,   106,    25,   175,   179,    26,   114,
-      27,    -1
-};
-
-/* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
-static const unsigned short int yyrline[] =
-{
-       0,   138,   138,   141,   142,   143,   144,   145,   146,   147,
-     148,   149,   150,   151,   152,   153,   154,   155,   156,   157,
-     158,   159,   160,   161,   162,   166,   167,   172,   173,   175,
-     176,   177,   178,   179,   180,   181,   182,   183,   184,   185,
-     186,   187,   188,   189,   190,   191,   192,   193,   194,   195,
-     196,   197,   199,   204,   205,   206,   207,   209,   210,   211,
-     212,   213,   214,   215,   218,   220,   221,   225,   230,   235,
-     236,   237,   241,   245,   246,   251,   252,   253,   258,   259,
-     260,   264,   265,   270,   276,   283,   284,   285,   290,   292,
-     294,   298,   299,   303,   304,   309,   310,   315,   316,   317,
-     321,   322,   327,   337,   342,   344,   349,   353,   354,   359,
-     365,   372,   377,   382,   388,   393,   398,   403,   408,   414,
-     415,   420,   421,   423,   427,   434,   440,   448,   452,   456,
-     462,   468,   470,   475,   480,   481,   486,   487,   492,   493,
-     499,   500,   506,   507,   513,   519,   520,   525,   526,   530,
-     531,   535,   543,   548,   553,   554,   555,   556,   557,   561,
-     564,   570,   571,   572,   577,   581,   583,   584,   588,   594,
-     599,   600,   603,   605,   606,   610
-};
-#endif
-
-#if YYDEBUG || YYERROR_VERBOSE
-/* YYTNME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
-   First, the terminals, then, starting at YYNTOKENS, nonterminals. */
-static const char *const yytname[] =
-{
-  "$end", "error", "$undefined", "PARS_INT_LIT", "PARS_FLOAT_LIT",
-  "PARS_STR_LIT", "PARS_FIXBINARY_LIT", "PARS_BLOB_LIT", "PARS_NULL_LIT",
-  "PARS_ID_TOKEN", "PARS_AND_TOKEN", "PARS_OR_TOKEN", "PARS_NOT_TOKEN",
-  "PARS_GE_TOKEN", "PARS_LE_TOKEN", "PARS_NE_TOKEN",
-  "PARS_PROCEDURE_TOKEN", "PARS_IN_TOKEN", "PARS_OUT_TOKEN",
-  "PARS_BINARY_TOKEN", "PARS_BLOB_TOKEN", "PARS_INT_TOKEN",
-  "PARS_INTEGER_TOKEN", "PARS_FLOAT_TOKEN", "PARS_CHAR_TOKEN",
-  "PARS_IS_TOKEN", "PARS_BEGIN_TOKEN", "PARS_END_TOKEN", "PARS_IF_TOKEN",
-  "PARS_THEN_TOKEN", "PARS_ELSE_TOKEN", "PARS_ELSIF_TOKEN",
-  "PARS_LOOP_TOKEN", "PARS_WHILE_TOKEN", "PARS_RETURN_TOKEN",
-  "PARS_SELECT_TOKEN", "PARS_SUM_TOKEN", "PARS_COUNT_TOKEN",
-  "PARS_DISTINCT_TOKEN", "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN",
-  "PARS_FOR_TOKEN", "PARS_DDOT_TOKEN", "PARS_READ_TOKEN",
-  "PARS_ORDER_TOKEN", "PARS_BY_TOKEN", "PARS_ASC_TOKEN", "PARS_DESC_TOKEN",
-  "PARS_INSERT_TOKEN", "PARS_INTO_TOKEN", "PARS_VALUES_TOKEN",
-  "PARS_UPDATE_TOKEN", "PARS_SET_TOKEN", "PARS_DELETE_TOKEN",
-  "PARS_CURRENT_TOKEN", "PARS_OF_TOKEN", "PARS_CREATE_TOKEN",
-  "PARS_TABLE_TOKEN", "PARS_INDEX_TOKEN", "PARS_UNIQUE_TOKEN",
-  "PARS_CLUSTERED_TOKEN", "PARS_DOES_NOT_FIT_IN_MEM_TOKEN",
-  "PARS_ON_TOKEN", "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN",
-  "PARS_CURSOR_TOKEN", "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN",
-  "PARS_FETCH_TOKEN", "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN",
-  "PARS_TO_CHAR_TOKEN", "PARS_TO_NUMBER_TOKEN", "PARS_TO_BINARY_TOKEN",
-  "PARS_BINARY_TO_NUMBER_TOKEN", "PARS_SUBSTR_TOKEN", "PARS_REPLSTR_TOKEN",
-  "PARS_CONCAT_TOKEN", "PARS_INSTR_TOKEN", "PARS_LENGTH_TOKEN",
-  "PARS_SYSDATE_TOKEN", "PARS_PRINTF_TOKEN", "PARS_ASSERT_TOKEN",
-  "PARS_RND_TOKEN", "PARS_RND_STR_TOKEN", "PARS_ROW_PRINTF_TOKEN",
-  "PARS_COMMIT_TOKEN", "PARS_ROLLBACK_TOKEN", "PARS_WORK_TOKEN",
-  "PARS_UNSIGNED_TOKEN", "PARS_EXIT_TOKEN", "PARS_FUNCTION_TOKEN",
-  "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN", "PARS_MODE_TOKEN", "'='", "'<'",
-  "'>'", "'-'", "'+'", "'*'", "'/'", "NEG", "'%'", "';'", "'('", "')'",
-  "'?'", "','", "'{'", "'}'", "$accept", "top_statement", "statement",
-  "statement_list", "exp", "function_name", "question_mark_list",
-  "stored_procedure_call", "predefined_procedure_call",
-  "predefined_procedure_name", "user_function_call", "table_list",
-  "variable_list", "exp_list", "select_item", "select_item_list",
-  "select_list", "search_condition", "for_update_clause",
-  "lock_shared_clause", "order_direction", "order_by_clause",
-  "select_statement", "insert_statement_start", "insert_statement",
-  "column_assignment", "column_assignment_list", "cursor_positioned",
-  "update_statement_start", "update_statement_searched",
-  "update_statement_positioned", "delete_statement_start",
-  "delete_statement_searched", "delete_statement_positioned",
-  "row_printf_statement", "assignment_statement", "elsif_element",
-  "elsif_list", "else_part", "if_statement", "while_statement",
-  "for_statement", "exit_statement", "return_statement",
-  "open_cursor_statement", "close_cursor_statement", "fetch_statement",
-  "column_def", "column_def_list", "opt_column_len", "opt_unsigned",
-  "opt_not_null", "not_fit_in_memory", "create_table", "column_list",
-  "unique_def", "clustered_def", "create_index", "commit_statement",
-  "rollback_statement", "type_name", "parameter_declaration",
-  "parameter_declaration_list", "variable_declaration",
-  "variable_declaration_list", "cursor_declaration",
-  "function_declaration", "declaration", "declaration_list",
-  "procedure_definition", 0
-};
-#endif
-
-# ifdef YYPRINT
-/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
-   token YYLEX-NUM.  */
-static const unsigned short int yytoknum[] =
-{
-       0,   256,   257,   258,   259,   260,   261,   262,   263,   264,
-     265,   266,   267,   268,   269,   270,   271,   272,   273,   274,
-     275,   276,   277,   278,   279,   280,   281,   282,   283,   284,
-     285,   286,   287,   288,   289,   290,   291,   292,   293,   294,
-     295,   296,   297,   298,   299,   300,   301,   302,   303,   304,
-     305,   306,   307,   308,   309,   310,   311,   312,   313,   314,
-     315,   316,   317,   318,   319,   320,   321,   322,   323,   324,
-     325,   326,   327,   328,   329,   330,   331,   332,   333,   334,
-     335,   336,   337,   338,   339,   340,   341,   342,   343,   344,
-     345,   346,   347,   348,   349,    61,    60,    62,    45,    43,
-      42,    47,   350,    37,    59,    40,    41,    63,    44,   123,
-     125
-};
-# endif
-
-/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives.  */
-static const unsigned char yyr1[] =
-{
-       0,   111,   112,   113,   113,   113,   113,   113,   113,   113,
-     113,   113,   113,   113,   113,   113,   113,   113,   113,   113,
-     113,   113,   113,   113,   113,   114,   114,   115,   115,   115,
-     115,   115,   115,   115,   115,   115,   115,   115,   115,   115,
-     115,   115,   115,   115,   115,   115,   115,   115,   115,   115,
-     115,   115,   115,   116,   116,   116,   116,   116,   116,   116,
-     116,   116,   116,   116,   117,   117,   117,   118,   119,   120,
-     120,   120,   121,   122,   122,   123,   123,   123,   124,   124,
-     124,   125,   125,   125,   125,   126,   126,   126,   127,   127,
-     127,   128,   128,   129,   129,   130,   130,   131,   131,   131,
-     132,   132,   133,   134,   135,   135,   136,   137,   137,   138,
-     139,   140,   141,   142,   143,   144,   145,   146,   147,   148,
-     148,   149,   149,   149,   150,   151,   152,   153,   154,   155,
-     156,   157,   157,   158,   159,   159,   160,   160,   161,   161,
-     162,   162,   163,   163,   164,   165,   165,   166,   166,   167,
-     167,   168,   169,   170,   171,   171,   171,   171,   171,   172,
-     172,   173,   173,   173,   174,   175,   175,   175,   176,   177,
-     178,   178,   179,   179,   179,   180
-};
-
-/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN.  */
-static const unsigned char yyr2[] =
-{
-       0,     2,     2,     1,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
-       2,     2,     2,     2,     2,     1,     2,     1,     4,     1,
-       1,     1,     1,     1,     1,     1,     3,     3,     3,     3,
-       2,     3,     3,     3,     3,     3,     3,     3,     3,     3,
-       2,     3,     3,     1,     1,     1,     1,     1,     1,     1,
-       1,     1,     1,     1,     0,     1,     3,     6,     4,     1,
-       1,     1,     3,     1,     3,     0,     1,     3,     0,     1,
-       3,     1,     4,     5,     4,     0,     1,     3,     1,     3,
-       1,     0,     2,     0,     2,     0,     4,     0,     1,     1,
-       0,     4,     8,     3,     5,     2,     3,     1,     3,     4,
-       4,     2,     2,     3,     2,     2,     2,     3,     4,     1,
-       2,     0,     2,     1,     7,     6,    10,     1,     1,     2,
-       2,     4,     4,     5,     1,     3,     0,     3,     0,     1,
-       0,     2,     0,     1,     7,     1,     3,     0,     1,     0,
-       1,    10,     2,     2,     1,     1,     1,     1,     1,     3,
-       3,     0,     1,     3,     3,     0,     1,     2,     6,     4,
-       1,     1,     0,     1,     2,    11
-};
-
-/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state
-   STATE-NUM when YYTABLE doesn't specify something else to do.  Zero
-   means the default is an error.  */
-static const unsigned char yydefact[] =
-{
-       0,     0,     0,     0,     0,     1,     2,   161,     0,   162,
-       0,     0,     0,     0,     0,   157,   158,   154,   155,   156,
-     159,   160,   165,   163,     0,   166,   172,     0,     0,   167,
-     170,   171,   173,     0,   164,     0,     0,     0,   174,     0,
-       0,     0,     0,     0,   128,    85,     0,     0,     0,     0,
-     147,     0,     0,     0,    69,    70,    71,     0,     0,     0,
-     127,     0,    25,     0,     3,     0,     0,     0,     0,     0,
-      91,     0,     0,    91,     0,     0,     0,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
-       0,   169,     0,    29,    30,    31,    32,    33,    34,    27,
-       0,    35,    53,    54,    55,    56,    57,    58,    59,    60,
-      61,    62,    63,     0,     0,     0,     0,     0,     0,     0,
-      88,    81,    86,    90,     0,     0,     0,     0,     0,     0,
-     148,   149,   129,     0,   130,   116,   152,   153,     0,   175,
-      26,     4,    78,    11,     0,   105,    12,     0,   111,   112,
-      16,    17,   114,   115,    14,    15,    13,    10,     8,     5,
-       6,     7,     9,    18,    20,    19,    23,    24,    21,    22,
-       0,   117,     0,    50,     0,    40,     0,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
-      78,     0,     0,     0,    75,     0,     0,     0,   103,     0,
-     113,     0,   150,     0,    75,    64,    79,     0,    78,     0,
-      92,   168,    51,    52,    41,    48,    49,    45,    46,    47,
-     121,    42,    43,    44,    37,    36,    38,    39,     0,     0,
-       0,     0,     0,    76,    89,    87,    73,    91,     0,     0,
-     107,   110,     0,     0,    76,   132,   131,    65,     0,    68,
-       0,     0,     0,     0,     0,   119,   123,     0,    28,     0,
-      84,     0,    82,     0,     0,     0,    93,     0,     0,     0,
-       0,   134,     0,     0,     0,     0,     0,    80,   104,   109,
-     122,     0,   120,     0,   125,    83,    77,    74,     0,    95,
-       0,   106,   108,   136,   142,     0,     0,    72,    67,    66,
-       0,   124,    94,     0,   100,     0,     0,   138,   143,   144,
-     135,     0,   118,     0,     0,   102,     0,     0,   139,   140,
-       0,     0,     0,     0,   137,     0,   133,   145,     0,    96,
-      97,   126,   141,   151,     0,    98,    99,   101,   146
-};
-
-/* YYDEFGOTO[NTERM-NUM]. */
-static const short int yydefgoto[] =
-{
-      -1,     2,    62,    63,   206,   116,   248,    64,    65,    66,
-     245,   237,   234,   207,   122,   123,   124,   148,   289,   304,
-     337,   315,    67,    68,    69,   240,   241,   149,    70,    71,
-      72,    73,    74,    75,    76,    77,   255,   256,   257,    78,
-      79,    80,    81,    82,    83,    84,    85,   271,   272,   307,
-     319,   326,   309,    86,   328,   131,   203,    87,    88,    89,
-      20,     9,    10,    25,    26,    30,    31,    32,    33,     3
-};
-
-/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
-   STATE-NUM.  */
-#define YYPACT_NINF -177
-static const short int yypact[] =
-{
-      28,    38,    54,   -46,   -29,  -177,  -177,    56,    50,  -177,
-     -75,     8,     8,    46,    56,  -177,  -177,  -177,  -177,  -177,
-    -177,  -177,    63,  -177,     8,  -177,     2,   -26,   -51,  -177,
-    -177,  -177,  -177,   -13,  -177,    71,    72,   587,  -177,    57,
-     -21,    26,   272,   272,  -177,    13,    91,    55,    96,    67,
-     -22,    99,   100,   103,  -177,  -177,  -177,    75,    29,    35,
-    -177,   116,  -177,   396,  -177,    22,    23,    27,    -9,    30,
-      87,    31,    32,    87,    47,    49,    52,    58,    59,    60,
-      61,    62,    65,    66,    74,    77,    78,    86,    89,   102,
-      75,  -177,   272,  -177,  -177,  -177,  -177,  -177,  -177,    39,
-     272,    51,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
-    -177,  -177,  -177,   272,   272,   361,    25,   489,    45,    90,
-    -177,   651,  -177,   -39,    93,   142,   124,   108,   152,   170,
-    -177,   131,  -177,   143,  -177,  -177,  -177,  -177,    98,  -177,
-    -177,  -177,   272,  -177,   110,  -177,  -177,   256,  -177,  -177,
-    -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
-    -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
-     112,   651,   137,   101,   147,   204,    88,   272,   272,   272,
-     272,   272,   587,   272,   272,   272,   272,   272,   272,   272,
-     272,   587,   272,   -30,   211,   168,   212,   272,  -177,   213,
-    -177,   118,  -177,   167,   217,   122,   651,   -63,   272,   175,
-     651,  -177,  -177,  -177,  -177,   101,   101,    21,    21,   651,
-     332,    21,    21,    21,    -6,    -6,   204,   204,   -60,   460,
-     198,   222,   126,  -177,   125,  -177,  -177,   -33,   584,   140,
-    -177,   128,   228,   229,   139,  -177,   125,  -177,   -53,  -177,
-     272,   -49,   240,   587,   272,  -177,   224,   226,  -177,   225,
-    -177,   150,  -177,   258,   272,   260,   230,   272,   272,   213,
-       8,  -177,   -45,   208,   166,   164,   176,   651,  -177,  -177,
-     587,   631,  -177,   254,  -177,  -177,  -177,  -177,   234,   194,
-     638,   651,  -177,   182,   227,   228,   280,  -177,  -177,  -177,
-     587,  -177,  -177,   273,   247,   587,   289,   214,  -177,  -177,
-    -177,   195,   587,   209,   261,  -177,   524,   199,  -177,   295,
-     292,   215,   299,   279,  -177,   304,  -177,  -177,   -44,  -177,
-      -8,  -177,  -177,  -177,   305,  -177,  -177,  -177,  -177
-};
-
-/* YYPGOTO[NTERM-NUM].  */
-static const short int yypgoto[] =
-{
-    -177,  -177,   -62,  -176,   -40,  -177,  -177,  -177,  -177,  -177,
-    -177,  -177,   109,  -166,   120,  -177,  -177,   -69,  -177,  -177,
-    -177,  -177,   -34,  -177,  -177,    48,  -177,   243,  -177,  -177,
-    -177,  -177,  -177,  -177,  -177,  -177,    64,  -177,  -177,  -177,
-    -177,  -177,  -177,  -177,  -177,  -177,  -177,    24,  -177,  -177,
-    -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
-     -12,   307,  -177,   297,  -177,  -177,  -177,   285,  -177,  -177
-};
-
-/* YYTABLE[YYPACT[STATE-NUM]].  What to do in state STATE-NUM.  If
-   positive, shift that token.  If negative, reduce the rule which
-   number is the opposite.  If zero, do what YYDEFACT says.
-   If YYTABLE_NINF, syntax error.  */
-#define YYTABLE_NINF -1
-static const unsigned short int yytable[] =
-{
-      21,   140,   115,   117,   152,   121,   220,   264,   231,   181,
-     194,    24,    27,    37,    35,   229,    93,    94,    95,    96,
-      97,    98,    99,   135,   228,   100,    45,    15,    16,    17,
-      18,    13,    19,    14,   145,   129,   181,   130,   335,   336,
-      36,   144,   251,   249,     1,   250,   258,     4,   250,   118,
-     119,    28,   171,   275,     5,   276,   170,   278,     6,   250,
-     173,   294,   333,   295,   334,     8,    28,    11,    12,   195,
-     232,    22,    24,   175,   176,   265,     7,   280,    34,   101,
-      39,    40,    90,    91,   102,   103,   104,   105,   106,    92,
-     107,   108,   109,   110,   188,   189,   111,   112,   177,   178,
-     125,   179,   180,   181,   126,   127,   128,   210,   132,   133,
-      45,   113,   134,   120,   179,   180,   181,   136,   114,   186,
-     187,   188,   189,   137,   312,   138,   141,   147,   142,   316,
-     190,   143,   196,   198,   146,   150,   151,   215,   216,   217,
-     218,   219,   172,   221,   222,   223,   224,   225,   226,   227,
-     192,   154,   230,   155,   174,   121,   156,   238,   140,   197,
-     199,   200,   157,   158,   159,   160,   161,   140,   266,   162,
-     163,    93,    94,    95,    96,    97,    98,    99,   164,   201,
-     100,   165,   166,   183,   184,   185,   186,   187,   188,   189,
-     167,   202,   204,   168,   214,   193,   183,   184,   185,   186,
-     187,   188,   189,   205,   118,   119,   169,   212,   177,   178,
-     277,   179,   180,   181,   281,   208,   211,   213,   140,   181,
-     233,   236,   239,   242,   210,   243,   244,   290,   291,   247,
-     252,   261,   262,   263,   101,   268,   269,   270,   273,   102,
-     103,   104,   105,   106,   274,   107,   108,   109,   110,   279,
-     140,   111,   112,   283,   140,   254,   285,   284,   293,    93,
-      94,    95,    96,    97,    98,    99,   113,   286,   100,   287,
-     296,   288,   297,   114,   298,    93,    94,    95,    96,    97,
-      98,    99,   301,   299,   100,   302,   303,   306,   308,   311,
-     313,   314,   317,   183,   184,   185,   186,   187,   188,   189,
-     320,   327,   321,   318,   260,   324,   322,   325,   330,   329,
-     209,   331,   332,   246,   338,   235,   153,   292,    38,   310,
-     282,    23,   101,    29,     0,     0,     0,   102,   103,   104,
-     105,   106,     0,   107,   108,   109,   110,     0,   101,   111,
-     112,    41,     0,   102,   103,   104,   105,   106,     0,   107,
-     108,   109,   110,     0,   113,   111,   112,     0,     0,     0,
-      42,   114,   253,   254,     0,    43,    44,    45,     0,     0,
-     113,   177,   178,    46,   179,   180,   181,   114,     0,     0,
-      47,     0,     0,    48,     0,    49,     0,     0,    50,     0,
-     182,     0,     0,     0,     0,     0,     0,     0,     0,    51,
-      52,    53,     0,     0,     0,    41,     0,     0,    54,     0,
-       0,     0,     0,    55,    56,     0,     0,    57,    58,    59,
-       0,     0,    60,   139,    42,     0,     0,     0,     0,    43,
-      44,    45,     0,     0,     0,     0,     0,    46,     0,     0,
-       0,    61,     0,     0,    47,     0,     0,    48,     0,    49,
-       0,     0,    50,     0,     0,     0,   183,   184,   185,   186,
-     187,   188,   189,    51,    52,    53,     0,     0,     0,    41,
-       0,     0,    54,     0,     0,     0,     0,    55,    56,     0,
-       0,    57,    58,    59,     0,     0,    60,   259,    42,     0,
-       0,     0,     0,    43,    44,    45,     0,     0,     0,   177,
-     178,    46,   179,   180,   181,    61,     0,     0,    47,     0,
-       0,    48,     0,    49,     0,     0,    50,     0,     0,     0,
-       0,   191,     0,     0,     0,     0,     0,    51,    52,    53,
-       0,     0,     0,    41,     0,     0,    54,     0,     0,     0,
-       0,    55,    56,     0,     0,    57,    58,    59,     0,     0,
-      60,   323,    42,     0,     0,     0,     0,    43,    44,    45,
-       0,     0,     0,     0,     0,    46,     0,     0,     0,    61,
-       0,     0,    47,     0,     0,    48,     0,    49,     0,     0,
-      50,     0,     0,     0,   183,   184,   185,   186,   187,   188,
-     189,    51,    52,    53,   177,   178,    41,   179,   180,   181,
-      54,     0,     0,     0,     0,    55,    56,     0,     0,    57,
-      58,    59,     0,     0,    60,    42,     0,     0,     0,     0,
-      43,    44,    45,     0,     0,     0,   267,     0,    46,     0,
-       0,     0,     0,    61,     0,    47,     0,     0,    48,     0,
-      49,   177,   178,    50,   179,   180,   181,     0,   177,   178,
-       0,   179,   180,   181,    51,    52,    53,     0,     0,     0,
-     300,   177,   178,    54,   179,   180,   181,     0,    55,    56,
-     305,     0,    57,    58,    59,     0,     0,    60,     0,   183,
-     184,   185,   186,   187,   188,   189,     0,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,    61,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
-       0,     0,     0,     0,     0,     0,   183,   184,   185,   186,
-     187,   188,   189,   183,   184,   185,   186,   187,   188,   189,
-       0,     0,     0,     0,     0,     0,   183,   184,   185,   186,
-     187,   188,   189
-};
-
-static const short int yycheck[] =
-{
-      12,    63,    42,    43,    73,    45,   182,    40,    38,    15,
-      49,     9,    24,    26,    65,   191,     3,     4,     5,     6,
-       7,     8,     9,    57,   190,    12,    35,    19,    20,    21,
-      22,   106,    24,   108,    68,    57,    15,    59,    46,    47,
-      91,    50,   208,   106,    16,   108,   106,     9,   108,    36,
-      37,    64,    92,   106,     0,   108,    90,   106,   104,   108,
-     100,   106,   106,   108,   108,     9,    64,    17,    18,   108,
-     100,    25,     9,   113,   114,   108,   105,   253,   104,    66,
-       9,     9,    25,   104,    71,    72,    73,    74,    75,    63,
-      77,    78,    79,    80,   100,   101,    83,    84,    10,    11,
-       9,    13,    14,    15,    49,     9,    39,   147,     9,     9,
-      35,    98,     9,   100,    13,    14,    15,    88,   105,    98,
-      99,   100,   101,    88,   300,     9,   104,    40,   105,   305,
-     105,   104,    39,     9,   104,   104,   104,   177,   178,   179,
-     180,   181,   103,   183,   184,   185,   186,   187,   188,   189,
-     105,   104,   192,   104,   103,   195,   104,   197,   220,    17,
-      52,     9,   104,   104,   104,   104,   104,   229,   237,   104,
-     104,     3,     4,     5,     6,     7,     8,     9,   104,     9,
-      12,   104,   104,    95,    96,    97,    98,    99,   100,   101,
-     104,    60,    49,   104,   106,   105,    95,    96,    97,    98,
-      99,   100,   101,   105,    36,    37,   104,    70,    10,    11,
-     250,    13,    14,    15,   254,   105,   104,    70,   280,    15,
-       9,     9,     9,   105,   264,    58,     9,   267,   268,   107,
-      55,     9,   106,   108,    66,    95,   108,     9,     9,    71,
-      72,    73,    74,    75,   105,    77,    78,    79,    80,     9,
-     312,    83,    84,    27,   316,    31,   106,    32,   270,     3,
-       4,     5,     6,     7,     8,     9,    98,     9,    12,     9,
-      62,    41,   106,   105,   110,     3,     4,     5,     6,     7,
-       8,     9,    28,   107,    12,    51,    92,   105,    61,     9,
-      17,    44,     3,    95,    96,    97,    98,    99,   100,   101,
-     105,     9,    93,    89,   106,   106,    45,    12,     9,    94,
-      54,    32,     8,   204,     9,   195,    73,   269,    33,   295,
-     256,    14,    66,    26,    -1,    -1,    -1,    71,    72,    73,
-      74,    75,    -1,    77,    78,    79,    80,    -1,    66,    83,
-      84,     9,    -1,    71,    72,    73,    74,    75,    -1,    77,
-      78,    79,    80,    -1,    98,    83,    84,    -1,    -1,    -1,
-      28,   105,    30,    31,    -1,    33,    34,    35,    -1,    -1,
-      98,    10,    11,    41,    13,    14,    15,   105,    -1,    -1,
-      48,    -1,    -1,    51,    -1,    53,    -1,    -1,    56,    -1,
-      29,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    67,
-      68,    69,    -1,    -1,    -1,     9,    -1,    -1,    76,    -1,
-      -1,    -1,    -1,    81,    82,    -1,    -1,    85,    86,    87,
-      -1,    -1,    90,    27,    28,    -1,    -1,    -1,    -1,    33,
-      34,    35,    -1,    -1,    -1,    -1,    -1,    41,    -1,    -1,
-      -1,   109,    -1,    -1,    48,    -1,    -1,    51,    -1,    53,
-      -1,    -1,    56,    -1,    -1,    -1,    95,    96,    97,    98,
-      99,   100,   101,    67,    68,    69,    -1,    -1,    -1,     9,
-      -1,    -1,    76,    -1,    -1,    -1,    -1,    81,    82,    -1,
-      -1,    85,    86,    87,    -1,    -1,    90,    27,    28,    -1,
-      -1,    -1,    -1,    33,    34,    35,    -1,    -1,    -1,    10,
-      11,    41,    13,    14,    15,   109,    -1,    -1,    48,    -1,
-      -1,    51,    -1,    53,    -1,    -1,    56,    -1,    -1,    -1,
-      -1,    32,    -1,    -1,    -1,    -1,    -1,    67,    68,    69,
-      -1,    -1,    -1,     9,    -1,    -1,    76,    -1,    -1,    -1,
-      -1,    81,    82,    -1,    -1,    85,    86,    87,    -1,    -1,
-      90,    27,    28,    -1,    -1,    -1,    -1,    33,    34,    35,
-      -1,    -1,    -1,    -1,    -1,    41,    -1,    -1,    -1,   109,
-      -1,    -1,    48,    -1,    -1,    51,    -1,    53,    -1,    -1,
-      56,    -1,    -1,    -1,    95,    96,    97,    98,    99,   100,
-     101,    67,    68,    69,    10,    11,     9,    13,    14,    15,
-      76,    -1,    -1,    -1,    -1,    81,    82,    -1,    -1,    85,
-      86,    87,    -1,    -1,    90,    28,    -1,    -1,    -1,    -1,
-      33,    34,    35,    -1,    -1,    -1,    42,    -1,    41,    -1,
-      -1,    -1,    -1,   109,    -1,    48,    -1,    -1,    51,    -1,
-      53,    10,    11,    56,    13,    14,    15,    -1,    10,    11,
-      -1,    13,    14,    15,    67,    68,    69,    -1,    -1,    -1,
-      29,    10,    11,    76,    13,    14,    15,    -1,    81,    82,
-      32,    -1,    85,    86,    87,    -1,    -1,    90,    -1,    95,
-      96,    97,    98,    99,   100,   101,    -1,    -1,    -1,    -1,
-      -1,    -1,    -1,    -1,    -1,    -1,   109,    -1,    -1,    -1,
-      -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
-      -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
-      -1,    -1,    -1,    -1,    -1,    -1,    95,    96,    97,    98,
-      99,   100,   101,    95,    96,    97,    98,    99,   100,   101,
-      -1,    -1,    -1,    -1,    -1,    -1,    95,    96,    97,    98,
-      99,   100,   101
-};
-
-/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
-   symbol of state STATE-NUM.  */
-static const unsigned char yystos[] =
-{
-       0,    16,   112,   180,     9,     0,   104,   105,     9,   172,
-     173,    17,    18,   106,   108,    19,    20,    21,    22,    24,
-     171,   171,    25,   172,     9,   174,   175,   171,    64,   174,
-     176,   177,   178,   179,   104,    65,    91,    26,   178,     9,
-       9,     9,    28,    33,    34,    35,    41,    48,    51,    53,
-      56,    67,    68,    69,    76,    81,    82,    85,    86,    87,
-      90,   109,   113,   114,   118,   119,   120,   133,   134,   135,
-     139,   140,   141,   142,   143,   144,   145,   146,   150,   151,
-     152,   153,   154,   155,   156,   157,   164,   168,   169,   170,
-      25,   104,    63,     3,     4,     5,     6,     7,     8,     9,
-      12,    66,    71,    72,    73,    74,    75,    77,    78,    79,
-      80,    83,    84,    98,   105,   115,   116,   115,    36,    37,
-     100,   115,   125,   126,   127,     9,    49,     9,    39,    57,
-      59,   166,     9,     9,     9,   133,    88,    88,     9,    27,
-     113,   104,   105,   104,    50,   133,   104,    40,   128,   138,
-     104,   104,   128,   138,   104,   104,   104,   104,   104,   104,
-     104,   104,   104,   104,   104,   104,   104,   104,   104,   104,
-     133,   115,   103,   115,   103,   115,   115,    10,    11,    13,
-      14,    15,    29,    95,    96,    97,    98,    99,   100,   101,
-     105,    32,   105,   105,    49,   108,    39,    17,     9,    52,
-       9,     9,    60,   167,    49,   105,   115,   124,   105,    54,
-     115,   104,    70,    70,   106,   115,   115,   115,   115,   115,
-     114,   115,   115,   115,   115,   115,   115,   115,   124,   114,
-     115,    38,   100,     9,   123,   125,     9,   122,   115,     9,
-     136,   137,   105,    58,     9,   121,   123,   107,   117,   106,
-     108,   124,    55,    30,    31,   147,   148,   149,   106,    27,
-     106,     9,   106,   108,    40,   108,   128,    42,    95,   108,
-       9,   158,   159,     9,   105,   106,   108,   115,   106,     9,
-     114,   115,   147,    27,    32,   106,     9,     9,    41,   129,
-     115,   115,   136,   171,   106,   108,    62,   106,   110,   107,
-      29,    28,    51,    92,   130,    32,   105,   160,    61,   163,
-     158,     9,   114,    17,    44,   132,   114,     3,    89,   161,
-     105,    93,    45,    27,   106,    12,   162,     9,   165,    94,
-       9,    32,     8,   106,   108,    46,    47,   131,     9
-};
-
-#if ! defined (YYSIZE_T) && defined (__SIZE_TYPE__)
-# define YYSIZE_T __SIZE_TYPE__
-#endif
-#if ! defined (YYSIZE_T) && defined (size_t)
-# define YYSIZE_T size_t
-#endif
-#if ! defined (YYSIZE_T)
-# if defined (__STDC__) || defined (__cplusplus)
-#  include <stddef.h> /* INFRINGES ON USER NAME SPACE */
-#  define YYSIZE_T size_t
-# endif
-#endif
-#if ! defined (YYSIZE_T)
-# define YYSIZE_T unsigned int
-#endif
-
-#define yyerrok		(yyerrstatus = 0)
-#define yyclearin	(yychar = YYEMPTY)
-#define YYEMPTY		(-2)
-#define YYEOF		0
-
-#define YYACCEPT	goto yyacceptlab
-#define YYABORT		goto yyabortlab
-#define YYERROR		goto yyerrorlab
-
-
-/* Like YYERROR except do call yyerror.  This remains here temporarily
-   to ease the transition to the new meaning of YYERROR, for GCC.
-   Once GCC version 2 has supplanted version 1, this can go.  */
-
-#define YYFAIL		goto yyerrlab
-
-#define YYRECOVERING()  (!!yyerrstatus)
-
-#define YYBACKUP(Token, Value)					\
-do								\
-  if (yychar == YYEMPTY && yylen == 1)				\
-    {								\
-      yychar = (Token);						\
-      yylval = (Value);						\
-      yytoken = YYTRANSLATE (yychar);				\
-      YYPOPSTACK;						\
-      goto yybackup;						\
-    }								\
-  else								\
-    { 								\
-      yyerror ("syntax error: cannot back up");\
-      YYERROR;							\
-    }								\
-while (0)
-
-
-#define YYTERROR	1
-#define YYERRCODE	256
-
-
-/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
-   If N is 0, then set CURRENT to the empty location which ends
-   the previous symbol: RHS[0] (always defined).  */
-
-#define YYRHSLOC(Rhs, K) ((Rhs)[K])
-#ifndef YYLLOC_DEFAULT
-# define YYLLOC_DEFAULT(Current, Rhs, N)				\
-    do									\
-      if (N)								\
-	{								\
-	  (Current).first_line   = YYRHSLOC (Rhs, 1).first_line;	\
-	  (Current).first_column = YYRHSLOC (Rhs, 1).first_column;	\
-	  (Current).last_line    = YYRHSLOC (Rhs, N).last_line;		\
-	  (Current).last_column  = YYRHSLOC (Rhs, N).last_column;	\
-	}								\
-      else								\
-	{								\
-	  (Current).first_line   = (Current).last_line   =		\
-	    YYRHSLOC (Rhs, 0).last_line;				\
-	  (Current).first_column = (Current).last_column =		\
-	    YYRHSLOC (Rhs, 0).last_column;				\
-	}								\
-    while (0)
-#endif
-
-
-/* YY_LOCATION_PRINT -- Print the location on the stream.
-   This macro was not mandated originally: define only if we know
-   we won't break user code: when these are the locations we know.  */
-
-#ifndef YY_LOCATION_PRINT
-# if YYLTYPE_IS_TRIVIAL
-#  define YY_LOCATION_PRINT(File, Loc)			\
-     fprintf (File, "%d.%d-%d.%d",			\
-              (Loc).first_line, (Loc).first_column,	\
-              (Loc).last_line,  (Loc).last_column)
-# else
-#  define YY_LOCATION_PRINT(File, Loc) ((void) 0)
-# endif
-#endif
-
-
-/* YYLEX -- calling `yylex' with the right arguments.  */
-
-#ifdef YYLEX_PARAM
-# define YYLEX yylex (YYLEX_PARAM)
-#else
-# define YYLEX yylex ()
-#endif
-
-/* Enable debugging if requested.  */
-#if YYDEBUG
-
-# ifndef YYFPRINTF
-#  include <stdio.h> /* INFRINGES ON USER NAME SPACE */
-#  define YYFPRINTF fprintf
-# endif
-
-# define YYDPRINTF(Args)			\
-do {						\
-  if (yydebug)					\
-    YYFPRINTF Args;				\
-} while (0)
-
-# define YY_SYMBOL_PRINT(Title, Type, Value, Location)		\
-do {								\
-  if (yydebug)							\
-    {								\
-      YYFPRINTF (stderr, "%s ", Title);				\
-      yysymprint (stderr, 					\
-                  Type, Value);	\
-      YYFPRINTF (stderr, "\n");					\
-    }								\
-} while (0)
-
-/*------------------------------------------------------------------.
-| yy_stack_print -- Print the state stack from its BOTTOM up to its |
-| TOP (included).                                                   |
-`------------------------------------------------------------------*/
-
-#if defined (__STDC__) || defined (__cplusplus)
-static void
-yy_stack_print (short int *bottom, short int *top)
-#else
-static void
-yy_stack_print (bottom, top)
-    short int *bottom;
-    short int *top;
-#endif
-{
-  YYFPRINTF (stderr, "Stack now");
-  for (/* Nothing. */; bottom <= top; ++bottom)
-    YYFPRINTF (stderr, " %d", *bottom);
-  YYFPRINTF (stderr, "\n");
-}
-
-# define YY_STACK_PRINT(Bottom, Top)				\
-do {								\
-  if (yydebug)							\
-    yy_stack_print ((Bottom), (Top));				\
-} while (0)
-
-
-/*------------------------------------------------.
-| Report that the YYRULE is going to be reduced.  |
-`------------------------------------------------*/
-
-#if defined (__STDC__) || defined (__cplusplus)
-static void
-yy_reduce_print (int yyrule)
-#else
-static void
-yy_reduce_print (yyrule)
-    int yyrule;
-#endif
-{
-  int yyi;
-  unsigned int yylno = yyrline[yyrule];
-  YYFPRINTF (stderr, "Reducing stack by rule %d (line %u), ",
-             yyrule - 1, yylno);
-  /* Print the symbols being reduced, and their result.  */
-  for (yyi = yyprhs[yyrule]; 0 <= yyrhs[yyi]; yyi++)
-    YYFPRINTF (stderr, "%s ", yytname [yyrhs[yyi]]);
-  YYFPRINTF (stderr, "-> %s\n", yytname [yyr1[yyrule]]);
-}
-
-# define YY_REDUCE_PRINT(Rule)		\
-do {					\
-  if (yydebug)				\
-    yy_reduce_print (Rule);		\
-} while (0)
-
-/* Nonzero means print parse trace.  It is left uninitialized so that
-   multiple parsers can coexist.  */
-int yydebug;
-#else /* !YYDEBUG */
-# define YYDPRINTF(Args)
-# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
-# define YY_STACK_PRINT(Bottom, Top)
-# define YY_REDUCE_PRINT(Rule)
-#endif /* !YYDEBUG */
-
-
-/* YYINITDEPTH -- initial size of the parser's stacks.  */
-#ifndef	YYINITDEPTH
-# define YYINITDEPTH 200
-#endif
-
-/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
-   if the built-in stack extension method is used).
-
-   Do not make this value too large; the results are undefined if
-   SIZE_MAX < YYSTACK_BYTES (YYMAXDEPTH)
-   evaluated with infinite-precision integer arithmetic.  */
-
-#ifndef YYMAXDEPTH
-# define YYMAXDEPTH 10000
-#endif
-
-
-
-#if YYERROR_VERBOSE
-
-# ifndef yystrlen
-#  if defined (__GLIBC__) && defined (_STRING_H)
-#   define yystrlen strlen
-#  else
-/* Return the length of YYSTR.  */
-static YYSIZE_T
-#   if defined (__STDC__) || defined (__cplusplus)
-yystrlen (const char *yystr)
-#   else
-yystrlen (yystr)
-     const char *yystr;
-#   endif
-{
-  register const char *yys = yystr;
-
-  while (*yys++ != '\0')
-    continue;
-
-  return yys - yystr - 1;
-}
-#  endif
-# endif
-
-# ifndef yystpcpy
-#  if defined (__GLIBC__) && defined (_STRING_H) && defined (_GNU_SOURCE)
-#   define yystpcpy stpcpy
-#  else
-/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
-   YYDEST.  */
-static char *
-#   if defined (__STDC__) || defined (__cplusplus)
-yystpcpy (char *yydest, const char *yysrc)
-#   else
-yystpcpy (yydest, yysrc)
-     char *yydest;
-     const char *yysrc;
-#   endif
-{
-  register char *yyd = yydest;
-  register const char *yys = yysrc;
-
-  while ((*yyd++ = *yys++) != '\0')
-    continue;
-
-  return yyd - 1;
-}
-#  endif
-# endif
-
-#endif /* !YYERROR_VERBOSE */
-
-
-
-#if YYDEBUG
-/*--------------------------------.
-| Print this symbol on YYOUTPUT.  |
-`--------------------------------*/
-
-#if defined (__STDC__) || defined (__cplusplus)
-static void
-yysymprint (FILE *yyoutput, int yytype, YYSTYPE *yyvaluep)
-#else
-static void
-yysymprint (yyoutput, yytype, yyvaluep)
-    FILE *yyoutput;
-    int yytype;
-    YYSTYPE *yyvaluep;
-#endif
-{
-  /* Pacify ``unused variable'' warnings.  */
-  (void) yyvaluep;
-
-  if (yytype < YYNTOKENS)
-    YYFPRINTF (yyoutput, "token %s (", yytname[yytype]);
-  else
-    YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]);
-
-
-# ifdef YYPRINT
-  if (yytype < YYNTOKENS)
-    YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
-# endif
-  switch (yytype)
-    {
-      default:
-        break;
-    }
-  YYFPRINTF (yyoutput, ")");
-}
-
-#endif /* ! YYDEBUG */
-/*-----------------------------------------------.
-| Release the memory associated to this symbol.  |
-`-----------------------------------------------*/
-
-#if defined (__STDC__) || defined (__cplusplus)
-static void
-yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
-#else
-static void
-yydestruct (yymsg, yytype, yyvaluep)
-    const char *yymsg;
-    int yytype;
-    YYSTYPE *yyvaluep;
-#endif
-{
-  /* Pacify ``unused variable'' warnings.  */
-  (void) yyvaluep;
-
-  if (!yymsg)
-    yymsg = "Deleting";
-  YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
-
-  switch (yytype)
-    {
-
-      default:
-        break;
-    }
-}
-
-
-/* Prevent warnings from -Wmissing-prototypes.  */
-
-#ifdef YYPARSE_PARAM
-# if defined (__STDC__) || defined (__cplusplus)
-UNIV_INTERN int yyparse (void *YYPARSE_PARAM);
-# else
-UNIV_INTERN int yyparse ();
-# endif
-#else /* ! YYPARSE_PARAM */
-#if defined (__STDC__) || defined (__cplusplus)
-UNIV_INTERN int yyparse (void);
-#else
-UNIV_INTERN int yyparse ();
-#endif
-#endif /* ! YYPARSE_PARAM */
-
-
-
-/* The look-ahead symbol.  */
-static int yychar;
-
-/* The semantic value of the look-ahead symbol.  */
-UNIV_INTERN YYSTYPE yylval;
-
-/* Number of syntax errors so far.  */
-static int yynerrs;
-
-
-
-/*----------.
-| yyparse.  |
-`----------*/
-
-#ifdef YYPARSE_PARAM
-# if defined (__STDC__) || defined (__cplusplus)
-UNIV_INTERN int yyparse (void *YYPARSE_PARAM)
-# else
-UNIV_INTERN int yyparse (YYPARSE_PARAM)
-  void *YYPARSE_PARAM;
-# endif
-#else /* ! YYPARSE_PARAM */
-#if defined (__STDC__) || defined (__cplusplus)
-int
-yyparse (void)
-#else
-int
-yyparse ()
-
-#endif
-#endif
-{
-  
-  register int yystate;
-  register int yyn;
-  int yyresult;
-  /* Number of tokens to shift before error messages enabled.  */
-  int yyerrstatus;
-  /* Look-ahead token as an internal (translated) token number.  */
-  int yytoken = 0;
-
-  /* Three stacks and their tools:
-     `yyss': related to states,
-     `yyvs': related to semantic values,
-     `yyls': related to locations.
-
-     Refer to the stacks thru separate pointers, to allow yyoverflow
-     to reallocate them elsewhere.  */
-
-  /* The state stack.  */
-  short int yyssa[YYINITDEPTH];
-  short int *yyss = yyssa;
-  register short int *yyssp;
-
-  /* The semantic value stack.  */
-  YYSTYPE yyvsa[YYINITDEPTH];
-  YYSTYPE *yyvs = yyvsa;
-  register YYSTYPE *yyvsp;
-
-
-
-#define YYPOPSTACK   (yyvsp--, yyssp--)
-
-  YYSIZE_T yystacksize = YYINITDEPTH;
-
-  /* The variables used to return semantic value and location from the
-     action routines.  */
-  YYSTYPE yyval;
-
-
-  /* When reducing, the number of symbols on the RHS of the reduced
-     rule.  */
-  int yylen;
-
-  YYDPRINTF ((stderr, "Starting parse\n"));
-
-  yystate = 0;
-  yyerrstatus = 0;
-  yynerrs = 0;
-  yychar = YYEMPTY;		/* Cause a token to be read.  */
-
-  /* Initialize stack pointers.
-     Waste one element of value and location stack
-     so that they stay on the same level as the state stack.
-     The wasted elements are never initialized.  */
-
-  yyssp = yyss;
-  yyvsp = yyvs;
-
-
-  yyvsp[0] = yylval;
-
-  goto yysetstate;
-
-/*------------------------------------------------------------.
-| yynewstate -- Push a new state, which is found in yystate.  |
-`------------------------------------------------------------*/
- yynewstate:
-  /* In all cases, when you get here, the value and location stacks
-     have just been pushed. so pushing a state here evens the stacks.
-     */
-  yyssp++;
-
- yysetstate:
-  *yyssp = yystate;
-
-  if (yyss + yystacksize - 1 <= yyssp)
-    {
-      /* Get the current used size of the three stacks, in elements.  */
-      YYSIZE_T yysize = yyssp - yyss + 1;
-
-#ifdef yyoverflow
-      {
-	/* Give user a chance to reallocate the stack. Use copies of
-	   these so that the &'s don't force the real ones into
-	   memory.  */
-	YYSTYPE *yyvs1 = yyvs;
-	short int *yyss1 = yyss;
-
-
-	/* Each stack pointer address is followed by the size of the
-	   data in use in that stack, in bytes.  This used to be a
-	   conditional around just the two extra args, but that might
-	   be undefined if yyoverflow is a macro.  */
-	yyoverflow ("parser stack overflow",
-		    &yyss1, yysize * sizeof (*yyssp),
-		    &yyvs1, yysize * sizeof (*yyvsp),
-
-		    &yystacksize);
-
-	yyss = yyss1;
-	yyvs = yyvs1;
-      }
-#else /* no yyoverflow */
-# ifndef YYSTACK_RELOCATE
-      goto yyoverflowlab;
-# else
-      /* Extend the stack our own way.  */
-      if (YYMAXDEPTH <= yystacksize)
-	goto yyoverflowlab;
-      yystacksize *= 2;
-      if (YYMAXDEPTH < yystacksize)
-	yystacksize = YYMAXDEPTH;
-
-      {
-	short int *yyss1 = yyss;
-	union yyalloc *yyptr =
-	  (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
-	if (! yyptr)
-	  goto yyoverflowlab;
-	YYSTACK_RELOCATE (yyss);
-	YYSTACK_RELOCATE (yyvs);
-
-#  undef YYSTACK_RELOCATE
-	if (yyss1 != yyssa)
-	  YYSTACK_FREE (yyss1);
-      }
-# endif
-#endif /* no yyoverflow */
-
-      yyssp = yyss + yysize - 1;
-      yyvsp = yyvs + yysize - 1;
-
-
-      YYDPRINTF ((stderr, "Stack size increased to %lu\n",
-		  (unsigned long int) yystacksize));
-
-      if (yyss + yystacksize - 1 <= yyssp)
-	YYABORT;
-    }
-
-  YYDPRINTF ((stderr, "Entering state %d\n", yystate));
-
-  goto yybackup;
-
-/*-----------.
-| yybackup.  |
-`-----------*/
-yybackup:
-
-/* Do appropriate processing given the current state.  */
-/* Read a look-ahead token if we need one and don't already have one.  */
-/* yyresume: */
-
-  /* First try to decide what to do without reference to look-ahead token.  */
-
-  yyn = yypact[yystate];
-  if (yyn == YYPACT_NINF)
-    goto yydefault;
-
-  /* Not known => get a look-ahead token if don't already have one.  */
-
-  /* YYCHAR is either YYEMPTY or YYEOF or a valid look-ahead symbol.  */
-  if (yychar == YYEMPTY)
-    {
-      YYDPRINTF ((stderr, "Reading a token: "));
-      yychar = YYLEX;
-    }
-
-  if (yychar <= YYEOF)
-    {
-      yychar = yytoken = YYEOF;
-      YYDPRINTF ((stderr, "Now at end of input.\n"));
-    }
-  else
-    {
-      yytoken = YYTRANSLATE (yychar);
-      YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
-    }
-
-  /* If the proper action on seeing token YYTOKEN is to reduce or to
-     detect an error, take that action.  */
-  yyn += yytoken;
-  if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
-    goto yydefault;
-  yyn = yytable[yyn];
-  if (yyn <= 0)
-    {
-      if (yyn == 0 || yyn == YYTABLE_NINF)
-	goto yyerrlab;
-      yyn = -yyn;
-      goto yyreduce;
-    }
-
-  if (yyn == YYFINAL)
-    YYACCEPT;
-
-  /* Shift the look-ahead token.  */
-  YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
-
-  /* Discard the token being shifted unless it is eof.  */
-  if (yychar != YYEOF)
-    yychar = YYEMPTY;
-
-  *++yyvsp = yylval;
-
-
-  /* Count tokens shifted since error; after three, turn off error
-     status.  */
-  if (yyerrstatus)
-    yyerrstatus--;
-
-  yystate = yyn;
-  goto yynewstate;
-
-
-/*-----------------------------------------------------------.
-| yydefault -- do the default action for the current state.  |
-`-----------------------------------------------------------*/
-yydefault:
-  yyn = yydefact[yystate];
-  if (yyn == 0)
-    goto yyerrlab;
-  goto yyreduce;
-
-
-/*-----------------------------.
-| yyreduce -- Do a reduction.  |
-`-----------------------------*/
-yyreduce:
-  /* yyn is the number of a rule to reduce with.  */
-  yylen = yyr2[yyn];
-
-  /* If YYLEN is nonzero, implement the default value of the action:
-     `$$ = $1'.
-
-     Otherwise, the following line sets YYVAL to garbage.
-     This behavior is undocumented and Bison
-     users should not rely upon it.  Assigning to YYVAL
-     unconditionally makes the parser a bit smaller, and it avoids a
-     GCC warning that YYVAL may be used uninitialized.  */
-  yyval = yyvsp[1-yylen];
-
-
-  YY_REDUCE_PRINT (yyn);
-  switch (yyn)
-    {
-        case 25:
-#line 166 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 26:
-#line 168 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-1]), (yyvsp[0])); ;}
-    break;
-
-  case 27:
-#line 172 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 28:
-#line 174 "pars0grm.y"
-    { (yyval) = pars_func((yyvsp[-3]), (yyvsp[-1])); ;}
-    break;
-
-  case 29:
-#line 175 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 30:
-#line 176 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 31:
-#line 177 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 32:
-#line 178 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 33:
-#line 179 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 34:
-#line 180 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 35:
-#line 181 "pars0grm.y"
-    { (yyval) = (yyvsp[0]);;}
-    break;
-
-  case 36:
-#line 182 "pars0grm.y"
-    { (yyval) = pars_op('+', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 37:
-#line 183 "pars0grm.y"
-    { (yyval) = pars_op('-', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 38:
-#line 184 "pars0grm.y"
-    { (yyval) = pars_op('*', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 39:
-#line 185 "pars0grm.y"
-    { (yyval) = pars_op('/', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 40:
-#line 186 "pars0grm.y"
-    { (yyval) = pars_op('-', (yyvsp[0]), NULL); ;}
-    break;
-
-  case 41:
-#line 187 "pars0grm.y"
-    { (yyval) = (yyvsp[-1]); ;}
-    break;
-
-  case 42:
-#line 188 "pars0grm.y"
-    { (yyval) = pars_op('=', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 43:
-#line 189 "pars0grm.y"
-    { (yyval) = pars_op('<', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 44:
-#line 190 "pars0grm.y"
-    { (yyval) = pars_op('>', (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 45:
-#line 191 "pars0grm.y"
-    { (yyval) = pars_op(PARS_GE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 46:
-#line 192 "pars0grm.y"
-    { (yyval) = pars_op(PARS_LE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 47:
-#line 193 "pars0grm.y"
-    { (yyval) = pars_op(PARS_NE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 48:
-#line 194 "pars0grm.y"
-    { (yyval) = pars_op(PARS_AND_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 49:
-#line 195 "pars0grm.y"
-    { (yyval) = pars_op(PARS_OR_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 50:
-#line 196 "pars0grm.y"
-    { (yyval) = pars_op(PARS_NOT_TOKEN, (yyvsp[0]), NULL); ;}
-    break;
-
-  case 51:
-#line 198 "pars0grm.y"
-    { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[-2]), NULL); ;}
-    break;
-
-  case 52:
-#line 200 "pars0grm.y"
-    { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[-2]), NULL); ;}
-    break;
-
-  case 53:
-#line 204 "pars0grm.y"
-    { (yyval) = &pars_to_char_token; ;}
-    break;
-
-  case 54:
-#line 205 "pars0grm.y"
-    { (yyval) = &pars_to_number_token; ;}
-    break;
-
-  case 55:
-#line 206 "pars0grm.y"
-    { (yyval) = &pars_to_binary_token; ;}
-    break;
-
-  case 56:
-#line 208 "pars0grm.y"
-    { (yyval) = &pars_binary_to_number_token; ;}
-    break;
-
-  case 57:
-#line 209 "pars0grm.y"
-    { (yyval) = &pars_substr_token; ;}
-    break;
-
-  case 58:
-#line 210 "pars0grm.y"
-    { (yyval) = &pars_concat_token; ;}
-    break;
-
-  case 59:
-#line 211 "pars0grm.y"
-    { (yyval) = &pars_instr_token; ;}
-    break;
-
-  case 60:
-#line 212 "pars0grm.y"
-    { (yyval) = &pars_length_token; ;}
-    break;
-
-  case 61:
-#line 213 "pars0grm.y"
-    { (yyval) = &pars_sysdate_token; ;}
-    break;
-
-  case 62:
-#line 214 "pars0grm.y"
-    { (yyval) = &pars_rnd_token; ;}
-    break;
-
-  case 63:
-#line 215 "pars0grm.y"
-    { (yyval) = &pars_rnd_str_token; ;}
-    break;
-
-  case 67:
-#line 226 "pars0grm.y"
-    { (yyval) = pars_stored_procedure_call((yyvsp[-4])); ;}
-    break;
-
-  case 68:
-#line 231 "pars0grm.y"
-    { (yyval) = pars_procedure_call((yyvsp[-3]), (yyvsp[-1])); ;}
-    break;
-
-  case 69:
-#line 235 "pars0grm.y"
-    { (yyval) = &pars_replstr_token; ;}
-    break;
-
-  case 70:
-#line 236 "pars0grm.y"
-    { (yyval) = &pars_printf_token; ;}
-    break;
-
-  case 71:
-#line 237 "pars0grm.y"
-    { (yyval) = &pars_assert_token; ;}
-    break;
-
-  case 72:
-#line 241 "pars0grm.y"
-    { (yyval) = (yyvsp[-2]); ;}
-    break;
-
-  case 73:
-#line 245 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 74:
-#line 247 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 75:
-#line 251 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 76:
-#line 252 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 77:
-#line 254 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 78:
-#line 258 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 79:
-#line 259 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0]));;}
-    break;
-
-  case 80:
-#line 260 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 81:
-#line 264 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 82:
-#line 266 "pars0grm.y"
-    { (yyval) = pars_func(&pars_count_token,
-				          que_node_list_add_last(NULL,
-					    sym_tab_add_int_lit(
-						pars_sym_tab_global, 1))); ;}
-    break;
-
-  case 83:
-#line 271 "pars0grm.y"
-    { (yyval) = pars_func(&pars_count_token,
-					    que_node_list_add_last(NULL,
-						pars_func(&pars_distinct_token,
-						     que_node_list_add_last(
-								NULL, (yyvsp[-1]))))); ;}
-    break;
-
-  case 84:
-#line 277 "pars0grm.y"
-    { (yyval) = pars_func(&pars_sum_token,
-						que_node_list_add_last(NULL,
-									(yyvsp[-1]))); ;}
-    break;
-
-  case 85:
-#line 283 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 86:
-#line 284 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 87:
-#line 286 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 88:
-#line 290 "pars0grm.y"
-    { (yyval) = pars_select_list(&pars_star_denoter,
-								NULL); ;}
-    break;
-
-  case 89:
-#line 293 "pars0grm.y"
-    { (yyval) = pars_select_list((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 90:
-#line 294 "pars0grm.y"
-    { (yyval) = pars_select_list((yyvsp[0]), NULL); ;}
-    break;
-
-  case 91:
-#line 298 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 92:
-#line 299 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 93:
-#line 303 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 94:
-#line 305 "pars0grm.y"
-    { (yyval) = &pars_update_token; ;}
-    break;
-
-  case 95:
-#line 309 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 96:
-#line 311 "pars0grm.y"
-    { yyval = &pars_share_token; ;}
-    break;
-
-  case 97:
-#line 315 "pars0grm.y"
-    { (yyval) = &pars_asc_token; ;}
-    break;
-
-  case 98:
-#line 316 "pars0grm.y"
-    { (yyval) = &pars_asc_token; ;}
-    break;
-
-  case 99:
-#line 317 "pars0grm.y"
-    { (yyval) = &pars_desc_token; ;}
-    break;
-
-  case 100:
-#line 321 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 101:
-#line 323 "pars0grm.y"
-    { (yyval) = pars_order_by((yyvsp[-1]), (yyvsp[0])); ;}
-    break;
-
-  case 102:
-#line 332 "pars0grm.y"
-    { (yyval) = pars_select_statement((yyvsp[-6]), (yyvsp[-4]), (yyvsp[-3]),
-								(yyvsp[-2]), (yyvsp[-1]), (yyvsp[0])); ;}
-    break;
-
-  case 103:
-#line 338 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 104:
-#line 343 "pars0grm.y"
-    { (yyval) = pars_insert_statement((yyvsp[-4]), (yyvsp[-1]), NULL); ;}
-    break;
-
-  case 105:
-#line 345 "pars0grm.y"
-    { (yyval) = pars_insert_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
-    break;
-
-  case 106:
-#line 349 "pars0grm.y"
-    { (yyval) = pars_column_assignment((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 107:
-#line 353 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 108:
-#line 355 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 109:
-#line 361 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 110:
-#line 367 "pars0grm.y"
-    { (yyval) = pars_update_statement_start(FALSE,
-								(yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 111:
-#line 373 "pars0grm.y"
-    { (yyval) = pars_update_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
-    break;
-
-  case 112:
-#line 378 "pars0grm.y"
-    { (yyval) = pars_update_statement((yyvsp[-1]), (yyvsp[0]), NULL); ;}
-    break;
-
-  case 113:
-#line 383 "pars0grm.y"
-    { (yyval) = pars_update_statement_start(TRUE,
-								(yyvsp[0]), NULL); ;}
-    break;
-
-  case 114:
-#line 389 "pars0grm.y"
-    { (yyval) = pars_update_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
-    break;
-
-  case 115:
-#line 394 "pars0grm.y"
-    { (yyval) = pars_update_statement((yyvsp[-1]), (yyvsp[0]), NULL); ;}
-    break;
-
-  case 116:
-#line 399 "pars0grm.y"
-    { (yyval) = pars_row_printf_statement((yyvsp[0])); ;}
-    break;
-
-  case 117:
-#line 404 "pars0grm.y"
-    { (yyval) = pars_assignment_statement((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 118:
-#line 410 "pars0grm.y"
-    { (yyval) = pars_elsif_element((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 119:
-#line 414 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 120:
-#line 416 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-1]), (yyvsp[0])); ;}
-    break;
-
-  case 121:
-#line 420 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 122:
-#line 422 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 123:
-#line 423 "pars0grm.y"
-    { (yyval) = (yyvsp[0]); ;}
-    break;
-
-  case 124:
-#line 430 "pars0grm.y"
-    { (yyval) = pars_if_statement((yyvsp[-5]), (yyvsp[-3]), (yyvsp[-2])); ;}
-    break;
-
-  case 125:
-#line 436 "pars0grm.y"
-    { (yyval) = pars_while_statement((yyvsp[-4]), (yyvsp[-2])); ;}
-    break;
-
-  case 126:
-#line 444 "pars0grm.y"
-    { (yyval) = pars_for_statement((yyvsp[-8]), (yyvsp[-6]), (yyvsp[-4]), (yyvsp[-2])); ;}
-    break;
-
-  case 127:
-#line 448 "pars0grm.y"
-    { (yyval) = pars_exit_statement(); ;}
-    break;
-
-  case 128:
-#line 452 "pars0grm.y"
-    { (yyval) = pars_return_statement(); ;}
-    break;
-
-  case 129:
-#line 457 "pars0grm.y"
-    { (yyval) = pars_open_statement(
-						ROW_SEL_OPEN_CURSOR, (yyvsp[0])); ;}
-    break;
-
-  case 130:
-#line 463 "pars0grm.y"
-    { (yyval) = pars_open_statement(
-						ROW_SEL_CLOSE_CURSOR, (yyvsp[0])); ;}
-    break;
-
-  case 131:
-#line 469 "pars0grm.y"
-    { (yyval) = pars_fetch_statement((yyvsp[-2]), (yyvsp[0]), NULL); ;}
-    break;
-
-  case 132:
-#line 471 "pars0grm.y"
-    { (yyval) = pars_fetch_statement((yyvsp[-2]), NULL, (yyvsp[0])); ;}
-    break;
-
-  case 133:
-#line 476 "pars0grm.y"
-    { (yyval) = pars_column_def((yyvsp[-4]), (yyvsp[-3]), (yyvsp[-2]), (yyvsp[-1]), (yyvsp[0])); ;}
-    break;
-
-  case 134:
-#line 480 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 135:
-#line 482 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 136:
-#line 486 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 137:
-#line 488 "pars0grm.y"
-    { (yyval) = (yyvsp[-1]); ;}
-    break;
-
-  case 138:
-#line 492 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 139:
-#line 494 "pars0grm.y"
-    { (yyval) = &pars_int_token;
-					/* pass any non-NULL pointer */ ;}
-    break;
-
-  case 140:
-#line 499 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 141:
-#line 501 "pars0grm.y"
-    { (yyval) = &pars_int_token;
-					/* pass any non-NULL pointer */ ;}
-    break;
-
-  case 142:
-#line 506 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 143:
-#line 508 "pars0grm.y"
-    { (yyval) = &pars_int_token;
-					/* pass any non-NULL pointer */ ;}
-    break;
-
-  case 144:
-#line 515 "pars0grm.y"
-    { (yyval) = pars_create_table((yyvsp[-4]), (yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 145:
-#line 519 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 146:
-#line 521 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 147:
-#line 525 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 148:
-#line 526 "pars0grm.y"
-    { (yyval) = &pars_unique_token; ;}
-    break;
-
-  case 149:
-#line 530 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 150:
-#line 531 "pars0grm.y"
-    { (yyval) = &pars_clustered_token; ;}
-    break;
-
-  case 151:
-#line 539 "pars0grm.y"
-    { (yyval) = pars_create_index((yyvsp[-8]), (yyvsp[-7]), (yyvsp[-5]), (yyvsp[-3]), (yyvsp[-1])); ;}
-    break;
-
-  case 152:
-#line 544 "pars0grm.y"
-    { (yyval) = pars_commit_statement(); ;}
-    break;
-
-  case 153:
-#line 549 "pars0grm.y"
-    { (yyval) = pars_rollback_statement(); ;}
-    break;
-
-  case 154:
-#line 553 "pars0grm.y"
-    { (yyval) = &pars_int_token; ;}
-    break;
-
-  case 155:
-#line 554 "pars0grm.y"
-    { (yyval) = &pars_int_token; ;}
-    break;
-
-  case 156:
-#line 555 "pars0grm.y"
-    { (yyval) = &pars_char_token; ;}
-    break;
-
-  case 157:
-#line 556 "pars0grm.y"
-    { (yyval) = &pars_binary_token; ;}
-    break;
-
-  case 158:
-#line 557 "pars0grm.y"
-    { (yyval) = &pars_blob_token; ;}
-    break;
-
-  case 159:
-#line 562 "pars0grm.y"
-    { (yyval) = pars_parameter_declaration((yyvsp[-2]),
-							PARS_INPUT, (yyvsp[0])); ;}
-    break;
-
-  case 160:
-#line 565 "pars0grm.y"
-    { (yyval) = pars_parameter_declaration((yyvsp[-2]),
-							PARS_OUTPUT, (yyvsp[0])); ;}
-    break;
-
-  case 161:
-#line 570 "pars0grm.y"
-    { (yyval) = NULL; ;}
-    break;
-
-  case 162:
-#line 571 "pars0grm.y"
-    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
-    break;
-
-  case 163:
-#line 573 "pars0grm.y"
-    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
-    break;
-
-  case 164:
-#line 578 "pars0grm.y"
-    { (yyval) = pars_variable_declaration((yyvsp[-2]), (yyvsp[-1])); ;}
-    break;
-
-  case 168:
-#line 590 "pars0grm.y"
-    { (yyval) = pars_cursor_declaration((yyvsp[-3]), (yyvsp[-1])); ;}
-    break;
-
-  case 169:
-#line 595 "pars0grm.y"
-    { (yyval) = pars_function_declaration((yyvsp[-1])); ;}
-    break;
-
-  case 175:
-#line 616 "pars0grm.y"
-    { (yyval) = pars_procedure_definition((yyvsp[-9]), (yyvsp[-7]),
-								(yyvsp[-1])); ;}
-    break;
-
-
-    }
-
-/* Line 1010 of yacc.c.  */
-#line 2345 "pars0grm.c"
-
-  yyvsp -= yylen;
-  yyssp -= yylen;
-
-
-  YY_STACK_PRINT (yyss, yyssp);
-
-  *++yyvsp = yyval;
-
-
-  /* Now `shift' the result of the reduction.  Determine what state
-     that goes to, based on the state we popped back to and the rule
-     number reduced by.  */
-
-  yyn = yyr1[yyn];
-
-  yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
-  if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
-    yystate = yytable[yystate];
-  else
-    yystate = yydefgoto[yyn - YYNTOKENS];
-
-  goto yynewstate;
-
-
-/*------------------------------------.
-| yyerrlab -- here on detecting error |
-`------------------------------------*/
-yyerrlab:
-  /* If not already recovering from an error, report this error.  */
-  if (!yyerrstatus)
-    {
-      ++yynerrs;
-#if YYERROR_VERBOSE
-      yyn = yypact[yystate];
-
-      if (YYPACT_NINF < yyn && yyn < YYLAST)
-	{
-	  YYSIZE_T yysize = 0;
-	  int yytype = YYTRANSLATE (yychar);
-	  const char* yyprefix;
-	  char *yymsg;
-	  int yyx;
-
-	  /* Start YYX at -YYN if negative to avoid negative indexes in
-	     YYCHECK.  */
-	  int yyxbegin = yyn < 0 ? -yyn : 0;
-
-	  /* Stay within bounds of both yycheck and yytname.  */
-	  int yychecklim = YYLAST - yyn;
-	  int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
-	  int yycount = 0;
-
-	  yyprefix = ", expecting ";
-	  for (yyx = yyxbegin; yyx < yyxend; ++yyx)
-	    if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
-	      {
-		yysize += yystrlen (yyprefix) + yystrlen (yytname [yyx]);
-		yycount += 1;
-		if (yycount == 5)
-		  {
-		    yysize = 0;
-		    break;
-		  }
-	      }
-	  yysize += (sizeof ("syntax error, unexpected ")
-		     + yystrlen (yytname[yytype]));
-	  yymsg = (char *) YYSTACK_ALLOC (yysize);
-	  if (yymsg != 0)
-	    {
-	      char *yyp = yystpcpy (yymsg, "syntax error, unexpected ");
-	      yyp = yystpcpy (yyp, yytname[yytype]);
-
-	      if (yycount < 5)
-		{
-		  yyprefix = ", expecting ";
-		  for (yyx = yyxbegin; yyx < yyxend; ++yyx)
-		    if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
-		      {
-			yyp = yystpcpy (yyp, yyprefix);
-			yyp = yystpcpy (yyp, yytname[yyx]);
-			yyprefix = " or ";
-		      }
-		}
-	      yyerror (yymsg);
-	      YYSTACK_FREE (yymsg);
-	    }
-	  else
-	    yyerror ("syntax error; also virtual memory exhausted");
-	}
-      else
-#endif /* YYERROR_VERBOSE */
-	yyerror ("syntax error");
-    }
-
-
-
-  if (yyerrstatus == 3)
-    {
-      /* If just tried and failed to reuse look-ahead token after an
-	 error, discard it.  */
-
-      if (yychar <= YYEOF)
-        {
-          /* If at end of input, pop the error token,
-	     then the rest of the stack, then return failure.  */
-	  if (yychar == YYEOF)
-	     for (;;)
-	       {
-
-		 YYPOPSTACK;
-		 if (yyssp == yyss)
-		   YYABORT;
-		 yydestruct ("Error: popping",
-                             yystos[*yyssp], yyvsp);
-	       }
-        }
-      else
-	{
-	  yydestruct ("Error: discarding", yytoken, &yylval);
-	  yychar = YYEMPTY;
-	}
-    }
-
-  /* Else will try to reuse look-ahead token after shifting the error
-     token.  */
-  goto yyerrlab1;
-
-
-/*---------------------------------------------------.
-| yyerrorlab -- error raised explicitly by YYERROR.  |
-`---------------------------------------------------*/
-yyerrorlab:
-
-#ifdef __GNUC__
-  /* Pacify GCC when the user code never invokes YYERROR and the label
-     yyerrorlab therefore never appears in user code.  */
-  if (0)
-     goto yyerrorlab;
-#endif
-
-yyvsp -= yylen;
-  yyssp -= yylen;
-  yystate = *yyssp;
-  goto yyerrlab1;
-
-
-/*-------------------------------------------------------------.
-| yyerrlab1 -- common code for both syntax error and YYERROR.  |
-`-------------------------------------------------------------*/
-yyerrlab1:
-  yyerrstatus = 3;	/* Each real token shifted decrements this.  */
-
-  for (;;)
-    {
-      yyn = yypact[yystate];
-      if (yyn != YYPACT_NINF)
-	{
-	  yyn += YYTERROR;
-	  if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
-	    {
-	      yyn = yytable[yyn];
-	      if (0 < yyn)
-		break;
-	    }
-	}
-
-      /* Pop the current state because it cannot handle the error token.  */
-      if (yyssp == yyss)
-	YYABORT;
-
-
-      yydestruct ("Error: popping", yystos[yystate], yyvsp);
-      YYPOPSTACK;
-      yystate = *yyssp;
-      YY_STACK_PRINT (yyss, yyssp);
-    }
-
-  if (yyn == YYFINAL)
-    YYACCEPT;
-
-  *++yyvsp = yylval;
-
-
-  /* Shift the error token. */
-  YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
-
-  yystate = yyn;
-  goto yynewstate;
-
-
-/*-------------------------------------.
-| yyacceptlab -- YYACCEPT comes here.  |
-`-------------------------------------*/
-yyacceptlab:
-  yyresult = 0;
-  goto yyreturn;
-
-/*-----------------------------------.
-| yyabortlab -- YYABORT comes here.  |
-`-----------------------------------*/
-yyabortlab:
-  yydestruct ("Error: discarding lookahead",
-              yytoken, &yylval);
-  yychar = YYEMPTY;
-  yyresult = 1;
-  goto yyreturn;
-
-#ifndef yyoverflow
-/*----------------------------------------------.
-| yyoverflowlab -- parser overflow comes here.  |
-`----------------------------------------------*/
-yyoverflowlab:
-  yyerror ("parser stack overflow");
-  yyresult = 2;
-  /* Fall through.  */
-#endif
-
-yyreturn:
-#ifndef yyoverflow
-  if (yyss != yyssa)
-    YYSTACK_FREE (yyss);
-#endif
-  return yyresult;
-}
-
-
-#line 620 "pars0grm.y"
-
-
diff --git a/storage/innobase/pars/pars0grm.cc b/storage/innobase/pars/pars0grm.cc
new file mode 100644
index 00000000000..b360f36e597
--- /dev/null
+++ b/storage/innobase/pars/pars0grm.cc
@@ -0,0 +1,3034 @@
+/* A Bison parser, made by GNU Bison 2.3.  */
+
+/* Skeleton implementation for Bison's Yacc-like parsers in C
+
+   Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+   Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+   simplifying the original so-called "semantic" parser.  */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+   infringing on user name space.  This should be done even for local
+   variables, as they might otherwise be expanded by user macros.
+   There are some unavoidable exceptions within include files to
+   define necessary library symbols; they are noted "INFRINGES ON
+   USER NAME SPACE" below.  */
+
+/* Identify Bison output.  */
+#define YYBISON 1
+
+/* Bison version.  */
+#define YYBISON_VERSION "2.3"
+
+/* Skeleton name.  */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers.  */
+#define YYPURE 0
+
+/* Using locations.  */
+#define YYLSP_NEEDED 0
+
+
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     PARS_INT_LIT = 258,
+     PARS_FLOAT_LIT = 259,
+     PARS_STR_LIT = 260,
+     PARS_FIXBINARY_LIT = 261,
+     PARS_BLOB_LIT = 262,
+     PARS_NULL_LIT = 263,
+     PARS_ID_TOKEN = 264,
+     PARS_AND_TOKEN = 265,
+     PARS_OR_TOKEN = 266,
+     PARS_NOT_TOKEN = 267,
+     PARS_GE_TOKEN = 268,
+     PARS_LE_TOKEN = 269,
+     PARS_NE_TOKEN = 270,
+     PARS_PROCEDURE_TOKEN = 271,
+     PARS_IN_TOKEN = 272,
+     PARS_OUT_TOKEN = 273,
+     PARS_BINARY_TOKEN = 274,
+     PARS_BLOB_TOKEN = 275,
+     PARS_INT_TOKEN = 276,
+     PARS_INTEGER_TOKEN = 277,
+     PARS_FLOAT_TOKEN = 278,
+     PARS_CHAR_TOKEN = 279,
+     PARS_IS_TOKEN = 280,
+     PARS_BEGIN_TOKEN = 281,
+     PARS_END_TOKEN = 282,
+     PARS_IF_TOKEN = 283,
+     PARS_THEN_TOKEN = 284,
+     PARS_ELSE_TOKEN = 285,
+     PARS_ELSIF_TOKEN = 286,
+     PARS_LOOP_TOKEN = 287,
+     PARS_WHILE_TOKEN = 288,
+     PARS_RETURN_TOKEN = 289,
+     PARS_SELECT_TOKEN = 290,
+     PARS_SUM_TOKEN = 291,
+     PARS_COUNT_TOKEN = 292,
+     PARS_DISTINCT_TOKEN = 293,
+     PARS_FROM_TOKEN = 294,
+     PARS_WHERE_TOKEN = 295,
+     PARS_FOR_TOKEN = 296,
+     PARS_DDOT_TOKEN = 297,
+     PARS_READ_TOKEN = 298,
+     PARS_ORDER_TOKEN = 299,
+     PARS_BY_TOKEN = 300,
+     PARS_ASC_TOKEN = 301,
+     PARS_DESC_TOKEN = 302,
+     PARS_INSERT_TOKEN = 303,
+     PARS_INTO_TOKEN = 304,
+     PARS_VALUES_TOKEN = 305,
+     PARS_UPDATE_TOKEN = 306,
+     PARS_SET_TOKEN = 307,
+     PARS_DELETE_TOKEN = 308,
+     PARS_CURRENT_TOKEN = 309,
+     PARS_OF_TOKEN = 310,
+     PARS_CREATE_TOKEN = 311,
+     PARS_TABLE_TOKEN = 312,
+     PARS_INDEX_TOKEN = 313,
+     PARS_UNIQUE_TOKEN = 314,
+     PARS_CLUSTERED_TOKEN = 315,
+     PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316,
+     PARS_ON_TOKEN = 317,
+     PARS_ASSIGN_TOKEN = 318,
+     PARS_DECLARE_TOKEN = 319,
+     PARS_CURSOR_TOKEN = 320,
+     PARS_SQL_TOKEN = 321,
+     PARS_OPEN_TOKEN = 322,
+     PARS_FETCH_TOKEN = 323,
+     PARS_CLOSE_TOKEN = 324,
+     PARS_NOTFOUND_TOKEN = 325,
+     PARS_TO_CHAR_TOKEN = 326,
+     PARS_TO_NUMBER_TOKEN = 327,
+     PARS_TO_BINARY_TOKEN = 328,
+     PARS_BINARY_TO_NUMBER_TOKEN = 329,
+     PARS_SUBSTR_TOKEN = 330,
+     PARS_REPLSTR_TOKEN = 331,
+     PARS_CONCAT_TOKEN = 332,
+     PARS_INSTR_TOKEN = 333,
+     PARS_LENGTH_TOKEN = 334,
+     PARS_SYSDATE_TOKEN = 335,
+     PARS_PRINTF_TOKEN = 336,
+     PARS_ASSERT_TOKEN = 337,
+     PARS_RND_TOKEN = 338,
+     PARS_RND_STR_TOKEN = 339,
+     PARS_ROW_PRINTF_TOKEN = 340,
+     PARS_COMMIT_TOKEN = 341,
+     PARS_ROLLBACK_TOKEN = 342,
+     PARS_WORK_TOKEN = 343,
+     PARS_UNSIGNED_TOKEN = 344,
+     PARS_EXIT_TOKEN = 345,
+     PARS_FUNCTION_TOKEN = 346,
+     PARS_LOCK_TOKEN = 347,
+     PARS_SHARE_TOKEN = 348,
+     PARS_MODE_TOKEN = 349,
+     PARS_LIKE_TOKEN = 350,
+     PARS_LIKE_TOKEN_EXACT = 351,
+     PARS_LIKE_TOKEN_PREFIX = 352,
+     PARS_LIKE_TOKEN_SUFFIX = 353,
+     PARS_LIKE_TOKEN_SUBSTR = 354,
+     PARS_TABLE_NAME_TOKEN = 355,
+     PARS_COMPACT_TOKEN = 356,
+     PARS_BLOCK_SIZE_TOKEN = 357,
+     PARS_BIGINT_TOKEN = 358,
+     NEG = 359
+   };
+#endif
+/* Tokens.  */
+#define PARS_INT_LIT 258
+#define PARS_FLOAT_LIT 259
+#define PARS_STR_LIT 260
+#define PARS_FIXBINARY_LIT 261
+#define PARS_BLOB_LIT 262
+#define PARS_NULL_LIT 263
+#define PARS_ID_TOKEN 264
+#define PARS_AND_TOKEN 265
+#define PARS_OR_TOKEN 266
+#define PARS_NOT_TOKEN 267
+#define PARS_GE_TOKEN 268
+#define PARS_LE_TOKEN 269
+#define PARS_NE_TOKEN 270
+#define PARS_PROCEDURE_TOKEN 271
+#define PARS_IN_TOKEN 272
+#define PARS_OUT_TOKEN 273
+#define PARS_BINARY_TOKEN 274
+#define PARS_BLOB_TOKEN 275
+#define PARS_INT_TOKEN 276
+#define PARS_INTEGER_TOKEN 277
+#define PARS_FLOAT_TOKEN 278
+#define PARS_CHAR_TOKEN 279
+#define PARS_IS_TOKEN 280
+#define PARS_BEGIN_TOKEN 281
+#define PARS_END_TOKEN 282
+#define PARS_IF_TOKEN 283
+#define PARS_THEN_TOKEN 284
+#define PARS_ELSE_TOKEN 285
+#define PARS_ELSIF_TOKEN 286
+#define PARS_LOOP_TOKEN 287
+#define PARS_WHILE_TOKEN 288
+#define PARS_RETURN_TOKEN 289
+#define PARS_SELECT_TOKEN 290
+#define PARS_SUM_TOKEN 291
+#define PARS_COUNT_TOKEN 292
+#define PARS_DISTINCT_TOKEN 293
+#define PARS_FROM_TOKEN 294
+#define PARS_WHERE_TOKEN 295
+#define PARS_FOR_TOKEN 296
+#define PARS_DDOT_TOKEN 297
+#define PARS_READ_TOKEN 298
+#define PARS_ORDER_TOKEN 299
+#define PARS_BY_TOKEN 300
+#define PARS_ASC_TOKEN 301
+#define PARS_DESC_TOKEN 302
+#define PARS_INSERT_TOKEN 303
+#define PARS_INTO_TOKEN 304
+#define PARS_VALUES_TOKEN 305
+#define PARS_UPDATE_TOKEN 306
+#define PARS_SET_TOKEN 307
+#define PARS_DELETE_TOKEN 308
+#define PARS_CURRENT_TOKEN 309
+#define PARS_OF_TOKEN 310
+#define PARS_CREATE_TOKEN 311
+#define PARS_TABLE_TOKEN 312
+#define PARS_INDEX_TOKEN 313
+#define PARS_UNIQUE_TOKEN 314
+#define PARS_CLUSTERED_TOKEN 315
+#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
+#define PARS_ON_TOKEN 317
+#define PARS_ASSIGN_TOKEN 318
+#define PARS_DECLARE_TOKEN 319
+#define PARS_CURSOR_TOKEN 320
+#define PARS_SQL_TOKEN 321
+#define PARS_OPEN_TOKEN 322
+#define PARS_FETCH_TOKEN 323
+#define PARS_CLOSE_TOKEN 324
+#define PARS_NOTFOUND_TOKEN 325
+#define PARS_TO_CHAR_TOKEN 326
+#define PARS_TO_NUMBER_TOKEN 327
+#define PARS_TO_BINARY_TOKEN 328
+#define PARS_BINARY_TO_NUMBER_TOKEN 329
+#define PARS_SUBSTR_TOKEN 330
+#define PARS_REPLSTR_TOKEN 331
+#define PARS_CONCAT_TOKEN 332
+#define PARS_INSTR_TOKEN 333
+#define PARS_LENGTH_TOKEN 334
+#define PARS_SYSDATE_TOKEN 335
+#define PARS_PRINTF_TOKEN 336
+#define PARS_ASSERT_TOKEN 337
+#define PARS_RND_TOKEN 338
+#define PARS_RND_STR_TOKEN 339
+#define PARS_ROW_PRINTF_TOKEN 340
+#define PARS_COMMIT_TOKEN 341
+#define PARS_ROLLBACK_TOKEN 342
+#define PARS_WORK_TOKEN 343
+#define PARS_UNSIGNED_TOKEN 344
+#define PARS_EXIT_TOKEN 345
+#define PARS_FUNCTION_TOKEN 346
+#define PARS_LOCK_TOKEN 347
+#define PARS_SHARE_TOKEN 348
+#define PARS_MODE_TOKEN 349
+#define PARS_LIKE_TOKEN 350
+#define PARS_LIKE_TOKEN_EXACT 351
+#define PARS_LIKE_TOKEN_PREFIX 352
+#define PARS_LIKE_TOKEN_SUFFIX 353
+#define PARS_LIKE_TOKEN_SUBSTR 354
+#define PARS_TABLE_NAME_TOKEN 355
+#define PARS_COMPACT_TOKEN 356
+#define PARS_BLOCK_SIZE_TOKEN 357
+#define PARS_BIGINT_TOKEN 358
+#define NEG 359
+
+
+
+
+/* Copy the first part of user declarations.  */
+#line 28 "pars0grm.y"
+
+/* The value of the semantic attribute is a pointer to a query tree node
+que_node_t */
+
+#include "univ.i"
+#include <math.h>				/* Can't be before univ.i */
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "que0types.h"
+#include "que0que.h"
+#include "row0sel.h"
+
+#define YYSTYPE que_node_t*
+
+/* #define __STDC__ */
+
+int
+yylex(void);
+
+
+/* Enabling traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages.  */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+/* Enabling the token table.  */
+#ifndef YYTOKEN_TABLE
+# define YYTOKEN_TABLE 0
+#endif
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+
+
+/* Copy the second part of user declarations.  */
+
+
+/* Line 216 of yacc.c.  */
+#line 334 "pars0grm.cc"
+
+#ifdef short
+# undef short
+#endif
+
+#ifdef YYTYPE_UINT8
+typedef YYTYPE_UINT8 yytype_uint8;
+#else
+typedef unsigned char yytype_uint8;
+#endif
+
+#ifdef YYTYPE_INT8
+typedef YYTYPE_INT8 yytype_int8;
+#elif (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+typedef signed char yytype_int8;
+#else
+typedef short int yytype_int8;
+#endif
+
+#ifdef YYTYPE_UINT16
+typedef YYTYPE_UINT16 yytype_uint16;
+#else
+typedef unsigned short int yytype_uint16;
+#endif
+
+#ifdef YYTYPE_INT16
+typedef YYTYPE_INT16 yytype_int16;
+#else
+typedef short int yytype_int16;
+#endif
+
+#ifndef YYSIZE_T
+# ifdef __SIZE_TYPE__
+#  define YYSIZE_T __SIZE_TYPE__
+# elif defined size_t
+#  define YYSIZE_T size_t
+# elif ! defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+#  include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYSIZE_T size_t
+# else
+#  define YYSIZE_T unsigned int
+# endif
+#endif
+
+#define YYSIZE_MAXIMUM ((YYSIZE_T) -1)
+
+#ifndef YY_
+# if defined YYENABLE_NLS && YYENABLE_NLS
+#  if ENABLE_NLS
+#   include <libintl.h> /* INFRINGES ON USER NAME SPACE */
+#   define YY_(msgid) dgettext ("bison-runtime", msgid)
+#  endif
+# endif
+# ifndef YY_
+#  define YY_(msgid) msgid
+# endif
+#endif
+
+/* Suppress unused-variable warnings by "using" E.  */
+#if ! defined lint || defined __GNUC__
+# define YYUSE(e) ((void) (e))
+#else
+# define YYUSE(e) /* empty */
+#endif
+
+/* Identity function, used to suppress warnings about constant conditions.  */
+#ifndef lint
+# define YYID(n) (n)
+#else
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static int
+YYID (int i)
+#else
+static int
+YYID (i)
+    int i;
+#endif
+{
+  return i;
+}
+#endif
+
+#if ! defined yyoverflow || YYERROR_VERBOSE
+
+/* The parser invokes alloca or malloc; define the necessary symbols.  */
+
+# ifdef YYSTACK_USE_ALLOCA
+#  if YYSTACK_USE_ALLOCA
+#   ifdef __GNUC__
+#    define YYSTACK_ALLOC __builtin_alloca
+#   elif defined __BUILTIN_VA_ARG_INCR
+#    include <alloca.h> /* INFRINGES ON USER NAME SPACE */
+#   elif defined _AIX
+#    define YYSTACK_ALLOC __alloca
+#   elif defined _MSC_VER
+#    include <malloc.h> /* INFRINGES ON USER NAME SPACE */
+#    define alloca _alloca
+#   else
+#    define YYSTACK_ALLOC alloca
+#    if ! defined _ALLOCA_H && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+#     include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+#     ifndef _STDLIB_H
+#      define _STDLIB_H 1
+#     endif
+#    endif
+#   endif
+#  endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+   /* Pacify GCC's `empty if-body' warning.  */
+#  define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0))
+#  ifndef YYSTACK_ALLOC_MAXIMUM
+    /* The OS might guarantee only one guard page at the bottom of the stack,
+       and a page size can be as small as 4096 bytes.  So we cannot safely
+       invoke alloca (N) if N exceeds 4096.  Use a slightly smaller number
+       to allow for a few compiler-allocated temporary stack slots.  */
+#   define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */
+#  endif
+# else
+#  define YYSTACK_ALLOC YYMALLOC
+#  define YYSTACK_FREE YYFREE
+#  ifndef YYSTACK_ALLOC_MAXIMUM
+#   define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM
+#  endif
+#  if (defined __cplusplus && ! defined _STDLIB_H \
+       && ! ((defined YYMALLOC || defined malloc) \
+	     && (defined YYFREE || defined free)))
+#   include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+#   ifndef _STDLIB_H
+#    define _STDLIB_H 1
+#   endif
+#  endif
+#  ifndef YYMALLOC
+#   define YYMALLOC malloc
+#   if ! defined malloc && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */
+#   endif
+#  endif
+#  ifndef YYFREE
+#   define YYFREE free
+#   if ! defined free && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+void free (void*); /* INFRINGES ON USER NAME SPACE */
+#   endif
+#  endif
+# endif
+#endif /* ! defined yyoverflow || YYERROR_VERBOSE */
+
+
+#if (! defined yyoverflow \
+     && (! defined __cplusplus \
+	 || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member.  */
+union yyalloc
+{
+  yytype_int16 yyss;
+  YYSTYPE yyvs;
+  };
+
+/* The size of the maximum gap between one aligned stack and the next.  */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large to enough to hold all stacks, each with
+   N elements.  */
+# define YYSTACK_BYTES(N) \
+     ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \
+      + YYSTACK_GAP_MAXIMUM)
+
+/* Copy COUNT objects from FROM to TO.  The source and destination do
+   not overlap.  */
+# ifndef YYCOPY
+#  if defined __GNUC__ && 1 < __GNUC__
+#   define YYCOPY(To, From, Count) \
+      __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
+#  else
+#   define YYCOPY(To, From, Count)		\
+      do					\
+	{					\
+	  YYSIZE_T yyi;				\
+	  for (yyi = 0; yyi < (Count); yyi++)	\
+	    (To)[yyi] = (From)[yyi];		\
+	}					\
+      while (YYID (0))
+#  endif
+# endif
+
+/* Relocate STACK from its old location to the new one.  The
+   local variables YYSIZE and YYSTACKSIZE give the old and new number of
+   elements in the stack, and YYPTR gives the new location of the
+   stack.  Advance YYPTR to a properly aligned location for the next
+   stack.  */
+# define YYSTACK_RELOCATE(Stack)					\
+    do									\
+      {									\
+	YYSIZE_T yynewbytes;						\
+	YYCOPY (&yyptr->Stack, Stack, yysize);				\
+	Stack = &yyptr->Stack;						\
+	yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+	yyptr += yynewbytes / sizeof (*yyptr);				\
+      }									\
+    while (YYID (0))
+
+#endif
+
+/* YYFINAL -- State number of the termination state.  */
+#define YYFINAL  5
+/* YYLAST -- Last index in YYTABLE.  */
+#define YYLAST   816
+
+/* YYNTOKENS -- Number of terminals.  */
+#define YYNTOKENS  120
+/* YYNNTS -- Number of nonterminals.  */
+#define YYNNTS  73
+/* YYNRULES -- Number of rules.  */
+#define YYNRULES  183
+/* YYNRULES -- Number of states.  */
+#define YYNSTATES  350
+
+/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX.  */
+#define YYUNDEFTOK  2
+#define YYMAXUTOK   359
+
+#define YYTRANSLATE(YYX)						\
+  ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX.  */
+static const yytype_uint8 yytranslate[] =
+{
+       0,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,   112,     2,     2,
+     114,   115,   109,   108,   117,   107,     2,   110,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,   113,
+     105,   104,   106,   116,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,   118,     2,   119,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     1,     2,     3,     4,
+       5,     6,     7,     8,     9,    10,    11,    12,    13,    14,
+      15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
+      25,    26,    27,    28,    29,    30,    31,    32,    33,    34,
+      35,    36,    37,    38,    39,    40,    41,    42,    43,    44,
+      45,    46,    47,    48,    49,    50,    51,    52,    53,    54,
+      55,    56,    57,    58,    59,    60,    61,    62,    63,    64,
+      65,    66,    67,    68,    69,    70,    71,    72,    73,    74,
+      75,    76,    77,    78,    79,    80,    81,    82,    83,    84,
+      85,    86,    87,    88,    89,    90,    91,    92,    93,    94,
+      95,    96,    97,    98,    99,   100,   101,   102,   103,   111
+};
+
+#if YYDEBUG
+/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
+   YYRHS.  */
+static const yytype_uint16 yyprhs[] =
+{
+       0,     0,     3,     6,     8,    11,    14,    17,    20,    23,
+      26,    29,    32,    35,    38,    41,    44,    47,    50,    53,
+      56,    59,    62,    65,    68,    71,    73,    76,    78,    83,
+      85,    87,    89,    91,    93,    95,    97,   101,   105,   109,
+     113,   116,   120,   124,   128,   132,   136,   140,   144,   148,
+     152,   156,   159,   163,   167,   169,   171,   173,   175,   177,
+     179,   181,   183,   185,   187,   189,   190,   192,   196,   203,
+     208,   210,   212,   214,   218,   220,   224,   225,   227,   231,
+     232,   234,   238,   240,   245,   251,   256,   257,   259,   263,
+     265,   269,   271,   272,   275,   276,   279,   280,   285,   286,
+     288,   290,   291,   296,   305,   309,   315,   318,   322,   324,
+     328,   333,   338,   341,   344,   348,   351,   354,   357,   361,
+     366,   368,   371,   372,   375,   377,   385,   392,   403,   405,
+     407,   410,   413,   418,   423,   429,   431,   435,   436,   440,
+     441,   443,   444,   447,   448,   450,   451,   453,   454,   458,
+     468,   470,   474,   475,   477,   478,   480,   491,   493,   495,
+     498,   501,   503,   505,   507,   509,   511,   513,   517,   521,
+     522,   524,   528,   532,   533,   535,   538,   545,   550,   552,
+     554,   555,   557,   560
+};
+
+/* YYRHS -- A `-1'-separated list of the rules' RHS.  */
+static const yytype_int16 yyrhs[] =
+{
+     121,     0,    -1,   192,   113,    -1,   127,    -1,   128,   113,
+      -1,   160,   113,    -1,   161,   113,    -1,   162,   113,    -1,
+     159,   113,    -1,   163,   113,    -1,   155,   113,    -1,   142,
+     113,    -1,   144,   113,    -1,   154,   113,    -1,   152,   113,
+      -1,   153,   113,    -1,   149,   113,    -1,   150,   113,    -1,
+     164,   113,    -1,   166,   113,    -1,   165,   113,    -1,   181,
+     113,    -1,   182,   113,    -1,   175,   113,    -1,   179,   113,
+      -1,   122,    -1,   123,   122,    -1,     9,    -1,   125,   114,
+     133,   115,    -1,     3,    -1,     4,    -1,     5,    -1,     6,
+      -1,     7,    -1,     8,    -1,    66,    -1,   124,   108,   124,
+      -1,   124,   107,   124,    -1,   124,   109,   124,    -1,   124,
+     110,   124,    -1,   107,   124,    -1,   114,   124,   115,    -1,
+     124,   104,   124,    -1,   124,    95,     5,    -1,   124,   105,
+     124,    -1,   124,   106,   124,    -1,   124,    13,   124,    -1,
+     124,    14,   124,    -1,   124,    15,   124,    -1,   124,    10,
+     124,    -1,   124,    11,   124,    -1,    12,   124,    -1,     9,
+     112,    70,    -1,    66,   112,    70,    -1,    71,    -1,    72,
+      -1,    73,    -1,    74,    -1,    75,    -1,    77,    -1,    78,
+      -1,    79,    -1,    80,    -1,    83,    -1,    84,    -1,    -1,
+     116,    -1,   126,   117,   116,    -1,   118,     9,   114,   126,
+     115,   119,    -1,   129,   114,   133,   115,    -1,    76,    -1,
+      81,    -1,    82,    -1,     9,   114,   115,    -1,   180,    -1,
+     131,   117,   180,    -1,    -1,     9,    -1,   132,   117,     9,
+      -1,    -1,   124,    -1,   133,   117,   124,    -1,   124,    -1,
+      37,   114,   109,   115,    -1,    37,   114,    38,     9,   115,
+      -1,    36,   114,   124,   115,    -1,    -1,   134,    -1,   135,
+     117,   134,    -1,   109,    -1,   135,    49,   132,    -1,   135,
+      -1,    -1,    40,   124,    -1,    -1,    41,    51,    -1,    -1,
+      92,    17,    93,    94,    -1,    -1,    46,    -1,    47,    -1,
+      -1,    44,    45,     9,   140,    -1,    35,   136,    39,   131,
+     137,   138,   139,   141,    -1,    48,    49,   180,    -1,   143,
+      50,   114,   133,   115,    -1,   143,   142,    -1,     9,   104,
+     124,    -1,   145,    -1,   146,   117,   145,    -1,    40,    54,
+      55,     9,    -1,    51,   180,    52,   146,    -1,   148,   137,
+      -1,   148,   147,    -1,    53,    39,   180,    -1,   151,   137,
+      -1,   151,   147,    -1,    85,   142,    -1,     9,    63,   124,
+      -1,    31,   124,    29,   123,    -1,   156,    -1,   157,   156,
+      -1,    -1,    30,   123,    -1,   157,    -1,    28,   124,    29,
+     123,   158,    27,    28,    -1,    33,   124,    32,   123,    27,
+      32,    -1,    41,     9,    17,   124,    42,   124,    32,   123,
+      27,    32,    -1,    90,    -1,    34,    -1,    67,     9,    -1,
+      69,     9,    -1,    68,     9,    49,   132,    -1,    68,     9,
+      49,   130,    -1,     9,   183,   169,   170,   171,    -1,   167,
+      -1,   168,   117,   167,    -1,    -1,   114,     3,   115,    -1,
+      -1,    89,    -1,    -1,    12,     8,    -1,    -1,    61,    -1,
+      -1,   101,    -1,    -1,   102,   104,     3,    -1,    56,    57,
+     180,   114,   168,   115,   172,   173,   174,    -1,     9,    -1,
+     176,   117,     9,    -1,    -1,    59,    -1,    -1,    60,    -1,
+      56,   177,   178,    58,     9,    62,   180,   114,   176,   115,
+      -1,     9,    -1,   100,    -1,    86,    88,    -1,    87,    88,
+      -1,    21,    -1,    22,    -1,   103,    -1,    24,    -1,    19,
+      -1,    20,    -1,     9,    17,   183,    -1,     9,    18,   183,
+      -1,    -1,   184,    -1,   185,   117,   184,    -1,     9,   183,
+     113,    -1,    -1,   186,    -1,   187,   186,    -1,    64,    65,
+       9,    25,   142,   113,    -1,    64,    91,     9,   113,    -1,
+     188,    -1,   189,    -1,    -1,   190,    -1,   191,   190,    -1,
+      16,     9,   114,   185,   115,    25,   187,   191,    26,   123,
+      27,    -1
+};
+
+/* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
+static const yytype_uint16 yyrline[] =
+{
+       0,   162,   162,   165,   166,   167,   168,   169,   170,   171,
+     172,   173,   174,   175,   176,   177,   178,   179,   180,   181,
+     182,   183,   184,   185,   186,   190,   191,   196,   197,   199,
+     200,   201,   202,   203,   204,   205,   206,   207,   208,   209,
+     210,   211,   212,   213,   215,   216,   217,   218,   219,   220,
+     221,   222,   223,   225,   230,   231,   232,   233,   235,   236,
+     237,   238,   239,   240,   241,   244,   246,   247,   251,   257,
+     262,   263,   264,   268,   272,   273,   278,   279,   280,   285,
+     286,   287,   291,   292,   297,   303,   310,   311,   312,   317,
+     319,   322,   326,   327,   331,   332,   337,   338,   343,   344,
+     345,   349,   350,   357,   372,   377,   380,   388,   394,   395,
+     400,   406,   415,   423,   431,   438,   446,   454,   460,   467,
+     473,   474,   479,   480,   482,   486,   493,   499,   509,   513,
+     517,   524,   531,   535,   543,   552,   553,   558,   559,   564,
+     565,   571,   572,   578,   579,   585,   586,   591,   592,   597,
+     608,   609,   614,   615,   619,   620,   624,   638,   639,   643,
+     648,   653,   654,   655,   656,   657,   658,   662,   667,   675,
+     676,   677,   682,   688,   690,   691,   695,   703,   709,   710,
+     713,   715,   716,   720
+};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+   First, the terminals, then, starting at YYNTOKENS, nonterminals.  */
+static const char *const yytname[] =
+{
+  "$end", "error", "$undefined", "PARS_INT_LIT", "PARS_FLOAT_LIT",
+  "PARS_STR_LIT", "PARS_FIXBINARY_LIT", "PARS_BLOB_LIT", "PARS_NULL_LIT",
+  "PARS_ID_TOKEN", "PARS_AND_TOKEN", "PARS_OR_TOKEN", "PARS_NOT_TOKEN",
+  "PARS_GE_TOKEN", "PARS_LE_TOKEN", "PARS_NE_TOKEN",
+  "PARS_PROCEDURE_TOKEN", "PARS_IN_TOKEN", "PARS_OUT_TOKEN",
+  "PARS_BINARY_TOKEN", "PARS_BLOB_TOKEN", "PARS_INT_TOKEN",
+  "PARS_INTEGER_TOKEN", "PARS_FLOAT_TOKEN", "PARS_CHAR_TOKEN",
+  "PARS_IS_TOKEN", "PARS_BEGIN_TOKEN", "PARS_END_TOKEN", "PARS_IF_TOKEN",
+  "PARS_THEN_TOKEN", "PARS_ELSE_TOKEN", "PARS_ELSIF_TOKEN",
+  "PARS_LOOP_TOKEN", "PARS_WHILE_TOKEN", "PARS_RETURN_TOKEN",
+  "PARS_SELECT_TOKEN", "PARS_SUM_TOKEN", "PARS_COUNT_TOKEN",
+  "PARS_DISTINCT_TOKEN", "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN",
+  "PARS_FOR_TOKEN", "PARS_DDOT_TOKEN", "PARS_READ_TOKEN",
+  "PARS_ORDER_TOKEN", "PARS_BY_TOKEN", "PARS_ASC_TOKEN", "PARS_DESC_TOKEN",
+  "PARS_INSERT_TOKEN", "PARS_INTO_TOKEN", "PARS_VALUES_TOKEN",
+  "PARS_UPDATE_TOKEN", "PARS_SET_TOKEN", "PARS_DELETE_TOKEN",
+  "PARS_CURRENT_TOKEN", "PARS_OF_TOKEN", "PARS_CREATE_TOKEN",
+  "PARS_TABLE_TOKEN", "PARS_INDEX_TOKEN", "PARS_UNIQUE_TOKEN",
+  "PARS_CLUSTERED_TOKEN", "PARS_DOES_NOT_FIT_IN_MEM_TOKEN",
+  "PARS_ON_TOKEN", "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN",
+  "PARS_CURSOR_TOKEN", "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN",
+  "PARS_FETCH_TOKEN", "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN",
+  "PARS_TO_CHAR_TOKEN", "PARS_TO_NUMBER_TOKEN", "PARS_TO_BINARY_TOKEN",
+  "PARS_BINARY_TO_NUMBER_TOKEN", "PARS_SUBSTR_TOKEN", "PARS_REPLSTR_TOKEN",
+  "PARS_CONCAT_TOKEN", "PARS_INSTR_TOKEN", "PARS_LENGTH_TOKEN",
+  "PARS_SYSDATE_TOKEN", "PARS_PRINTF_TOKEN", "PARS_ASSERT_TOKEN",
+  "PARS_RND_TOKEN", "PARS_RND_STR_TOKEN", "PARS_ROW_PRINTF_TOKEN",
+  "PARS_COMMIT_TOKEN", "PARS_ROLLBACK_TOKEN", "PARS_WORK_TOKEN",
+  "PARS_UNSIGNED_TOKEN", "PARS_EXIT_TOKEN", "PARS_FUNCTION_TOKEN",
+  "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN", "PARS_MODE_TOKEN",
+  "PARS_LIKE_TOKEN", "PARS_LIKE_TOKEN_EXACT", "PARS_LIKE_TOKEN_PREFIX",
+  "PARS_LIKE_TOKEN_SUFFIX", "PARS_LIKE_TOKEN_SUBSTR",
+  "PARS_TABLE_NAME_TOKEN", "PARS_COMPACT_TOKEN", "PARS_BLOCK_SIZE_TOKEN",
+  "PARS_BIGINT_TOKEN", "'='", "'<'", "'>'", "'-'", "'+'", "'*'", "'/'",
+  "NEG", "'%'", "';'", "'('", "')'", "'?'", "','", "'{'", "'}'", "$accept",
+  "top_statement", "statement", "statement_list", "exp", "function_name",
+  "question_mark_list", "stored_procedure_call",
+  "predefined_procedure_call", "predefined_procedure_name",
+  "user_function_call", "table_list", "variable_list", "exp_list",
+  "select_item", "select_item_list", "select_list", "search_condition",
+  "for_update_clause", "lock_shared_clause", "order_direction",
+  "order_by_clause", "select_statement", "insert_statement_start",
+  "insert_statement", "column_assignment", "column_assignment_list",
+  "cursor_positioned", "update_statement_start",
+  "update_statement_searched", "update_statement_positioned",
+  "delete_statement_start", "delete_statement_searched",
+  "delete_statement_positioned", "row_printf_statement",
+  "assignment_statement", "elsif_element", "elsif_list", "else_part",
+  "if_statement", "while_statement", "for_statement", "exit_statement",
+  "return_statement", "open_cursor_statement", "close_cursor_statement",
+  "fetch_statement", "column_def", "column_def_list", "opt_column_len",
+  "opt_unsigned", "opt_not_null", "not_fit_in_memory", "compact",
+  "block_size", "create_table", "column_list", "unique_def",
+  "clustered_def", "create_index", "table_name", "commit_statement",
+  "rollback_statement", "type_name", "parameter_declaration",
+  "parameter_declaration_list", "variable_declaration",
+  "variable_declaration_list", "cursor_declaration",
+  "function_declaration", "declaration", "declaration_list",
+  "procedure_definition", 0
+};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
+   token YYLEX-NUM.  */
+static const yytype_uint16 yytoknum[] =
+{
+       0,   256,   257,   258,   259,   260,   261,   262,   263,   264,
+     265,   266,   267,   268,   269,   270,   271,   272,   273,   274,
+     275,   276,   277,   278,   279,   280,   281,   282,   283,   284,
+     285,   286,   287,   288,   289,   290,   291,   292,   293,   294,
+     295,   296,   297,   298,   299,   300,   301,   302,   303,   304,
+     305,   306,   307,   308,   309,   310,   311,   312,   313,   314,
+     315,   316,   317,   318,   319,   320,   321,   322,   323,   324,
+     325,   326,   327,   328,   329,   330,   331,   332,   333,   334,
+     335,   336,   337,   338,   339,   340,   341,   342,   343,   344,
+     345,   346,   347,   348,   349,   350,   351,   352,   353,   354,
+     355,   356,   357,   358,    61,    60,    62,    45,    43,    42,
+      47,   359,    37,    59,    40,    41,    63,    44,   123,   125
+};
+# endif
+
+/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives.  */
+static const yytype_uint8 yyr1[] =
+{
+       0,   120,   121,   122,   122,   122,   122,   122,   122,   122,
+     122,   122,   122,   122,   122,   122,   122,   122,   122,   122,
+     122,   122,   122,   122,   122,   123,   123,   124,   124,   124,
+     124,   124,   124,   124,   124,   124,   124,   124,   124,   124,
+     124,   124,   124,   124,   124,   124,   124,   124,   124,   124,
+     124,   124,   124,   124,   125,   125,   125,   125,   125,   125,
+     125,   125,   125,   125,   125,   126,   126,   126,   127,   128,
+     129,   129,   129,   130,   131,   131,   132,   132,   132,   133,
+     133,   133,   134,   134,   134,   134,   135,   135,   135,   136,
+     136,   136,   137,   137,   138,   138,   139,   139,   140,   140,
+     140,   141,   141,   142,   143,   144,   144,   145,   146,   146,
+     147,   148,   149,   150,   151,   152,   153,   154,   155,   156,
+     157,   157,   158,   158,   158,   159,   160,   161,   162,   163,
+     164,   165,   166,   166,   167,   168,   168,   169,   169,   170,
+     170,   171,   171,   172,   172,   173,   173,   174,   174,   175,
+     176,   176,   177,   177,   178,   178,   179,   180,   180,   181,
+     182,   183,   183,   183,   183,   183,   183,   184,   184,   185,
+     185,   185,   186,   187,   187,   187,   188,   189,   190,   190,
+     191,   191,   191,   192
+};
+
+/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN.  */
+static const yytype_uint8 yyr2[] =
+{
+       0,     2,     2,     1,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     1,     2,     1,     4,     1,
+       1,     1,     1,     1,     1,     1,     3,     3,     3,     3,
+       2,     3,     3,     3,     3,     3,     3,     3,     3,     3,
+       3,     2,     3,     3,     1,     1,     1,     1,     1,     1,
+       1,     1,     1,     1,     1,     0,     1,     3,     6,     4,
+       1,     1,     1,     3,     1,     3,     0,     1,     3,     0,
+       1,     3,     1,     4,     5,     4,     0,     1,     3,     1,
+       3,     1,     0,     2,     0,     2,     0,     4,     0,     1,
+       1,     0,     4,     8,     3,     5,     2,     3,     1,     3,
+       4,     4,     2,     2,     3,     2,     2,     2,     3,     4,
+       1,     2,     0,     2,     1,     7,     6,    10,     1,     1,
+       2,     2,     4,     4,     5,     1,     3,     0,     3,     0,
+       1,     0,     2,     0,     1,     0,     1,     0,     3,     9,
+       1,     3,     0,     1,     0,     1,    10,     1,     1,     2,
+       2,     1,     1,     1,     1,     1,     1,     3,     3,     0,
+       1,     3,     3,     0,     1,     2,     6,     4,     1,     1,
+       0,     1,     2,    11
+};
+
+/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state
+   STATE-NUM when YYTABLE doesn't specify something else to do.  Zero
+   means the default is an error.  */
+static const yytype_uint8 yydefact[] =
+{
+       0,     0,     0,     0,     0,     1,     2,   169,     0,   170,
+       0,     0,     0,     0,     0,   165,   166,   161,   162,   164,
+     163,   167,   168,   173,   171,     0,   174,   180,     0,     0,
+     175,   178,   179,   181,     0,   172,     0,     0,     0,   182,
+       0,     0,     0,     0,     0,   129,    86,     0,     0,     0,
+       0,   152,     0,     0,     0,    70,    71,    72,     0,     0,
+       0,   128,     0,    25,     0,     3,     0,     0,     0,     0,
+       0,    92,     0,     0,    92,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,   177,     0,    29,    30,    31,    32,    33,    34,
+      27,     0,    35,    54,    55,    56,    57,    58,    59,    60,
+      61,    62,    63,    64,     0,     0,     0,     0,     0,     0,
+       0,    89,    82,    87,    91,     0,     0,     0,   157,   158,
+       0,     0,     0,   153,   154,   130,     0,   131,   117,   159,
+     160,     0,   183,    26,     4,    79,    11,     0,   106,    12,
+       0,   112,   113,    16,    17,   115,   116,    14,    15,    13,
+      10,     8,     5,     6,     7,     9,    18,    20,    19,    23,
+      24,    21,    22,     0,   118,     0,    51,     0,    40,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,    79,     0,     0,     0,    76,     0,
+       0,     0,   104,     0,   114,     0,   155,     0,    76,    65,
+      80,     0,    79,     0,    93,   176,    52,    53,    41,    49,
+      50,    46,    47,    48,   122,    43,    42,    44,    45,    37,
+      36,    38,    39,     0,     0,     0,     0,     0,    77,    90,
+      88,    92,    74,     0,     0,   108,   111,     0,     0,    77,
+     133,   132,    66,     0,    69,     0,     0,     0,     0,     0,
+     120,   124,     0,    28,     0,    85,     0,    83,     0,     0,
+       0,    94,     0,     0,     0,     0,   135,     0,     0,     0,
+       0,     0,    81,   105,   110,   123,     0,   121,     0,   126,
+      84,    78,    75,     0,    96,     0,   107,   109,   137,   143,
+       0,     0,    73,    68,    67,     0,   125,    95,     0,   101,
+       0,     0,   139,   144,   145,   136,     0,   119,     0,     0,
+     103,     0,     0,   140,   141,   146,   147,     0,     0,     0,
+       0,   138,     0,   134,     0,   149,   150,     0,    97,    98,
+     127,   142,     0,   156,     0,    99,   100,   102,   148,   151
+};
+
+/* YYDEFGOTO[NTERM-NUM].  */
+static const yytype_int16 yydefgoto[] =
+{
+      -1,     2,    63,    64,   210,   117,   253,    65,    66,    67,
+     250,   241,   239,   211,   123,   124,   125,   151,   294,   309,
+     347,   320,    68,    69,    70,   245,   246,   152,    71,    72,
+      73,    74,    75,    76,    77,    78,   260,   261,   262,    79,
+      80,    81,    82,    83,    84,    85,    86,   276,   277,   312,
+     324,   333,   314,   326,   335,    87,   337,   134,   207,    88,
+     130,    89,    90,    21,     9,    10,    26,    27,    31,    32,
+      33,    34,     3
+};
+
+/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+   STATE-NUM.  */
+#define YYPACT_NINF -179
+static const yytype_int16 yypact[] =
+{
+      24,    36,    58,   -48,   -25,  -179,  -179,    57,    31,  -179,
+     -74,    14,    14,    50,    57,  -179,  -179,  -179,  -179,  -179,
+    -179,  -179,  -179,    72,  -179,    14,  -179,     3,   -26,   -28,
+    -179,  -179,  -179,  -179,     4,  -179,    91,    95,   589,  -179,
+      80,    -6,    43,   285,   285,  -179,    19,    99,    69,    -5,
+      81,   -13,   110,   112,   114,  -179,  -179,  -179,    89,    37,
+      41,  -179,   122,  -179,   406,  -179,    25,    40,    44,    -3,
+      46,   116,    49,    51,   116,    52,    53,    54,    55,    56,
+      59,    61,    62,    70,    73,    74,    75,    76,    77,    78,
+      79,    89,  -179,   285,  -179,  -179,  -179,  -179,  -179,  -179,
+      82,   285,    83,  -179,  -179,  -179,  -179,  -179,  -179,  -179,
+    -179,  -179,  -179,  -179,   285,   285,   577,    92,   618,    94,
+      97,  -179,   706,  -179,   -33,   124,   153,    -5,  -179,  -179,
+     141,    -5,    -5,  -179,   136,  -179,   148,  -179,  -179,  -179,
+    -179,    98,  -179,  -179,  -179,   285,  -179,   101,  -179,  -179,
+     195,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,
+    -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,
+    -179,  -179,  -179,   100,   706,   135,     6,   154,    -7,   206,
+     285,   285,   285,   285,   285,   589,   218,   285,   285,   285,
+     285,   285,   285,   285,   285,   589,   285,   -27,   216,   173,
+      -5,   285,  -179,   217,  -179,   113,  -179,   171,   221,   119,
+     706,   -56,   285,   185,   706,  -179,  -179,  -179,  -179,     6,
+       6,    27,    27,   706,   345,  -179,    27,    27,    27,    35,
+      35,    -7,    -7,   -53,   467,   223,   232,   127,  -179,   126,
+    -179,   -31,  -179,   638,   151,  -179,   142,   251,   253,   150,
+    -179,   126,  -179,   -46,  -179,   285,   -45,   256,   589,   285,
+    -179,   240,   249,  -179,   245,  -179,   166,  -179,   273,   285,
+      -5,   242,   285,   285,   217,    14,  -179,   -39,   222,   170,
+     167,   179,   706,  -179,  -179,   589,   679,  -179,   268,  -179,
+    -179,  -179,  -179,   247,   207,   686,   706,  -179,   186,   243,
+     251,    -5,  -179,  -179,  -179,   589,  -179,  -179,   286,   261,
+     589,   303,   219,  -179,   224,  -179,   193,   589,   226,   272,
+    -179,   528,   205,  -179,   310,  -179,   233,   314,   230,   317,
+     302,  -179,   328,  -179,   235,  -179,  -179,   -38,  -179,     7,
+    -179,  -179,   334,  -179,   331,  -179,  -179,  -179,  -179,  -179
+};
+
+/* YYPGOTO[NTERM-NUM].  */
+static const yytype_int16 yypgoto[] =
+{
+    -179,  -179,   -63,  -178,   -41,  -179,  -179,  -179,  -179,  -179,
+    -179,  -179,   133,  -155,   143,  -179,  -179,   -68,  -179,  -179,
+    -179,  -179,   -40,  -179,  -179,    71,  -179,   269,  -179,  -179,
+    -179,  -179,  -179,  -179,  -179,  -179,    85,  -179,  -179,  -179,
+    -179,  -179,  -179,  -179,  -179,  -179,  -179,    47,  -179,  -179,
+    -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,  -179,
+    -117,  -179,  -179,   -12,   330,  -179,   321,  -179,  -179,  -179,
+     315,  -179,  -179
+};
+
+/* YYTABLE[YYPACT[STATE-NUM]].  What to do in state STATE-NUM.  If
+   positive, shift that token.  If negative, reduce the rule which
+   number is the opposite.  If zero, do what YYDEFACT says.
+   If YYTABLE_NINF, syntax error.  */
+#define YYTABLE_NINF -1
+static const yytype_uint16 yytable[] =
+{
+      22,   143,   116,   118,   128,   122,   155,   224,   184,   269,
+     202,   236,    25,    28,   204,   205,   198,   234,   138,   182,
+     183,   184,    94,    95,    96,    97,    98,    99,   100,   148,
+      38,   101,    46,    15,    16,    17,    18,    36,    19,   233,
+       1,    13,   184,    14,   132,     4,   133,   147,    11,    12,
+     184,   173,   174,   345,   346,   119,   120,   256,     5,   254,
+     176,   255,   263,    37,   255,     6,     8,    29,    29,   280,
+     283,   281,   255,   178,   179,    23,   299,   343,   300,   344,
+     285,    25,   237,   242,   199,   102,   270,    35,   186,     7,
+     103,   104,   105,   106,   107,   129,   108,   109,   110,   111,
+      40,   186,   112,   113,    41,    91,    93,    92,   126,   214,
+     187,   188,   189,   190,   191,   192,   193,    20,   127,   135,
+     131,   136,   186,   137,    46,   139,   114,   317,   121,   140,
+     186,   141,   321,   115,   190,   191,   192,   193,   144,   219,
+     220,   221,   222,   223,   192,   193,   226,   227,   228,   229,
+     230,   231,   232,   292,   145,   235,   150,   146,   122,   149,
+     243,   143,   153,   200,   154,   157,   158,   159,   160,   161,
+     201,   143,   162,   271,   163,   164,    94,    95,    96,    97,
+      98,    99,   100,   165,   316,   101,   166,   167,   168,   169,
+     170,   171,   172,   203,   175,   177,   206,   208,    94,    95,
+      96,    97,    98,    99,   100,   216,   194,   101,   196,   119,
+     120,   197,   209,   215,   282,   212,   180,   181,   286,   182,
+     183,   184,   143,   225,   217,   238,   244,   247,   214,   248,
+     249,   295,   296,   180,   181,   252,   182,   183,   184,   102,
+     257,   266,   267,   268,   103,   104,   105,   106,   107,   213,
+     108,   109,   110,   111,   143,   273,   112,   113,   143,   274,
+     275,   102,   278,   298,   279,   284,   103,   104,   105,   106,
+     107,   259,   108,   109,   110,   111,   288,   289,   112,   113,
+     114,   290,   291,   293,   301,   302,   303,   115,    94,    95,
+      96,    97,    98,    99,   100,   304,   306,   101,   307,   308,
+     311,   186,   114,   318,   313,   319,   322,   327,   323,   115,
+     187,   188,   189,   190,   191,   192,   193,   329,   186,   328,
+     331,   218,   332,   336,   338,   325,   339,   187,   188,   189,
+     190,   191,   192,   193,   340,   334,   341,   348,   265,   342,
+     349,   251,   240,   156,    24,   297,   287,   315,    30,    39,
+       0,   102,     0,     0,    42,     0,   103,   104,   105,   106,
+     107,     0,   108,   109,   110,   111,     0,     0,   112,   113,
+       0,     0,     0,    43,     0,   258,   259,     0,    44,    45,
+      46,     0,     0,     0,     0,     0,    47,     0,     0,     0,
+       0,     0,   114,    48,     0,     0,    49,     0,    50,   115,
+       0,    51,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,    52,    53,    54,    42,     0,     0,     0,     0,
+       0,    55,     0,     0,     0,     0,    56,    57,     0,     0,
+      58,    59,    60,   142,    43,    61,     0,     0,     0,    44,
+      45,    46,     0,     0,     0,     0,     0,    47,     0,     0,
+       0,     0,     0,     0,    48,     0,     0,    49,     0,    50,
+       0,     0,    51,    62,     0,     0,     0,     0,     0,     0,
+       0,     0,     0,    52,    53,    54,    42,     0,     0,     0,
+       0,     0,    55,     0,     0,     0,     0,    56,    57,     0,
+       0,    58,    59,    60,   264,    43,    61,     0,     0,     0,
+      44,    45,    46,     0,     0,     0,     0,     0,    47,     0,
+       0,     0,     0,     0,     0,    48,     0,     0,    49,     0,
+      50,     0,     0,    51,    62,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,    52,    53,    54,    42,     0,     0,
+       0,     0,     0,    55,     0,     0,     0,     0,    56,    57,
+       0,     0,    58,    59,    60,   330,    43,    61,     0,     0,
+       0,    44,    45,    46,     0,     0,     0,     0,     0,    47,
+       0,     0,     0,     0,     0,     0,    48,     0,     0,    49,
+       0,    50,     0,     0,    51,    62,     0,   180,   181,     0,
+     182,   183,   184,     0,     0,    52,    53,    54,    42,     0,
+       0,     0,     0,     0,    55,     0,   185,     0,     0,    56,
+      57,     0,     0,    58,    59,    60,     0,    43,    61,     0,
+       0,     0,    44,    45,    46,     0,     0,     0,   180,   181,
+      47,   182,   183,   184,     0,     0,     0,    48,     0,     0,
+      49,     0,    50,     0,     0,    51,    62,     0,   180,   181,
+     195,   182,   183,   184,     0,     0,    52,    53,    54,     0,
+       0,     0,     0,     0,     0,    55,     0,     0,     0,     0,
+      56,    57,   186,     0,    58,    59,    60,     0,     0,    61,
+     272,   187,   188,   189,   190,   191,   192,   193,     0,   180,
+     181,     0,   182,   183,   184,     0,   180,   181,     0,   182,
+     183,   184,     0,     0,     0,     0,     0,    62,   305,     0,
+       0,     0,     0,   186,     0,     0,   180,   181,   310,   182,
+     183,   184,   187,   188,   189,   190,   191,   192,   193,     0,
+       0,     0,     0,   186,     0,     0,     0,     0,     0,     0,
+       0,     0,   187,   188,   189,   190,   191,   192,   193,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,   186,     0,     0,     0,     0,     0,
+       0,   186,     0,   187,   188,   189,   190,   191,   192,   193,
+     187,   188,   189,   190,   191,   192,   193,     0,     0,     0,
+       0,   186,     0,     0,     0,     0,     0,     0,     0,     0,
+     187,   188,   189,   190,   191,   192,   193
+};
+
+static const yytype_int16 yycheck[] =
+{
+      12,    64,    43,    44,     9,    46,    74,   185,    15,    40,
+     127,    38,     9,    25,   131,   132,    49,   195,    58,    13,
+      14,    15,     3,     4,     5,     6,     7,     8,     9,    69,
+      26,    12,    35,    19,    20,    21,    22,    65,    24,   194,
+      16,   115,    15,   117,    57,     9,    59,    50,    17,    18,
+      15,    91,    93,    46,    47,    36,    37,   212,     0,   115,
+     101,   117,   115,    91,   117,   113,     9,    64,    64,   115,
+     115,   117,   117,   114,   115,    25,   115,   115,   117,   117,
+     258,     9,   109,   200,   117,    66,   117,   113,    95,   114,
+      71,    72,    73,    74,    75,   100,    77,    78,    79,    80,
+       9,    95,    83,    84,     9,    25,    63,   113,     9,   150,
+     104,   105,   106,   107,   108,   109,   110,   103,    49,     9,
+      39,     9,    95,     9,    35,    88,   107,   305,   109,    88,
+      95,     9,   310,   114,   107,   108,   109,   110,   113,   180,
+     181,   182,   183,   184,   109,   110,   187,   188,   189,   190,
+     191,   192,   193,   270,   114,   196,    40,   113,   199,   113,
+     201,   224,   113,    39,   113,   113,   113,   113,   113,   113,
+      17,   234,   113,   241,   113,   113,     3,     4,     5,     6,
+       7,     8,     9,   113,   301,    12,   113,   113,   113,   113,
+     113,   113,   113,    52,   112,   112,    60,    49,     3,     4,
+       5,     6,     7,     8,     9,    70,   114,    12,   114,    36,
+      37,   114,   114,   113,   255,   114,    10,    11,   259,    13,
+      14,    15,   285,     5,    70,     9,     9,   114,   269,    58,
+       9,   272,   273,    10,    11,   116,    13,    14,    15,    66,
+      55,     9,   115,   117,    71,    72,    73,    74,    75,    54,
+      77,    78,    79,    80,   317,   104,    83,    84,   321,   117,
+       9,    66,     9,   275,   114,     9,    71,    72,    73,    74,
+      75,    31,    77,    78,    79,    80,    27,    32,    83,    84,
+     107,   115,     9,    41,    62,   115,   119,   114,     3,     4,
+       5,     6,     7,     8,     9,   116,    28,    12,    51,    92,
+     114,    95,   107,    17,    61,    44,     3,   114,    89,   114,
+     104,   105,   106,   107,   108,   109,   110,    45,    95,    93,
+     115,   115,    12,     9,    94,   101,     9,   104,   105,   106,
+     107,   108,   109,   110,    32,   102,     8,     3,   115,   104,
+       9,   208,   199,    74,    14,   274,   261,   300,    27,    34,
+      -1,    66,    -1,    -1,     9,    -1,    71,    72,    73,    74,
+      75,    -1,    77,    78,    79,    80,    -1,    -1,    83,    84,
+      -1,    -1,    -1,    28,    -1,    30,    31,    -1,    33,    34,
+      35,    -1,    -1,    -1,    -1,    -1,    41,    -1,    -1,    -1,
+      -1,    -1,   107,    48,    -1,    -1,    51,    -1,    53,   114,
+      -1,    56,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    67,    68,    69,     9,    -1,    -1,    -1,    -1,
+      -1,    76,    -1,    -1,    -1,    -1,    81,    82,    -1,    -1,
+      85,    86,    87,    27,    28,    90,    -1,    -1,    -1,    33,
+      34,    35,    -1,    -1,    -1,    -1,    -1,    41,    -1,    -1,
+      -1,    -1,    -1,    -1,    48,    -1,    -1,    51,    -1,    53,
+      -1,    -1,    56,   118,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    67,    68,    69,     9,    -1,    -1,    -1,
+      -1,    -1,    76,    -1,    -1,    -1,    -1,    81,    82,    -1,
+      -1,    85,    86,    87,    27,    28,    90,    -1,    -1,    -1,
+      33,    34,    35,    -1,    -1,    -1,    -1,    -1,    41,    -1,
+      -1,    -1,    -1,    -1,    -1,    48,    -1,    -1,    51,    -1,
+      53,    -1,    -1,    56,   118,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    -1,    67,    68,    69,     9,    -1,    -1,
+      -1,    -1,    -1,    76,    -1,    -1,    -1,    -1,    81,    82,
+      -1,    -1,    85,    86,    87,    27,    28,    90,    -1,    -1,
+      -1,    33,    34,    35,    -1,    -1,    -1,    -1,    -1,    41,
+      -1,    -1,    -1,    -1,    -1,    -1,    48,    -1,    -1,    51,
+      -1,    53,    -1,    -1,    56,   118,    -1,    10,    11,    -1,
+      13,    14,    15,    -1,    -1,    67,    68,    69,     9,    -1,
+      -1,    -1,    -1,    -1,    76,    -1,    29,    -1,    -1,    81,
+      82,    -1,    -1,    85,    86,    87,    -1,    28,    90,    -1,
+      -1,    -1,    33,    34,    35,    -1,    -1,    -1,    10,    11,
+      41,    13,    14,    15,    -1,    -1,    -1,    48,    -1,    -1,
+      51,    -1,    53,    -1,    -1,    56,   118,    -1,    10,    11,
+      32,    13,    14,    15,    -1,    -1,    67,    68,    69,    -1,
+      -1,    -1,    -1,    -1,    -1,    76,    -1,    -1,    -1,    -1,
+      81,    82,    95,    -1,    85,    86,    87,    -1,    -1,    90,
+      42,   104,   105,   106,   107,   108,   109,   110,    -1,    10,
+      11,    -1,    13,    14,    15,    -1,    10,    11,    -1,    13,
+      14,    15,    -1,    -1,    -1,    -1,    -1,   118,    29,    -1,
+      -1,    -1,    -1,    95,    -1,    -1,    10,    11,    32,    13,
+      14,    15,   104,   105,   106,   107,   108,   109,   110,    -1,
+      -1,    -1,    -1,    95,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,   104,   105,   106,   107,   108,   109,   110,    -1,
+      -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    -1,    95,    -1,    -1,    -1,    -1,    -1,
+      -1,    95,    -1,   104,   105,   106,   107,   108,   109,   110,
+     104,   105,   106,   107,   108,   109,   110,    -1,    -1,    -1,
+      -1,    95,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
+     104,   105,   106,   107,   108,   109,   110
+};
+
+/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+   symbol of state STATE-NUM.  */
+static const yytype_uint8 yystos[] =
+{
+       0,    16,   121,   192,     9,     0,   113,   114,     9,   184,
+     185,    17,    18,   115,   117,    19,    20,    21,    22,    24,
+     103,   183,   183,    25,   184,     9,   186,   187,   183,    64,
+     186,   188,   189,   190,   191,   113,    65,    91,    26,   190,
+       9,     9,     9,    28,    33,    34,    35,    41,    48,    51,
+      53,    56,    67,    68,    69,    76,    81,    82,    85,    86,
+      87,    90,   118,   122,   123,   127,   128,   129,   142,   143,
+     144,   148,   149,   150,   151,   152,   153,   154,   155,   159,
+     160,   161,   162,   163,   164,   165,   166,   175,   179,   181,
+     182,    25,   113,    63,     3,     4,     5,     6,     7,     8,
+       9,    12,    66,    71,    72,    73,    74,    75,    77,    78,
+      79,    80,    83,    84,   107,   114,   124,   125,   124,    36,
+      37,   109,   124,   134,   135,   136,     9,    49,     9,   100,
+     180,    39,    57,    59,   177,     9,     9,     9,   142,    88,
+      88,     9,    27,   122,   113,   114,   113,    50,   142,   113,
+      40,   137,   147,   113,   113,   137,   147,   113,   113,   113,
+     113,   113,   113,   113,   113,   113,   113,   113,   113,   113,
+     113,   113,   113,   142,   124,   112,   124,   112,   124,   124,
+      10,    11,    13,    14,    15,    29,    95,   104,   105,   106,
+     107,   108,   109,   110,   114,    32,   114,   114,    49,   117,
+      39,    17,   180,    52,   180,   180,    60,   178,    49,   114,
+     124,   133,   114,    54,   124,   113,    70,    70,   115,   124,
+     124,   124,   124,   124,   123,     5,   124,   124,   124,   124,
+     124,   124,   124,   133,   123,   124,    38,   109,     9,   132,
+     134,   131,   180,   124,     9,   145,   146,   114,    58,     9,
+     130,   132,   116,   126,   115,   117,   133,    55,    30,    31,
+     156,   157,   158,   115,    27,   115,     9,   115,   117,    40,
+     117,   137,    42,   104,   117,     9,   167,   168,     9,   114,
+     115,   117,   124,   115,     9,   123,   124,   156,    27,    32,
+     115,     9,   180,    41,   138,   124,   124,   145,   183,   115,
+     117,    62,   115,   119,   116,    29,    28,    51,    92,   139,
+      32,   114,   169,    61,   172,   167,   180,   123,    17,    44,
+     141,   123,     3,    89,   170,   101,   173,   114,    93,    45,
+      27,   115,    12,   171,   102,   174,     9,   176,    94,     9,
+      32,     8,   104,   115,   117,    46,    47,   140,     3,     9
+};
+
+#define yyerrok		(yyerrstatus = 0)
+#define yyclearin	(yychar = YYEMPTY)
+#define YYEMPTY		(-2)
+#define YYEOF		0
+
+#define YYACCEPT	goto yyacceptlab
+#define YYABORT		goto yyabortlab
+#define YYERROR		goto yyerrorlab
+
+
+/* Like YYERROR except do call yyerror.  This remains here temporarily
+   to ease the transition to the new meaning of YYERROR, for GCC.
+   Once GCC version 2 has supplanted version 1, this can go.  */
+
+#define YYFAIL		goto yyerrlab
+
+#define YYRECOVERING()  (!!yyerrstatus)
+
+#define YYBACKUP(Token, Value)					\
+do								\
+  if (yychar == YYEMPTY && yylen == 1)				\
+    {								\
+      yychar = (Token);						\
+      yylval = (Value);						\
+      yytoken = YYTRANSLATE (yychar);				\
+      YYPOPSTACK (1);						\
+      goto yybackup;						\
+    }								\
+  else								\
+    {								\
+      yyerror (YY_("syntax error: cannot back up")); \
+      YYERROR;							\
+    }								\
+while (YYID (0))
+
+
+#define YYTERROR	1
+#define YYERRCODE	256
+
+
+/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
+   If N is 0, then set CURRENT to the empty location which ends
+   the previous symbol: RHS[0] (always defined).  */
+
+#define YYRHSLOC(Rhs, K) ((Rhs)[K])
+#ifndef YYLLOC_DEFAULT
+# define YYLLOC_DEFAULT(Current, Rhs, N)				\
+    do									\
+      if (YYID (N))                                                    \
+	{								\
+	  (Current).first_line   = YYRHSLOC (Rhs, 1).first_line;	\
+	  (Current).first_column = YYRHSLOC (Rhs, 1).first_column;	\
+	  (Current).last_line    = YYRHSLOC (Rhs, N).last_line;		\
+	  (Current).last_column  = YYRHSLOC (Rhs, N).last_column;	\
+	}								\
+      else								\
+	{								\
+	  (Current).first_line   = (Current).last_line   =		\
+	    YYRHSLOC (Rhs, 0).last_line;				\
+	  (Current).first_column = (Current).last_column =		\
+	    YYRHSLOC (Rhs, 0).last_column;				\
+	}								\
+    while (YYID (0))
+#endif
+
+
+/* YY_LOCATION_PRINT -- Print the location on the stream.
+   This macro was not mandated originally: define only if we know
+   we won't break user code: when these are the locations we know.  */
+
+#ifndef YY_LOCATION_PRINT
+# if defined YYLTYPE_IS_TRIVIAL && YYLTYPE_IS_TRIVIAL
+#  define YY_LOCATION_PRINT(File, Loc)			\
+     fprintf (File, "%d.%d-%d.%d",			\
+	      (Loc).first_line, (Loc).first_column,	\
+	      (Loc).last_line,  (Loc).last_column)
+# else
+#  define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+# endif
+#endif
+
+
+/* YYLEX -- calling `yylex' with the right arguments.  */
+
+#ifdef YYLEX_PARAM
+# define YYLEX yylex (YYLEX_PARAM)
+#else
+# define YYLEX yylex ()
+#endif
+
+/* Enable debugging if requested.  */
+#if YYDEBUG
+
+# ifndef YYFPRINTF
+#  include <stdio.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYFPRINTF fprintf
+# endif
+
+# define YYDPRINTF(Args)			\
+do {						\
+  if (yydebug)					\
+    YYFPRINTF Args;				\
+} while (YYID (0))
+
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)			  \
+do {									  \
+  if (yydebug)								  \
+    {									  \
+      YYFPRINTF (stderr, "%s ", Title);					  \
+      yy_symbol_print (stderr,						  \
+		  Type, Value); \
+      YYFPRINTF (stderr, "\n");						  \
+    }									  \
+} while (YYID (0))
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT.  |
+`--------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_value_print (yyoutput, yytype, yyvaluep)
+    FILE *yyoutput;
+    int yytype;
+    YYSTYPE const * const yyvaluep;
+#endif
+{
+  if (!yyvaluep)
+    return;
+# ifdef YYPRINT
+  if (yytype < YYNTOKENS)
+    YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
+# else
+  YYUSE (yyoutput);
+# endif
+  switch (yytype)
+    {
+      default:
+	break;
+    }
+}
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT.  |
+`--------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_print (yyoutput, yytype, yyvaluep)
+    FILE *yyoutput;
+    int yytype;
+    YYSTYPE const * const yyvaluep;
+#endif
+{
+  if (yytype < YYNTOKENS)
+    YYFPRINTF (yyoutput, "token %s (", yytname[yytype]);
+  else
+    YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]);
+
+  yy_symbol_value_print (yyoutput, yytype, yyvaluep);
+  YYFPRINTF (yyoutput, ")");
+}
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included).                                                   |
+`------------------------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_stack_print (yytype_int16 *bottom, yytype_int16 *top)
+#else
+static void
+yy_stack_print (bottom, top)
+    yytype_int16 *bottom;
+    yytype_int16 *top;
+#endif
+{
+  YYFPRINTF (stderr, "Stack now");
+  for (; bottom <= top; ++bottom)
+    YYFPRINTF (stderr, " %d", *bottom);
+  YYFPRINTF (stderr, "\n");
+}
+
+# define YY_STACK_PRINT(Bottom, Top)				\
+do {								\
+  if (yydebug)							\
+    yy_stack_print ((Bottom), (Top));				\
+} while (YYID (0))
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced.  |
+`------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_reduce_print (YYSTYPE *yyvsp, int yyrule)
+#else
+static void
+yy_reduce_print (yyvsp, yyrule)
+    YYSTYPE *yyvsp;
+    int yyrule;
+#endif
+{
+  int yynrhs = yyr2[yyrule];
+  int yyi;
+  unsigned long int yylno = yyrline[yyrule];
+  YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n",
+	     yyrule - 1, yylno);
+  /* The symbols being reduced.  */
+  for (yyi = 0; yyi < yynrhs; yyi++)
+    {
+      fprintf (stderr, "   $%d = ", yyi + 1);
+      yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi],
+		       &(yyvsp[(yyi + 1) - (yynrhs)])
+		       		       );
+      fprintf (stderr, "\n");
+    }
+}
+
+# define YY_REDUCE_PRINT(Rule)		\
+do {					\
+  if (yydebug)				\
+    yy_reduce_print (yyvsp, Rule); \
+} while (YYID (0))
+
+/* Nonzero means print parse trace.  It is left uninitialized so that
+   multiple parsers can coexist.  */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args)
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks.  */
+#ifndef	YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+   if the built-in stack extension method is used).
+
+   Do not make this value too large; the results are undefined if
+   YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH)
+   evaluated with infinite-precision integer arithmetic.  */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
+
+
+
+#if YYERROR_VERBOSE
+
+# ifndef yystrlen
+#  if defined __GLIBC__ && defined _STRING_H
+#   define yystrlen strlen
+#  else
+/* Return the length of YYSTR.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static YYSIZE_T
+yystrlen (const char *yystr)
+#else
+static YYSIZE_T
+yystrlen (yystr)
+    const char *yystr;
+#endif
+{
+  YYSIZE_T yylen;
+  for (yylen = 0; yystr[yylen]; yylen++)
+    continue;
+  return yylen;
+}
+#  endif
+# endif
+
+# ifndef yystpcpy
+#  if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE
+#   define yystpcpy stpcpy
+#  else
+/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
+   YYDEST.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static char *
+yystpcpy (char *yydest, const char *yysrc)
+#else
+static char *
+yystpcpy (yydest, yysrc)
+    char *yydest;
+    const char *yysrc;
+#endif
+{
+  char *yyd = yydest;
+  const char *yys = yysrc;
+
+  while ((*yyd++ = *yys++) != '\0')
+    continue;
+
+  return yyd - 1;
+}
+#  endif
+# endif
+
+# ifndef yytnamerr
+/* Copy to YYRES the contents of YYSTR after stripping away unnecessary
+   quotes and backslashes, so that it's suitable for yyerror.  The
+   heuristic is that double-quoting is unnecessary unless the string
+   contains an apostrophe, a comma, or backslash (other than
+   backslash-backslash).  YYSTR is taken from yytname.  If YYRES is
+   null, do not copy; instead, return the length of what the result
+   would have been.  */
+static YYSIZE_T
+yytnamerr (char *yyres, const char *yystr)
+{
+  if (*yystr == '"')
+    {
+      YYSIZE_T yyn = 0;
+      char const *yyp = yystr;
+
+      for (;;)
+	switch (*++yyp)
+	  {
+	  case '\'':
+	  case ',':
+	    goto do_not_strip_quotes;
+
+	  case '\\':
+	    if (*++yyp != '\\')
+	      goto do_not_strip_quotes;
+	    /* Fall through.  */
+	  default:
+	    if (yyres)
+	      yyres[yyn] = *yyp;
+	    yyn++;
+	    break;
+
+	  case '"':
+	    if (yyres)
+	      yyres[yyn] = '\0';
+	    return yyn;
+	  }
+    do_not_strip_quotes: ;
+    }
+
+  if (! yyres)
+    return yystrlen (yystr);
+
+  return yystpcpy (yyres, yystr) - yyres;
+}
+# endif
+
+/* Copy into YYRESULT an error message about the unexpected token
+   YYCHAR while in state YYSTATE.  Return the number of bytes copied,
+   including the terminating null byte.  If YYRESULT is null, do not
+   copy anything; just return the number of bytes that would be
+   copied.  As a special case, return 0 if an ordinary "syntax error"
+   message will do.  Return YYSIZE_MAXIMUM if overflow occurs during
+   size calculation.  */
+static YYSIZE_T
+yysyntax_error (char *yyresult, int yystate, int yychar)
+{
+  int yyn = yypact[yystate];
+
+  if (! (YYPACT_NINF < yyn && yyn <= YYLAST))
+    return 0;
+  else
+    {
+      int yytype = YYTRANSLATE (yychar);
+      YYSIZE_T yysize0 = yytnamerr (0, yytname[yytype]);
+      YYSIZE_T yysize = yysize0;
+      YYSIZE_T yysize1;
+      int yysize_overflow = 0;
+      enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
+      char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM];
+      int yyx;
+
+# if 0
+      /* This is so xgettext sees the translatable formats that are
+	 constructed on the fly.  */
+      YY_("syntax error, unexpected %s");
+      YY_("syntax error, unexpected %s, expecting %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s or %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s");
+# endif
+      char *yyfmt;
+      char const *yyf;
+      static char const yyunexpected[] = "syntax error, unexpected %s";
+      static char const yyexpecting[] = ", expecting %s";
+      static char const yyor[] = " or %s";
+      char yyformat[sizeof yyunexpected
+		    + sizeof yyexpecting - 1
+		    + ((YYERROR_VERBOSE_ARGS_MAXIMUM - 2)
+		       * (sizeof yyor - 1))];
+      char const *yyprefix = yyexpecting;
+
+      /* Start YYX at -YYN if negative to avoid negative indexes in
+	 YYCHECK.  */
+      int yyxbegin = yyn < 0 ? -yyn : 0;
+
+      /* Stay within bounds of both yycheck and yytname.  */
+      int yychecklim = YYLAST - yyn + 1;
+      int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+      int yycount = 1;
+
+      yyarg[0] = yytname[yytype];
+      yyfmt = yystpcpy (yyformat, yyunexpected);
+
+      for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+	if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
+	  {
+	    if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
+	      {
+		yycount = 1;
+		yysize = yysize0;
+		yyformat[sizeof yyunexpected - 1] = '\0';
+		break;
+	      }
+	    yyarg[yycount++] = yytname[yyx];
+	    yysize1 = yysize + yytnamerr (0, yytname[yyx]);
+	    yysize_overflow |= (yysize1 < yysize);
+	    yysize = yysize1;
+	    yyfmt = yystpcpy (yyfmt, yyprefix);
+	    yyprefix = yyor;
+	  }
+
+      yyf = YY_(yyformat);
+      yysize1 = yysize + yystrlen (yyf);
+      yysize_overflow |= (yysize1 < yysize);
+      yysize = yysize1;
+
+      if (yysize_overflow)
+	return YYSIZE_MAXIMUM;
+
+      if (yyresult)
+	{
+	  /* Avoid sprintf, as that infringes on the user's name space.
+	     Don't have undefined behavior even if the translation
+	     produced a string with the wrong number of "%s"s.  */
+	  char *yyp = yyresult;
+	  int yyi = 0;
+	  while ((*yyp = *yyf) != '\0')
+	    {
+	      if (*yyp == '%' && yyf[1] == 's' && yyi < yycount)
+		{
+		  yyp += yytnamerr (yyp, yyarg[yyi++]);
+		  yyf += 2;
+		}
+	      else
+		{
+		  yyp++;
+		  yyf++;
+		}
+	    }
+	}
+      return yysize;
+    }
+}
+#endif /* YYERROR_VERBOSE */
+
+
+/*-----------------------------------------------.
+| Release the memory associated to this symbol.  |
+`-----------------------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
+#else
+static void
+yydestruct (yymsg, yytype, yyvaluep)
+    const char *yymsg;
+    int yytype;
+    YYSTYPE *yyvaluep;
+#endif
+{
+  YYUSE (yyvaluep);
+
+  if (!yymsg)
+    yymsg = "Deleting";
+  YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
+}
+
+
+/* Prevent warnings from -Wmissing-prototypes.  */
+
+#ifdef YYPARSE_PARAM
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void *YYPARSE_PARAM);
+#else
+int yyparse ();
+#endif
+#else /* ! YYPARSE_PARAM */
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void);
+#else
+int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
+
+
+
+/* The look-ahead symbol.  */
+int yychar;
+
+/* The semantic value of the look-ahead symbol.  */
+YYSTYPE yylval;
+
+/* Number of syntax errors so far.  */
+int yynerrs;
+
+
+
+/*----------.
+| yyparse.  |
+`----------*/
+
+#ifdef YYPARSE_PARAM
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void *YYPARSE_PARAM)
+#else
+int
+yyparse (YYPARSE_PARAM)
+    void *YYPARSE_PARAM;
+#endif
+#else /* ! YYPARSE_PARAM */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void)
+#else
+int
+yyparse ()
+
+#endif
+#endif
+{
+
+  int yystate;
+  int yyn;
+  int yyresult;
+  /* Number of tokens to shift before error messages enabled.  */
+  int yyerrstatus;
+  /* Look-ahead token as an internal (translated) token number.  */
+  int yytoken = 0;
+#if YYERROR_VERBOSE
+  /* Buffer for error messages, and its allocated size.  */
+  char yymsgbuf[128];
+  char *yymsg = yymsgbuf;
+  YYSIZE_T yymsg_alloc = sizeof yymsgbuf;
+#endif
+
+  /* Three stacks and their tools:
+     `yyss': related to states,
+     `yyvs': related to semantic values,
+     `yyls': related to locations.
+
+     Refer to the stacks thru separate pointers, to allow yyoverflow
+     to reallocate them elsewhere.  */
+
+  /* The state stack.  */
+  yytype_int16 yyssa[YYINITDEPTH];
+  yytype_int16 *yyss = yyssa;
+  yytype_int16 *yyssp;
+
+  /* The semantic value stack.  */
+  YYSTYPE yyvsa[YYINITDEPTH];
+  YYSTYPE *yyvs = yyvsa;
+  YYSTYPE *yyvsp;
+
+
+
+#define YYPOPSTACK(N)   (yyvsp -= (N), yyssp -= (N))
+
+  YYSIZE_T yystacksize = YYINITDEPTH;
+
+  /* The variables used to return semantic value and location from the
+     action routines.  */
+  YYSTYPE yyval;
+
+
+  /* The number of symbols on the RHS of the reduced rule.
+     Keep to zero when no symbol should be popped.  */
+  int yylen = 0;
+
+  YYDPRINTF ((stderr, "Starting parse\n"));
+
+  yystate = 0;
+  yyerrstatus = 0;
+  yynerrs = 0;
+  yychar = YYEMPTY;		/* Cause a token to be read.  */
+
+  /* Initialize stack pointers.
+     Waste one element of value and location stack
+     so that they stay on the same level as the state stack.
+     The wasted elements are never initialized.  */
+
+  yyssp = yyss;
+  yyvsp = yyvs;
+
+  goto yysetstate;
+
+/*------------------------------------------------------------.
+| yynewstate -- Push a new state, which is found in yystate.  |
+`------------------------------------------------------------*/
+ yynewstate:
+  /* In all cases, when you get here, the value and location stacks
+     have just been pushed.  So pushing a state here evens the stacks.  */
+  yyssp++;
+
+ yysetstate:
+  *yyssp = yystate;
+
+  if (yyss + yystacksize - 1 <= yyssp)
+    {
+      /* Get the current used size of the three stacks, in elements.  */
+      YYSIZE_T yysize = yyssp - yyss + 1;
+
+#ifdef yyoverflow
+      {
+	/* Give user a chance to reallocate the stack.  Use copies of
+	   these so that the &'s don't force the real ones into
+	   memory.  */
+	YYSTYPE *yyvs1 = yyvs;
+	yytype_int16 *yyss1 = yyss;
+
+
+	/* Each stack pointer address is followed by the size of the
+	   data in use in that stack, in bytes.  This used to be a
+	   conditional around just the two extra args, but that might
+	   be undefined if yyoverflow is a macro.  */
+	yyoverflow (YY_("memory exhausted"),
+		    &yyss1, yysize * sizeof (*yyssp),
+		    &yyvs1, yysize * sizeof (*yyvsp),
+
+		    &yystacksize);
+
+	yyss = yyss1;
+	yyvs = yyvs1;
+      }
+#else /* no yyoverflow */
+# ifndef YYSTACK_RELOCATE
+      goto yyexhaustedlab;
+# else
+      /* Extend the stack our own way.  */
+      if (YYMAXDEPTH <= yystacksize)
+	goto yyexhaustedlab;
+      yystacksize *= 2;
+      if (YYMAXDEPTH < yystacksize)
+	yystacksize = YYMAXDEPTH;
+
+      {
+	yytype_int16 *yyss1 = yyss;
+	union yyalloc *yyptr =
+	  (union yyalloc*) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+	if (! yyptr)
+	  goto yyexhaustedlab;
+	YYSTACK_RELOCATE (yyss);
+	YYSTACK_RELOCATE (yyvs);
+
+#  undef YYSTACK_RELOCATE
+	if (yyss1 != yyssa)
+	  YYSTACK_FREE (yyss1);
+      }
+# endif
+#endif /* no yyoverflow */
+
+      yyssp = yyss + yysize - 1;
+      yyvsp = yyvs + yysize - 1;
+
+
+      YYDPRINTF ((stderr, "Stack size increased to %lu\n",
+		  (unsigned long int) yystacksize));
+
+      if (yyss + yystacksize - 1 <= yyssp)
+	YYABORT;
+    }
+
+  YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+
+  goto yybackup;
+
+/*-----------.
+| yybackup.  |
+`-----------*/
+yybackup:
+
+  /* Do appropriate processing given the current state.  Read a
+     look-ahead token if we need one and don't already have one.  */
+
+  /* First try to decide what to do without reference to look-ahead token.  */
+  yyn = yypact[yystate];
+  if (yyn == YYPACT_NINF)
+    goto yydefault;
+
+  /* Not known => get a look-ahead token if don't already have one.  */
+
+  /* YYCHAR is either YYEMPTY or YYEOF or a valid look-ahead symbol.  */
+  if (yychar == YYEMPTY)
+    {
+      YYDPRINTF ((stderr, "Reading a token: "));
+      yychar = YYLEX;
+    }
+
+  if (yychar <= YYEOF)
+    {
+      yychar = yytoken = YYEOF;
+      YYDPRINTF ((stderr, "Now at end of input.\n"));
+    }
+  else
+    {
+      yytoken = YYTRANSLATE (yychar);
+      YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+    }
+
+  /* If the proper action on seeing token YYTOKEN is to reduce or to
+     detect an error, take that action.  */
+  yyn += yytoken;
+  if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+    goto yydefault;
+  yyn = yytable[yyn];
+  if (yyn <= 0)
+    {
+      if (yyn == 0 || yyn == YYTABLE_NINF)
+	goto yyerrlab;
+      yyn = -yyn;
+      goto yyreduce;
+    }
+
+  if (yyn == YYFINAL)
+    YYACCEPT;
+
+  /* Count tokens shifted since error; after three, turn off error
+     status.  */
+  if (yyerrstatus)
+    yyerrstatus--;
+
+  /* Shift the look-ahead token.  */
+  YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+
+  /* Discard the shifted token unless it is eof.  */
+  if (yychar != YYEOF)
+    yychar = YYEMPTY;
+
+  yystate = yyn;
+  *++yyvsp = yylval;
+
+  goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state.  |
+`-----------------------------------------------------------*/
+yydefault:
+  yyn = yydefact[yystate];
+  if (yyn == 0)
+    goto yyerrlab;
+  goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- Do a reduction.  |
+`-----------------------------*/
+yyreduce:
+  /* yyn is the number of a rule to reduce with.  */
+  yylen = yyr2[yyn];
+
+  /* If YYLEN is nonzero, implement the default value of the action:
+     `$$ = $1'.
+
+     Otherwise, the following line sets YYVAL to garbage.
+     This behavior is undocumented and Bison
+     users should not rely upon it.  Assigning to YYVAL
+     unconditionally makes the parser a bit smaller, and it avoids a
+     GCC warning that YYVAL may be used uninitialized.  */
+  yyval = yyvsp[1-yylen];
+
+
+  YY_REDUCE_PRINT (yyn);
+  switch (yyn)
+    {
+        case 25:
+#line 190 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 26:
+#line 192 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (2)]), (yyvsp[(2) - (2)])); ;}
+    break;
+
+  case 27:
+#line 196 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 28:
+#line 198 "pars0grm.y"
+    { (yyval) = pars_func((yyvsp[(1) - (4)]), (yyvsp[(3) - (4)])); ;}
+    break;
+
+  case 29:
+#line 199 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 30:
+#line 200 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 31:
+#line 201 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 32:
+#line 202 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 33:
+#line 203 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 34:
+#line 204 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 35:
+#line 205 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]);;}
+    break;
+
+  case 36:
+#line 206 "pars0grm.y"
+    { (yyval) = pars_op('+', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 37:
+#line 207 "pars0grm.y"
+    { (yyval) = pars_op('-', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 38:
+#line 208 "pars0grm.y"
+    { (yyval) = pars_op('*', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 39:
+#line 209 "pars0grm.y"
+    { (yyval) = pars_op('/', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 40:
+#line 210 "pars0grm.y"
+    { (yyval) = pars_op('-', (yyvsp[(2) - (2)]), NULL); ;}
+    break;
+
+  case 41:
+#line 211 "pars0grm.y"
+    { (yyval) = (yyvsp[(2) - (3)]); ;}
+    break;
+
+  case 42:
+#line 212 "pars0grm.y"
+    { (yyval) = pars_op('=', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 43:
+#line 214 "pars0grm.y"
+    { (yyval) = pars_op(PARS_LIKE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 44:
+#line 215 "pars0grm.y"
+    { (yyval) = pars_op('<', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 45:
+#line 216 "pars0grm.y"
+    { (yyval) = pars_op('>', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 46:
+#line 217 "pars0grm.y"
+    { (yyval) = pars_op(PARS_GE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 47:
+#line 218 "pars0grm.y"
+    { (yyval) = pars_op(PARS_LE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 48:
+#line 219 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 49:
+#line 220 "pars0grm.y"
+    { (yyval) = pars_op(PARS_AND_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 50:
+#line 221 "pars0grm.y"
+    { (yyval) = pars_op(PARS_OR_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 51:
+#line 222 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NOT_TOKEN, (yyvsp[(2) - (2)]), NULL); ;}
+    break;
+
+  case 52:
+#line 224 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[(1) - (3)]), NULL); ;}
+    break;
+
+  case 53:
+#line 226 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[(1) - (3)]), NULL); ;}
+    break;
+
+  case 54:
+#line 230 "pars0grm.y"
+    { (yyval) = &pars_to_char_token; ;}
+    break;
+
+  case 55:
+#line 231 "pars0grm.y"
+    { (yyval) = &pars_to_number_token; ;}
+    break;
+
+  case 56:
+#line 232 "pars0grm.y"
+    { (yyval) = &pars_to_binary_token; ;}
+    break;
+
+  case 57:
+#line 234 "pars0grm.y"
+    { (yyval) = &pars_binary_to_number_token; ;}
+    break;
+
+  case 58:
+#line 235 "pars0grm.y"
+    { (yyval) = &pars_substr_token; ;}
+    break;
+
+  case 59:
+#line 236 "pars0grm.y"
+    { (yyval) = &pars_concat_token; ;}
+    break;
+
+  case 60:
+#line 237 "pars0grm.y"
+    { (yyval) = &pars_instr_token; ;}
+    break;
+
+  case 61:
+#line 238 "pars0grm.y"
+    { (yyval) = &pars_length_token; ;}
+    break;
+
+  case 62:
+#line 239 "pars0grm.y"
+    { (yyval) = &pars_sysdate_token; ;}
+    break;
+
+  case 63:
+#line 240 "pars0grm.y"
+    { (yyval) = &pars_rnd_token; ;}
+    break;
+
+  case 64:
+#line 241 "pars0grm.y"
+    { (yyval) = &pars_rnd_str_token; ;}
+    break;
+
+  case 68:
+#line 252 "pars0grm.y"
+    { (yyval) = pars_stored_procedure_call(
+					static_cast<sym_node_t*>((yyvsp[(2) - (6)]))); ;}
+    break;
+
+  case 69:
+#line 258 "pars0grm.y"
+    { (yyval) = pars_procedure_call((yyvsp[(1) - (4)]), (yyvsp[(3) - (4)])); ;}
+    break;
+
+  case 70:
+#line 262 "pars0grm.y"
+    { (yyval) = &pars_replstr_token; ;}
+    break;
+
+  case 71:
+#line 263 "pars0grm.y"
+    { (yyval) = &pars_printf_token; ;}
+    break;
+
+  case 72:
+#line 264 "pars0grm.y"
+    { (yyval) = &pars_assert_token; ;}
+    break;
+
+  case 73:
+#line 268 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (3)]); ;}
+    break;
+
+  case 74:
+#line 272 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 75:
+#line 274 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 76:
+#line 278 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 77:
+#line 279 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 78:
+#line 281 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 79:
+#line 285 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 80:
+#line 286 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)]));;}
+    break;
+
+  case 81:
+#line 287 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 82:
+#line 291 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]); ;}
+    break;
+
+  case 83:
+#line 293 "pars0grm.y"
+    { (yyval) = pars_func(&pars_count_token,
+				          que_node_list_add_last(NULL,
+					    sym_tab_add_int_lit(
+						pars_sym_tab_global, 1))); ;}
+    break;
+
+  case 84:
+#line 298 "pars0grm.y"
+    { (yyval) = pars_func(&pars_count_token,
+					    que_node_list_add_last(NULL,
+						pars_func(&pars_distinct_token,
+						     que_node_list_add_last(
+								NULL, (yyvsp[(4) - (5)]))))); ;}
+    break;
+
+  case 85:
+#line 304 "pars0grm.y"
+    { (yyval) = pars_func(&pars_sum_token,
+						que_node_list_add_last(NULL,
+									(yyvsp[(3) - (4)]))); ;}
+    break;
+
+  case 86:
+#line 310 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 87:
+#line 311 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 88:
+#line 313 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 89:
+#line 317 "pars0grm.y"
+    { (yyval) = pars_select_list(&pars_star_denoter,
+								NULL); ;}
+    break;
+
+  case 90:
+#line 320 "pars0grm.y"
+    { (yyval) = pars_select_list(
+					(yyvsp[(1) - (3)]), static_cast<sym_node_t*>((yyvsp[(3) - (3)]))); ;}
+    break;
+
+  case 91:
+#line 322 "pars0grm.y"
+    { (yyval) = pars_select_list((yyvsp[(1) - (1)]), NULL); ;}
+    break;
+
+  case 92:
+#line 326 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 93:
+#line 327 "pars0grm.y"
+    { (yyval) = (yyvsp[(2) - (2)]); ;}
+    break;
+
+  case 94:
+#line 331 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 95:
+#line 333 "pars0grm.y"
+    { (yyval) = &pars_update_token; ;}
+    break;
+
+  case 96:
+#line 337 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 97:
+#line 339 "pars0grm.y"
+    { (yyval) = &pars_share_token; ;}
+    break;
+
+  case 98:
+#line 343 "pars0grm.y"
+    { (yyval) = &pars_asc_token; ;}
+    break;
+
+  case 99:
+#line 344 "pars0grm.y"
+    { (yyval) = &pars_asc_token; ;}
+    break;
+
+  case 100:
+#line 345 "pars0grm.y"
+    { (yyval) = &pars_desc_token; ;}
+    break;
+
+  case 101:
+#line 349 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 102:
+#line 351 "pars0grm.y"
+    { (yyval) = pars_order_by(
+					static_cast<sym_node_t*>((yyvsp[(3) - (4)])),
+					static_cast<pars_res_word_t*>((yyvsp[(4) - (4)]))); ;}
+    break;
+
+  case 103:
+#line 362 "pars0grm.y"
+    { (yyval) = pars_select_statement(
+					static_cast<sel_node_t*>((yyvsp[(2) - (8)])),
+					static_cast<sym_node_t*>((yyvsp[(4) - (8)])),
+					static_cast<que_node_t*>((yyvsp[(5) - (8)])),
+					static_cast<pars_res_word_t*>((yyvsp[(6) - (8)])),
+					static_cast<pars_res_word_t*>((yyvsp[(7) - (8)])),
+					static_cast<order_node_t*>((yyvsp[(8) - (8)]))); ;}
+    break;
+
+  case 104:
+#line 373 "pars0grm.y"
+    { (yyval) = (yyvsp[(3) - (3)]); ;}
+    break;
+
+  case 105:
+#line 378 "pars0grm.y"
+    { (yyval) = pars_insert_statement(
+					static_cast<sym_node_t*>((yyvsp[(1) - (5)])), (yyvsp[(4) - (5)]), NULL); ;}
+    break;
+
+  case 106:
+#line 381 "pars0grm.y"
+    { (yyval) = pars_insert_statement(
+					static_cast<sym_node_t*>((yyvsp[(1) - (2)])),
+					NULL,
+					static_cast<sel_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 107:
+#line 388 "pars0grm.y"
+    { (yyval) = pars_column_assignment(
+					static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+					static_cast<que_node_t*>((yyvsp[(3) - (3)]))); ;}
+    break;
+
+  case 108:
+#line 394 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 109:
+#line 396 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 110:
+#line 402 "pars0grm.y"
+    { (yyval) = (yyvsp[(4) - (4)]); ;}
+    break;
+
+  case 111:
+#line 408 "pars0grm.y"
+    { (yyval) = pars_update_statement_start(
+					FALSE,
+					static_cast<sym_node_t*>((yyvsp[(2) - (4)])),
+					static_cast<col_assign_node_t*>((yyvsp[(4) - (4)]))); ;}
+    break;
+
+  case 112:
+#line 416 "pars0grm.y"
+    { (yyval) = pars_update_statement(
+					static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+					NULL,
+					static_cast<que_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 113:
+#line 424 "pars0grm.y"
+    { (yyval) = pars_update_statement(
+					static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+					static_cast<sym_node_t*>((yyvsp[(2) - (2)])),
+					NULL); ;}
+    break;
+
+  case 114:
+#line 432 "pars0grm.y"
+    { (yyval) = pars_update_statement_start(
+					TRUE,
+					static_cast<sym_node_t*>((yyvsp[(3) - (3)])), NULL); ;}
+    break;
+
+  case 115:
+#line 439 "pars0grm.y"
+    { (yyval) = pars_update_statement(
+					static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+					NULL,
+					static_cast<que_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 116:
+#line 447 "pars0grm.y"
+    { (yyval) = pars_update_statement(
+					static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+					static_cast<sym_node_t*>((yyvsp[(2) - (2)])),
+					NULL); ;}
+    break;
+
+  case 117:
+#line 455 "pars0grm.y"
+    { (yyval) = pars_row_printf_statement(
+					static_cast<sel_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 118:
+#line 461 "pars0grm.y"
+    { (yyval) = pars_assignment_statement(
+					static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+					static_cast<que_node_t*>((yyvsp[(3) - (3)]))); ;}
+    break;
+
+  case 119:
+#line 469 "pars0grm.y"
+    { (yyval) = pars_elsif_element((yyvsp[(2) - (4)]), (yyvsp[(4) - (4)])); ;}
+    break;
+
+  case 120:
+#line 473 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 121:
+#line 475 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (2)]), (yyvsp[(2) - (2)])); ;}
+    break;
+
+  case 122:
+#line 479 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 123:
+#line 481 "pars0grm.y"
+    { (yyval) = (yyvsp[(2) - (2)]); ;}
+    break;
+
+  case 124:
+#line 482 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]); ;}
+    break;
+
+  case 125:
+#line 489 "pars0grm.y"
+    { (yyval) = pars_if_statement((yyvsp[(2) - (7)]), (yyvsp[(4) - (7)]), (yyvsp[(5) - (7)])); ;}
+    break;
+
+  case 126:
+#line 495 "pars0grm.y"
+    { (yyval) = pars_while_statement((yyvsp[(2) - (6)]), (yyvsp[(4) - (6)])); ;}
+    break;
+
+  case 127:
+#line 503 "pars0grm.y"
+    { (yyval) = pars_for_statement(
+					static_cast<sym_node_t*>((yyvsp[(2) - (10)])),
+					(yyvsp[(4) - (10)]), (yyvsp[(6) - (10)]), (yyvsp[(8) - (10)])); ;}
+    break;
+
+  case 128:
+#line 509 "pars0grm.y"
+    { (yyval) = pars_exit_statement(); ;}
+    break;
+
+  case 129:
+#line 513 "pars0grm.y"
+    { (yyval) = pars_return_statement(); ;}
+    break;
+
+  case 130:
+#line 518 "pars0grm.y"
+    { (yyval) = pars_open_statement(
+						ROW_SEL_OPEN_CURSOR,
+						static_cast<sym_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 131:
+#line 525 "pars0grm.y"
+    { (yyval) = pars_open_statement(
+						ROW_SEL_CLOSE_CURSOR,
+						static_cast<sym_node_t*>((yyvsp[(2) - (2)]))); ;}
+    break;
+
+  case 132:
+#line 532 "pars0grm.y"
+    { (yyval) = pars_fetch_statement(
+					static_cast<sym_node_t*>((yyvsp[(2) - (4)])),
+					static_cast<sym_node_t*>((yyvsp[(4) - (4)])), NULL); ;}
+    break;
+
+  case 133:
+#line 536 "pars0grm.y"
+    { (yyval) = pars_fetch_statement(
+					static_cast<sym_node_t*>((yyvsp[(2) - (4)])),
+					NULL,
+					static_cast<sym_node_t*>((yyvsp[(4) - (4)]))); ;}
+    break;
+
+  case 134:
+#line 544 "pars0grm.y"
+    { (yyval) = pars_column_def(
+					static_cast<sym_node_t*>((yyvsp[(1) - (5)])),
+					static_cast<pars_res_word_t*>((yyvsp[(2) - (5)])),
+					static_cast<sym_node_t*>((yyvsp[(3) - (5)])),
+					(yyvsp[(4) - (5)]), (yyvsp[(5) - (5)])); ;}
+    break;
+
+  case 135:
+#line 552 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 136:
+#line 554 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 137:
+#line 558 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 138:
+#line 560 "pars0grm.y"
+    { (yyval) = (yyvsp[(2) - (3)]); ;}
+    break;
+
+  case 139:
+#line 564 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 140:
+#line 566 "pars0grm.y"
+    { (yyval) = &pars_int_token;
+					/* pass any non-NULL pointer */ ;}
+    break;
+
+  case 141:
+#line 571 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 142:
+#line 573 "pars0grm.y"
+    { (yyval) = &pars_int_token;
+					/* pass any non-NULL pointer */ ;}
+    break;
+
+  case 143:
+#line 578 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 144:
+#line 580 "pars0grm.y"
+    { (yyval) = &pars_int_token;
+					/* pass any non-NULL pointer */ ;}
+    break;
+
+  case 145:
+#line 585 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 146:
+#line 586 "pars0grm.y"
+    { (yyval) = &pars_int_token;
+					/* pass any non-NULL pointer */ ;}
+    break;
+
+  case 147:
+#line 591 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 148:
+#line 593 "pars0grm.y"
+    { (yyval) = (yyvsp[(3) - (3)]); ;}
+    break;
+
+  case 149:
+#line 600 "pars0grm.y"
+    { (yyval) = pars_create_table(
+					static_cast<sym_node_t*>((yyvsp[(3) - (9)])),
+					static_cast<sym_node_t*>((yyvsp[(5) - (9)])),
+					static_cast<sym_node_t*>((yyvsp[(8) - (9)])),
+					static_cast<sym_node_t*>((yyvsp[(9) - (9)])), (yyvsp[(7) - (9)])); ;}
+    break;
+
+  case 150:
+#line 608 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 151:
+#line 610 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 152:
+#line 614 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 153:
+#line 615 "pars0grm.y"
+    { (yyval) = &pars_unique_token; ;}
+    break;
+
+  case 154:
+#line 619 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 155:
+#line 620 "pars0grm.y"
+    { (yyval) = &pars_clustered_token; ;}
+    break;
+
+  case 156:
+#line 629 "pars0grm.y"
+    { (yyval) = pars_create_index(
+					static_cast<pars_res_word_t*>((yyvsp[(2) - (10)])),
+					static_cast<pars_res_word_t*>((yyvsp[(3) - (10)])),
+					static_cast<sym_node_t*>((yyvsp[(5) - (10)])),
+					static_cast<sym_node_t*>((yyvsp[(7) - (10)])),
+					static_cast<sym_node_t*>((yyvsp[(9) - (10)]))); ;}
+    break;
+
+  case 157:
+#line 638 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]); ;}
+    break;
+
+  case 158:
+#line 639 "pars0grm.y"
+    { (yyval) = (yyvsp[(1) - (1)]); ;}
+    break;
+
+  case 159:
+#line 644 "pars0grm.y"
+    { (yyval) = pars_commit_statement(); ;}
+    break;
+
+  case 160:
+#line 649 "pars0grm.y"
+    { (yyval) = pars_rollback_statement(); ;}
+    break;
+
+  case 161:
+#line 653 "pars0grm.y"
+    { (yyval) = &pars_int_token; ;}
+    break;
+
+  case 162:
+#line 654 "pars0grm.y"
+    { (yyval) = &pars_int_token; ;}
+    break;
+
+  case 163:
+#line 655 "pars0grm.y"
+    { (yyval) = &pars_bigint_token; ;}
+    break;
+
+  case 164:
+#line 656 "pars0grm.y"
+    { (yyval) = &pars_char_token; ;}
+    break;
+
+  case 165:
+#line 657 "pars0grm.y"
+    { (yyval) = &pars_binary_token; ;}
+    break;
+
+  case 166:
+#line 658 "pars0grm.y"
+    { (yyval) = &pars_blob_token; ;}
+    break;
+
+  case 167:
+#line 663 "pars0grm.y"
+    { (yyval) = pars_parameter_declaration(
+					static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+					PARS_INPUT,
+					static_cast<pars_res_word_t*>((yyvsp[(3) - (3)]))); ;}
+    break;
+
+  case 168:
+#line 668 "pars0grm.y"
+    { (yyval) = pars_parameter_declaration(
+					static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+					PARS_OUTPUT,
+					static_cast<pars_res_word_t*>((yyvsp[(3) - (3)]))); ;}
+    break;
+
+  case 169:
+#line 675 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 170:
+#line 676 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+    break;
+
+  case 171:
+#line 678 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+    break;
+
+  case 172:
+#line 683 "pars0grm.y"
+    { (yyval) = pars_variable_declaration(
+					static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+					static_cast<pars_res_word_t*>((yyvsp[(2) - (3)]))); ;}
+    break;
+
+  case 176:
+#line 697 "pars0grm.y"
+    { (yyval) = pars_cursor_declaration(
+					static_cast<sym_node_t*>((yyvsp[(3) - (6)])),
+					static_cast<sel_node_t*>((yyvsp[(5) - (6)]))); ;}
+    break;
+
+  case 177:
+#line 704 "pars0grm.y"
+    { (yyval) = pars_function_declaration(
+					static_cast<sym_node_t*>((yyvsp[(3) - (4)]))); ;}
+    break;
+
+  case 183:
+#line 726 "pars0grm.y"
+    { (yyval) = pars_procedure_definition(
+					static_cast<sym_node_t*>((yyvsp[(2) - (11)])),
+					static_cast<sym_node_t*>((yyvsp[(4) - (11)])),
+					(yyvsp[(10) - (11)])); ;}
+    break;
+
+
+/* Line 1267 of yacc.c.  */
+#line 2826 "pars0grm.cc"
+      default: break;
+    }
+  YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
+
+  YYPOPSTACK (yylen);
+  yylen = 0;
+  YY_STACK_PRINT (yyss, yyssp);
+
+  *++yyvsp = yyval;
+
+
+  /* Now `shift' the result of the reduction.  Determine what state
+     that goes to, based on the state we popped back to and the rule
+     number reduced by.  */
+
+  yyn = yyr1[yyn];
+
+  yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
+  if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
+    yystate = yytable[yystate];
+  else
+    yystate = yydefgoto[yyn - YYNTOKENS];
+
+  goto yynewstate;
+
+
+/*------------------------------------.
+| yyerrlab -- here on detecting error |
+`------------------------------------*/
+yyerrlab:
+  /* If not already recovering from an error, report this error.  */
+  if (!yyerrstatus)
+    {
+      ++yynerrs;
+#if ! YYERROR_VERBOSE
+      yyerror (YY_("syntax error"));
+#else
+      {
+	YYSIZE_T yysize = yysyntax_error (0, yystate, yychar);
+	if (yymsg_alloc < yysize && yymsg_alloc < YYSTACK_ALLOC_MAXIMUM)
+	  {
+	    YYSIZE_T yyalloc = 2 * yysize;
+	    if (! (yysize <= yyalloc && yyalloc <= YYSTACK_ALLOC_MAXIMUM))
+	      yyalloc = YYSTACK_ALLOC_MAXIMUM;
+	    if (yymsg != yymsgbuf)
+	      YYSTACK_FREE (yymsg);
+	    yymsg = (char*) YYSTACK_ALLOC (yyalloc);
+	    if (yymsg)
+	      yymsg_alloc = yyalloc;
+	    else
+	      {
+		yymsg = yymsgbuf;
+		yymsg_alloc = sizeof yymsgbuf;
+	      }
+	  }
+
+	if (0 < yysize && yysize <= yymsg_alloc)
+	  {
+	    (void) yysyntax_error (yymsg, yystate, yychar);
+	    yyerror (yymsg);
+	  }
+	else
+	  {
+	    yyerror (YY_("syntax error"));
+	    if (yysize != 0)
+	      goto yyexhaustedlab;
+	  }
+      }
+#endif
+    }
+
+
+
+  if (yyerrstatus == 3)
+    {
+      /* If just tried and failed to reuse look-ahead token after an
+	 error, discard it.  */
+
+      if (yychar <= YYEOF)
+	{
+	  /* Return failure if at end of input.  */
+	  if (yychar == YYEOF)
+	    YYABORT;
+	}
+      else
+	{
+	  yydestruct ("Error: discarding",
+		      yytoken, &yylval);
+	  yychar = YYEMPTY;
+	}
+    }
+
+  /* Else will try to reuse look-ahead token after shifting the error
+     token.  */
+  goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR.  |
+`---------------------------------------------------*/
+yyerrorlab:
+
+  /* Pacify compilers like GCC when the user code never invokes
+     YYERROR and the label yyerrorlab therefore never appears in user
+     code.  */
+  if (/*CONSTCOND*/ 0)
+     goto yyerrorlab;
+
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYERROR.  */
+  YYPOPSTACK (yylen);
+  yylen = 0;
+  YY_STACK_PRINT (yyss, yyssp);
+  yystate = *yyssp;
+  goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR.  |
+`-------------------------------------------------------------*/
+yyerrlab1:
+  yyerrstatus = 3;	/* Each real token shifted decrements this.  */
+
+  for (;;)
+    {
+      yyn = yypact[yystate];
+      if (yyn != YYPACT_NINF)
+	{
+	  yyn += YYTERROR;
+	  if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+	    {
+	      yyn = yytable[yyn];
+	      if (0 < yyn)
+		break;
+	    }
+	}
+
+      /* Pop the current state because it cannot handle the error token.  */
+      if (yyssp == yyss)
+	YYABORT;
+
+
+      yydestruct ("Error: popping",
+		  yystos[yystate], yyvsp);
+      YYPOPSTACK (1);
+      yystate = *yyssp;
+      YY_STACK_PRINT (yyss, yyssp);
+    }
+
+  if (yyn == YYFINAL)
+    YYACCEPT;
+
+  *++yyvsp = yylval;
+
+
+  /* Shift the error token.  */
+  YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+
+  yystate = yyn;
+  goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here.  |
+`-------------------------------------*/
+yyacceptlab:
+  yyresult = 0;
+  goto yyreturn;
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here.  |
+`-----------------------------------*/
+yyabortlab:
+  yyresult = 1;
+  goto yyreturn;
+
+#ifndef yyoverflow
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here.  |
+`-------------------------------------------------*/
+yyexhaustedlab:
+  yyerror (YY_("memory exhausted"));
+  yyresult = 2;
+  /* Fall through.  */
+#endif
+
+yyreturn:
+  if (yychar != YYEOF && yychar != YYEMPTY)
+     yydestruct ("Cleanup: discarding lookahead",
+		 yytoken, &yylval);
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYABORT or YYACCEPT.  */
+  YYPOPSTACK (yylen);
+  YY_STACK_PRINT (yyss, yyssp);
+  while (yyssp != yyss)
+    {
+      yydestruct ("Cleanup: popping",
+		  yystos[*yyssp], yyvsp);
+      YYPOPSTACK (1);
+    }
+#ifndef yyoverflow
+  if (yyss != yyssa)
+    YYSTACK_FREE (yyss);
+#endif
+#if YYERROR_VERBOSE
+  if (yymsg != yymsgbuf)
+    YYSTACK_FREE (yymsg);
+#endif
+  /* Make sure YYID is used.  */
+  return YYID (yyresult);
+}
+
+
+#line 732 "pars0grm.y"
+
+
diff --git a/storage/innobase/pars/pars0grm.y b/storage/innobase/pars/pars0grm.y
index 14d64f1826f..60913287cc4 100644
--- a/storage/innobase/pars/pars0grm.y
+++ b/storage/innobase/pars/pars0grm.y
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -137,6 +137,15 @@ yylex(void);
 %token PARS_LOCK_TOKEN
 %token PARS_SHARE_TOKEN
 %token PARS_MODE_TOKEN
+%token PARS_LIKE_TOKEN
+%token PARS_LIKE_TOKEN_EXACT
+%token PARS_LIKE_TOKEN_PREFIX
+%token PARS_LIKE_TOKEN_SUFFIX
+%token PARS_LIKE_TOKEN_SUBSTR
+%token PARS_TABLE_NAME_TOKEN
+%token PARS_COMPACT_TOKEN
+%token PARS_BLOCK_SIZE_TOKEN
+%token PARS_BIGINT_TOKEN
 
 %left PARS_AND_TOKEN PARS_OR_TOKEN
 %left PARS_NOT_TOKEN
@@ -201,8 +210,10 @@ exp:
 	| '-' exp %prec NEG 	{ $$ = pars_op('-', $2, NULL); }
 	| '(' exp ')'        	{ $$ = $2; }
 	| exp '=' exp		{ $$ = pars_op('=', $1, $3); }
-	| exp '<' exp		{ $$ = pars_op('<', $1, $3); }
-	| exp '>' exp		{ $$ = pars_op('>', $1, $3); }
+	| exp PARS_LIKE_TOKEN PARS_STR_LIT
+				{ $$ = pars_op(PARS_LIKE_TOKEN, $1, $3); }
+	| exp '<' exp           { $$ = pars_op('<', $1, $3); }
+	| exp '>' exp           { $$ = pars_op('>', $1, $3); }
 	| exp PARS_GE_TOKEN exp	{ $$ = pars_op(PARS_GE_TOKEN, $1, $3); }
 	| exp PARS_LE_TOKEN exp	{ $$ = pars_op(PARS_LE_TOKEN, $1, $3); }
 	| exp PARS_NE_TOKEN exp	{ $$ = pars_op(PARS_NE_TOKEN, $1, $3); }
@@ -238,7 +249,8 @@ question_mark_list:
 
 stored_procedure_call:
 	'{' PARS_ID_TOKEN '(' question_mark_list ')' '}'
-				{ $$ = pars_stored_procedure_call($2); }
+				{ $$ = pars_stored_procedure_call(
+					static_cast<sym_node_t*>($2)); }
 ;
 
 predefined_procedure_call:
@@ -257,8 +269,8 @@ user_function_call:
 ;
 
 table_list:
-	PARS_ID_TOKEN		{ $$ = que_node_list_add_last(NULL, $1); }
-	| table_list ',' PARS_ID_TOKEN
+	table_name		{ $$ = que_node_list_add_last(NULL, $1); }
+	| table_list ',' table_name
 				{ $$ = que_node_list_add_last($1, $3); }
 ;
 
@@ -305,7 +317,8 @@ select_list:
 	'*'			{ $$ = pars_select_list(&pars_star_denoter,
 								NULL); }
 	| select_item_list PARS_INTO_TOKEN variable_list
-				{ $$ = pars_select_list($1, $3); }
+				{ $$ = pars_select_list(
+					$1, static_cast<sym_node_t*>($3)); }
 	| select_item_list	{ $$ = pars_select_list($1, NULL); }
 ;
 
@@ -335,7 +348,9 @@ order_direction:
 order_by_clause:
 	/* Nothing */		{ $$ = NULL; }
 	| PARS_ORDER_TOKEN PARS_BY_TOKEN PARS_ID_TOKEN order_direction
-				{ $$ = pars_order_by($3, $4); }
+				{ $$ = pars_order_by(
+					static_cast<sym_node_t*>($3),
+					static_cast<pars_res_word_t*>($4)); }
 ;
 
 select_statement:
@@ -344,24 +359,35 @@ select_statement:
 	search_condition
 	for_update_clause
 	lock_shared_clause
-	order_by_clause		{ $$ = pars_select_statement($2, $4, $5,
-								$6, $7, $8); }
+	order_by_clause		{ $$ = pars_select_statement(
+					static_cast<sel_node_t*>($2),
+					static_cast<sym_node_t*>($4),
+					static_cast<que_node_t*>($5),
+					static_cast<pars_res_word_t*>($6),
+					static_cast<pars_res_word_t*>($7),
+					static_cast<order_node_t*>($8)); }
 ;
 
 insert_statement_start:
 	PARS_INSERT_TOKEN PARS_INTO_TOKEN
-	PARS_ID_TOKEN		{ $$ = $3; }
+	table_name		{ $$ = $3; }
 ;
 
 insert_statement:
 	insert_statement_start PARS_VALUES_TOKEN '(' exp_list ')'
-				{ $$ = pars_insert_statement($1, $4, NULL); }
+				{ $$ = pars_insert_statement(
+					static_cast<sym_node_t*>($1), $4, NULL); }
 	| insert_statement_start select_statement
-				{ $$ = pars_insert_statement($1, NULL, $2); }
+				{ $$ = pars_insert_statement(
+					static_cast<sym_node_t*>($1),
+					NULL,
+					static_cast<sel_node_t*>($2)); }
 ;
 
 column_assignment:
-	PARS_ID_TOKEN '=' exp	{ $$ = pars_column_assignment($1, $3); }
+	PARS_ID_TOKEN '=' exp	{ $$ = pars_column_assignment(
+					static_cast<sym_node_t*>($1),
+					static_cast<que_node_t*>($3)); }
 ;
 
 column_assignment_list:
@@ -377,46 +403,64 @@ cursor_positioned:
 ;
 
 update_statement_start:
-	PARS_UPDATE_TOKEN PARS_ID_TOKEN
+	PARS_UPDATE_TOKEN table_name
 	PARS_SET_TOKEN
-	column_assignment_list	{ $$ = pars_update_statement_start(FALSE,
-								$2, $4); }
+	column_assignment_list	{ $$ = pars_update_statement_start(
+					FALSE,
+					static_cast<sym_node_t*>($2),
+					static_cast<col_assign_node_t*>($4)); }
 ;
 
 update_statement_searched:
 	update_statement_start
-	search_condition	{ $$ = pars_update_statement($1, NULL, $2); }
+	search_condition	{ $$ = pars_update_statement(
+					static_cast<upd_node_t*>($1),
+					NULL,
+					static_cast<que_node_t*>($2)); }
 ;
 
 update_statement_positioned:
 	update_statement_start
-	cursor_positioned	{ $$ = pars_update_statement($1, $2, NULL); }
+	cursor_positioned	{ $$ = pars_update_statement(
+					static_cast<upd_node_t*>($1),
+					static_cast<sym_node_t*>($2),
+					NULL); }
 ;
 
 delete_statement_start:
 	PARS_DELETE_TOKEN PARS_FROM_TOKEN
-	PARS_ID_TOKEN		{ $$ = pars_update_statement_start(TRUE,
-								$3, NULL); }
+	table_name		{ $$ = pars_update_statement_start(
+					TRUE,
+					static_cast<sym_node_t*>($3), NULL); }
 ;
 
 delete_statement_searched:
 	delete_statement_start
-	search_condition	{ $$ = pars_update_statement($1, NULL, $2); }
+	search_condition	{ $$ = pars_update_statement(
+					static_cast<upd_node_t*>($1),
+					NULL,
+					static_cast<que_node_t*>($2)); }
 ;
 
 delete_statement_positioned:
 	delete_statement_start
-	cursor_positioned	{ $$ = pars_update_statement($1, $2, NULL); }
+	cursor_positioned	{ $$ = pars_update_statement(
+					static_cast<upd_node_t*>($1),
+					static_cast<sym_node_t*>($2),
+					NULL); }
 ;
 
 row_printf_statement:
 	PARS_ROW_PRINTF_TOKEN select_statement
-				{ $$ = pars_row_printf_statement($2); }
+				{ $$ = pars_row_printf_statement(
+					static_cast<sel_node_t*>($2)); }
 ;
 
 assignment_statement:
 	PARS_ID_TOKEN PARS_ASSIGN_TOKEN exp
-				{ $$ = pars_assignment_statement($1, $3); }
+				{ $$ = pars_assignment_statement(
+					static_cast<sym_node_t*>($1),
+					static_cast<que_node_t*>($3)); }
 ;
 
 elsif_element:
@@ -456,7 +500,9 @@ for_statement:
 	exp PARS_DDOT_TOKEN exp
 	PARS_LOOP_TOKEN statement_list
 	PARS_END_TOKEN PARS_LOOP_TOKEN
-				{ $$ = pars_for_statement($2, $4, $6, $8); }
+				{ $$ = pars_for_statement(
+					static_cast<sym_node_t*>($2),
+					$4, $6, $8); }
 ;
 
 exit_statement:
@@ -470,25 +516,36 @@ return_statement:
 open_cursor_statement:
 	PARS_OPEN_TOKEN PARS_ID_TOKEN
 				{ $$ = pars_open_statement(
-						ROW_SEL_OPEN_CURSOR, $2); }
+						ROW_SEL_OPEN_CURSOR,
+						static_cast<sym_node_t*>($2)); }
 ;
 
 close_cursor_statement:
 	PARS_CLOSE_TOKEN PARS_ID_TOKEN
 				{ $$ = pars_open_statement(
-						ROW_SEL_CLOSE_CURSOR, $2); }
+						ROW_SEL_CLOSE_CURSOR,
+						static_cast<sym_node_t*>($2)); }
 ;
 
 fetch_statement:
 	PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN variable_list
-				{ $$ = pars_fetch_statement($2, $4, NULL); }
+				{ $$ = pars_fetch_statement(
+					static_cast<sym_node_t*>($2),
+					static_cast<sym_node_t*>($4), NULL); }
 	| PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN user_function_call
-				{ $$ = pars_fetch_statement($2, NULL, $4); }
+				{ $$ = pars_fetch_statement(
+					static_cast<sym_node_t*>($2),
+					NULL,
+					static_cast<sym_node_t*>($4)); }
 ;
 
 column_def:
 	PARS_ID_TOKEN type_name	opt_column_len opt_unsigned opt_not_null
-				{ $$ = pars_column_def($1, $2, $3, $4, $5); }
+				{ $$ = pars_column_def(
+					static_cast<sym_node_t*>($1),
+					static_cast<pars_res_word_t*>($2),
+					static_cast<sym_node_t*>($3),
+					$4, $5); }
 ;
 
 column_def_list:
@@ -524,10 +581,27 @@ not_fit_in_memory:
 					/* pass any non-NULL pointer */ }
 ;
 
+compact:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_COMPACT_TOKEN	{ $$ = &pars_int_token;
+					/* pass any non-NULL pointer */ }
+;
+
+block_size:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_BLOCK_SIZE_TOKEN	'=' PARS_INT_LIT
+			{ $$ = $3; }
+;
+
 create_table:
 	PARS_CREATE_TOKEN PARS_TABLE_TOKEN
-	PARS_ID_TOKEN '(' column_def_list ')'
-	not_fit_in_memory	{ $$ = pars_create_table($3, $5, $7); }
+	table_name '(' column_def_list ')'
+	not_fit_in_memory compact block_size
+				{ $$ = pars_create_table(
+					static_cast<sym_node_t*>($3),
+					static_cast<sym_node_t*>($5),
+					static_cast<sym_node_t*>($8),
+					static_cast<sym_node_t*>($9), $7); }
 ;
 
 column_list:
@@ -550,8 +624,19 @@ create_index:
 	PARS_CREATE_TOKEN unique_def
 	clustered_def
 	PARS_INDEX_TOKEN
-	PARS_ID_TOKEN PARS_ON_TOKEN PARS_ID_TOKEN
-	'(' column_list ')'	{ $$ = pars_create_index($2, $3, $5, $7, $9); }
+	PARS_ID_TOKEN PARS_ON_TOKEN
+	table_name
+	'(' column_list ')'	{ $$ = pars_create_index(
+					static_cast<pars_res_word_t*>($2),
+					static_cast<pars_res_word_t*>($3),
+					static_cast<sym_node_t*>($5),
+					static_cast<sym_node_t*>($7),
+					static_cast<sym_node_t*>($9)); }
+;
+
+table_name:
+	PARS_ID_TOKEN		{ $$ = $1; }
+	| PARS_TABLE_NAME_TOKEN	{ $$ = $1; }
 ;
 
 commit_statement:
@@ -567,6 +652,7 @@ rollback_statement:
 type_name:
 	PARS_INT_TOKEN		{ $$ = &pars_int_token; }
 	| PARS_INTEGER_TOKEN	{ $$ = &pars_int_token; }
+	| PARS_BIGINT_TOKEN	{ $$ = &pars_bigint_token; }
 	| PARS_CHAR_TOKEN	{ $$ = &pars_char_token; }
 	| PARS_BINARY_TOKEN	{ $$ = &pars_binary_token; }
 	| PARS_BLOB_TOKEN	{ $$ = &pars_blob_token; }
@@ -574,11 +660,15 @@ type_name:
 
 parameter_declaration:
 	PARS_ID_TOKEN PARS_IN_TOKEN type_name
-				{ $$ = pars_parameter_declaration($1,
-							PARS_INPUT, $3); }
+				{ $$ = pars_parameter_declaration(
+					static_cast<sym_node_t*>($1),
+					PARS_INPUT,
+					static_cast<pars_res_word_t*>($3)); }
 	| PARS_ID_TOKEN PARS_OUT_TOKEN type_name
-				{ $$ = pars_parameter_declaration($1,
-							PARS_OUTPUT, $3); }
+				{ $$ = pars_parameter_declaration(
+					static_cast<sym_node_t*>($1),
+					PARS_OUTPUT,
+					static_cast<pars_res_word_t*>($3)); }
 ;
 
 parameter_declaration_list:
@@ -590,7 +680,9 @@ parameter_declaration_list:
 
 variable_declaration:
 	PARS_ID_TOKEN type_name ';'
-				{ $$ = pars_variable_declaration($1, $2); }
+				{ $$ = pars_variable_declaration(
+					static_cast<sym_node_t*>($1),
+					static_cast<pars_res_word_t*>($2)); }
 ;
 
 variable_declaration_list:
@@ -602,12 +694,15 @@ variable_declaration_list:
 cursor_declaration:
 	PARS_DECLARE_TOKEN PARS_CURSOR_TOKEN PARS_ID_TOKEN
 	PARS_IS_TOKEN select_statement ';'
-				{ $$ = pars_cursor_declaration($3, $5); }
+				{ $$ = pars_cursor_declaration(
+					static_cast<sym_node_t*>($3),
+					static_cast<sel_node_t*>($5)); }
 ;
 
 function_declaration:
 	PARS_DECLARE_TOKEN PARS_FUNCTION_TOKEN PARS_ID_TOKEN ';'
-				{ $$ = pars_function_declaration($3); }
+				{ $$ = pars_function_declaration(
+					static_cast<sym_node_t*>($3)); }
 ;
 
 declaration:
@@ -628,8 +723,10 @@ procedure_definition:
 	declaration_list
 	PARS_BEGIN_TOKEN
 	statement_list
-	PARS_END_TOKEN		{ $$ = pars_procedure_definition($2, $4,
-								$10); }
+	PARS_END_TOKEN		{ $$ = pars_procedure_definition(
+					static_cast<sym_node_t*>($2),
+					static_cast<sym_node_t*>($4),
+					$10); }
 ;
 
 %%
diff --git a/storage/innobase/pars/pars0lex.l b/storage/innobase/pars/pars0lex.l
index 55ed17f82e1..2446e40cde8 100644
--- a/storage/innobase/pars/pars0lex.l
+++ b/storage/innobase/pars/pars0lex.l
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
@@ -64,7 +64,9 @@ Created 12/14/1997 Heikki Tuuri
 #define realloc(P, A)	ut_realloc(P, A)
 #define exit(A) 	ut_error
 
-#define YY_INPUT(buf, result, max_size) pars_get_lex_chars(buf, &result, max_size)
+/* Note: We cast &result to int* from yysize_t* */
+#define YY_INPUT(buf, result, max_size) \
+	pars_get_lex_chars(buf, (int*) &result, max_size)
 
 /* String buffer for removing quotes */
 static ulint	stringbuf_len_alloc = 0; /* Allocated length */
@@ -79,7 +81,7 @@ string_append(
 	ulint		len)	/*!< in: length of the string */
 {
 	if (stringbuf == NULL) {
-		stringbuf = malloc(1);
+		stringbuf = static_cast<char*>(malloc(1));
 		stringbuf_len_alloc = 1;
 	}
 
@@ -87,7 +89,9 @@ string_append(
 		while (stringbuf_len + len > stringbuf_len_alloc) {
 			stringbuf_len_alloc <<= 1;
 		}
-		stringbuf = realloc(stringbuf, stringbuf_len_alloc);
+
+		stringbuf = static_cast<char*>(
+			realloc(stringbuf, stringbuf_len_alloc));
 	}
 
 	memcpy(stringbuf + stringbuf_len, str, len);
@@ -96,8 +100,9 @@ string_append(
 
 %}
 
-DIGIT	[0-9]
-ID	[a-z_A-Z][a-z_A-Z0-9]*
+DIGIT		[0-9]
+ID		[a-z_A-Z][a-z_A-Z0-9]*
+TABLE_NAME	[a-z_A-Z][a-z_A-Z0-9]*\/(#sql-|[a-z_A-Z])[a-z_A-Z0-9]*
 BOUND_LIT	\:[a-z_A-Z0-9]+
 BOUND_ID	\$[a-z_A-Z0-9]+
 
@@ -249,27 +254,27 @@ In the state 'id', only two actions are possible (defined below). */
 }
 
 "BINARY"	{
-	 		return(PARS_BINARY_TOKEN);
+			return(PARS_BINARY_TOKEN);
 }
 
 "BLOB"		{
-	 		return(PARS_BLOB_TOKEN);
+			return(PARS_BLOB_TOKEN);
 }
 
 "INT"		{
-	 		return(PARS_INT_TOKEN);
+			return(PARS_INT_TOKEN);
 }
 
 "INTEGER"	{
-	 		return(PARS_INT_TOKEN);
+			return(PARS_INT_TOKEN);
 }
 
 "FLOAT"		{
-	 		return(PARS_FLOAT_TOKEN);
+			return(PARS_FLOAT_TOKEN);
 }
 
 "CHAR"		{
-	 		return(PARS_CHAR_TOKEN);
+			return(PARS_CHAR_TOKEN);
 }
 
 "IS"		{
@@ -400,16 +405,24 @@ In the state 'id', only two actions are possible (defined below). */
 			return(PARS_TABLE_TOKEN);
 }
 
+"COMPACT"	{
+			return(PARS_COMPACT_TOKEN);
+}
+
+"BLOCK_SIZE"	{
+			return(PARS_BLOCK_SIZE_TOKEN);
+}
+
 "INDEX"		{
-	 		return(PARS_INDEX_TOKEN);
+			return(PARS_INDEX_TOKEN);
 }
 
 "UNIQUE"	{
-	 		return(PARS_UNIQUE_TOKEN);
+			return(PARS_UNIQUE_TOKEN);
 }
 
 "CLUSTERED"	{
-	 		return(PARS_CLUSTERED_TOKEN);
+			return(PARS_CLUSTERED_TOKEN);
 }
 
 "DOES_NOT_FIT_IN_MEMORY"	{
@@ -417,7 +430,7 @@ In the state 'id', only two actions are possible (defined below). */
 }
 
 "ON"		{
-	 		return(PARS_ON_TOKEN);
+			return(PARS_ON_TOKEN);
 }
 
 "DECLARE"	{
@@ -540,13 +553,28 @@ In the state 'id', only two actions are possible (defined below). */
 			return(PARS_MODE_TOKEN);
 }
 
+"LIKE"  {
+                        return(PARS_LIKE_TOKEN);
+}
+
+"BIGINT"	{
+			return(PARS_BIGINT_TOKEN);
+}
+
 {ID}		{
 			yylval = sym_tab_add_id(pars_sym_tab_global,
-							(byte*)yytext,
+							(byte*) yytext,
 							ut_strlen(yytext));
 			return(PARS_ID_TOKEN);
 }
 
+{TABLE_NAME}	{
+			yylval = sym_tab_add_id(pars_sym_tab_global,
+							(byte*) yytext,
+							ut_strlen(yytext));
+			return(PARS_TABLE_NAME_TOKEN);
+}
+
 ".."		{
 			return(PARS_DDOT_TOKEN);
 }
diff --git a/storage/innobase/pars/pars0opt.c b/storage/innobase/pars/pars0opt.cc
index d992805d9ef..e5f347eedd6 100644
--- a/storage/innobase/pars/pars0opt.c
+++ b/storage/innobase/pars/pars0opt.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file pars/pars0opt.c
+@file pars/pars0opt.cc
 Simple SQL optimizer
 
 Created 12/21/1997 Heikki Tuuri
@@ -68,6 +68,7 @@ opt_invert_cmp_op(
 	} else if (op == PARS_GE_TOKEN) {
 		return(PARS_LE_TOKEN);
 	} else {
+		/* TODO: LIKE operator */
 		ut_error;
 	}
 
@@ -96,7 +97,7 @@ opt_check_exp_determined_before(
 	ut_ad(exp && sel_node);
 
 	if (que_node_get_type(exp) == QUE_NODE_FUNC) {
-		func_node = exp;
+		func_node = static_cast<func_node_t*>(exp);
 
 		arg = func_node->args;
 
@@ -114,7 +115,7 @@ opt_check_exp_determined_before(
 
 	ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
 
-	sym_node = exp;
+	sym_node = static_cast<sym_node_t*>(exp);
 
 	if (sym_node->token_type != SYM_COLUMN) {
 
@@ -165,11 +166,18 @@ opt_look_for_col_in_comparison_before(
 	     || (search_cond->func == '>')
 	     || (search_cond->func == '=')
 	     || (search_cond->func == PARS_GE_TOKEN)
-	     || (search_cond->func == PARS_LE_TOKEN));
+	     || (search_cond->func == PARS_LE_TOKEN)
+	     || (search_cond->func == PARS_LIKE_TOKEN_EXACT)
+	     || (search_cond->func == PARS_LIKE_TOKEN_PREFIX)
+	     || (search_cond->func == PARS_LIKE_TOKEN_SUFFIX)
+	     || (search_cond->func == PARS_LIKE_TOKEN_SUBSTR));
 
 	table = sel_node_get_nth_plan(sel_node, nth_table)->table;
 
-	if ((cmp_type == OPT_EQUAL) && (search_cond->func != '=')) {
+	if ((cmp_type == OPT_EQUAL)
+	    && (search_cond->func != '=')
+	    && (search_cond->func != PARS_LIKE_TOKEN_EXACT)
+            && (search_cond->func != PARS_LIKE_TOKEN_PREFIX)) {
 
 		return(NULL);
 
@@ -177,7 +185,9 @@ opt_look_for_col_in_comparison_before(
 		   && (search_cond->func != '<')
 		   && (search_cond->func != '>')
 		   && (search_cond->func != PARS_GE_TOKEN)
-		   && (search_cond->func != PARS_LE_TOKEN)) {
+		   && (search_cond->func != PARS_LE_TOKEN)
+		   && (search_cond->func != PARS_LIKE_TOKEN_PREFIX)
+                   && (search_cond->func != PARS_LIKE_TOKEN_SUFFIX)) {
 
 		return(NULL);
 	}
@@ -185,7 +195,7 @@ opt_look_for_col_in_comparison_before(
 	arg = search_cond->args;
 
 	if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
-		sym_node = arg;
+		sym_node = static_cast<sym_node_t*>(arg);
 
 		if ((sym_node->token_type == SYM_COLUMN)
 		    && (sym_node->table == table)
@@ -211,7 +221,7 @@ opt_look_for_col_in_comparison_before(
 	arg = que_node_get_next(arg);
 
 	if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
-		sym_node = arg;
+		sym_node = static_cast<sym_node_t*>(arg);
 
 		if ((sym_node->token_type == SYM_COLUMN)
 		    && (sym_node->table == table)
@@ -262,7 +272,7 @@ opt_look_for_col_in_cond_before(
 	ut_a(search_cond->func != PARS_NOT_TOKEN);
 
 	if (search_cond->func == PARS_AND_TOKEN) {
-		new_cond = search_cond->args;
+		new_cond = static_cast<func_node_t*>(search_cond->args);
 
 		exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
 						      new_cond, sel_node,
@@ -272,7 +282,8 @@ opt_look_for_col_in_cond_before(
 			return(exp);
 		}
 
-		new_cond = que_node_get_next(new_cond);
+		new_cond = static_cast<func_node_t*>(
+			que_node_get_next(new_cond));
 
 		exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
 						      new_cond, sel_node,
@@ -332,6 +343,12 @@ opt_calc_index_goodness(
 	ulint		op;
 	ulint		j;
 
+	/* At least for now we don't support using FTS indexes for queries
+	done through InnoDB's own SQL parser. */
+	if (index->type == DICT_FTS) {
+		return(0);
+	}
+
 	goodness = 0;
 
 	/* Note that as higher level node pointers in the B-tree contain
@@ -346,7 +363,8 @@ opt_calc_index_goodness(
 		col_no = dict_index_get_nth_col_no(index, j);
 
 		exp = opt_look_for_col_in_cond_before(
-			OPT_EQUAL, col_no, sel_node->search_cond,
+			OPT_EQUAL, col_no,
+			static_cast<func_node_t*>(sel_node->search_cond),
 			sel_node, nth_table, &op);
 		if (exp) {
 			/* The value for this column is exactly known already
@@ -359,7 +377,9 @@ opt_calc_index_goodness(
 			/* Look for non-equality comparisons */
 
 			exp = opt_look_for_col_in_cond_before(
-				OPT_COMPARISON, col_no, sel_node->search_cond,
+				OPT_COMPARISON, col_no,
+				static_cast<func_node_t*>(
+					sel_node->search_cond),
 				sel_node, nth_table, &op);
 			if (exp) {
 				index_plan[j] = exp;
@@ -413,7 +433,12 @@ opt_op_to_search_mode(
 			ascending order */
 	ulint	op)	/*!< in: operator '=', PARS_GE_TOKEN, ... */
 {
-	if (op == '=') {
+	if (op == '='
+	    || op == PARS_LIKE_TOKEN_EXACT
+	    || op == PARS_LIKE_TOKEN_PREFIX
+	    || op == PARS_LIKE_TOKEN_SUFFIX
+	    || op == PARS_LIKE_TOKEN_SUBSTR) {
+
 		if (asc) {
 			return(PAGE_CUR_GE);
 		} else {
@@ -583,12 +608,18 @@ opt_search_plan_for_table(
 					    n_fields);
 		dict_index_copy_types(plan->tuple, plan->index, n_fields);
 
-		plan->tuple_exps = mem_heap_alloc(pars_sym_tab_global->heap,
-						  n_fields * sizeof(void*));
+		plan->tuple_exps = static_cast<que_node_t**>(
+			mem_heap_alloc(
+				pars_sym_tab_global->heap,
+				n_fields * sizeof(void*)));
 
 		ut_memcpy(plan->tuple_exps, best_index_plan,
 			  n_fields * sizeof(void*));
-		if (best_last_op == '=') {
+		if (best_last_op == '='
+		    || best_last_op == PARS_LIKE_TOKEN_EXACT
+                    || best_last_op == PARS_LIKE_TOKEN_PREFIX
+                    || best_last_op == PARS_LIKE_TOKEN_SUFFIX
+                    || best_last_op == PARS_LIKE_TOKEN_SUBSTR) {
 			plan->n_exact_match = n_fields;
 		} else {
 			plan->n_exact_match = n_fields - 1;
@@ -717,7 +748,7 @@ opt_find_test_conds(
 					conditions or NULL */
 {
 	func_node_t*	new_cond;
-	ulint		class;
+	ulint		fclass;
 	plan_t*		plan;
 
 	if (cond == NULL) {
@@ -726,11 +757,12 @@ opt_find_test_conds(
 	}
 
 	if (cond->func == PARS_AND_TOKEN) {
-		new_cond = cond->args;
+		new_cond = static_cast<func_node_t*>(cond->args);
 
 		opt_find_test_conds(sel_node, i, new_cond);
 
-		new_cond = que_node_get_next(new_cond);
+		new_cond = static_cast<func_node_t*>(
+			que_node_get_next(new_cond));
 
 		opt_find_test_conds(sel_node, i, new_cond);
 
@@ -739,12 +771,12 @@ opt_find_test_conds(
 
 	plan = sel_node_get_nth_plan(sel_node, i);
 
-	class = opt_classify_comparison(sel_node, i, cond);
+	fclass = opt_classify_comparison(sel_node, i, cond);
 
-	if (class == OPT_END_COND) {
+	if (fclass == OPT_END_COND) {
 		UT_LIST_ADD_LAST(cond_list, plan->end_conds, cond);
 
-	} else if (class == OPT_TEST_COND) {
+	} else if (fclass == OPT_TEST_COND) {
 		UT_LIST_ADD_LAST(cond_list, plan->other_conds, cond);
 
 	}
@@ -772,7 +804,7 @@ opt_normalize_cmp_conds(
 
 		if (que_node_get_type(arg2) == QUE_NODE_SYMBOL) {
 
-			sym_node = arg2;
+			sym_node = static_cast<sym_node_t*>(arg2);
 
 			if ((sym_node->token_type == SYM_COLUMN)
 			    && (sym_node->table == table)) {
@@ -812,7 +844,10 @@ opt_determine_and_normalize_test_conds(
 
 	/* Recursively go through the conjuncts and classify them */
 
-	opt_find_test_conds(sel_node, i, sel_node->search_cond);
+	opt_find_test_conds(
+		sel_node,
+		i,
+		static_cast<func_node_t*>(sel_node->search_cond));
 
 	opt_normalize_cmp_conds(UT_LIST_GET_FIRST(plan->end_conds),
 				plan->table);
@@ -852,14 +887,14 @@ opt_find_all_cols(
 	}
 
 	if (que_node_get_type(exp) == QUE_NODE_FUNC) {
-		func_node = exp;
+		func_node = static_cast<func_node_t*>(exp);
 
-		arg = func_node->args;
+		for (arg = func_node->args;
+		     arg != 0;
+		     arg = que_node_get_next(arg)) {
 
-		while (arg) {
-			opt_find_all_cols(copy_val, index, col_list, plan,
-					  arg);
-			arg = que_node_get_next(arg);
+			opt_find_all_cols(
+				copy_val, index, col_list, plan, arg);
 		}
 
 		return;
@@ -867,7 +902,7 @@ opt_find_all_cols(
 
 	ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
 
-	sym_node = exp;
+	sym_node = static_cast<sym_node_t*>(exp);
 
 	if (sym_node->token_type != SYM_COLUMN) {
 
@@ -953,11 +988,12 @@ opt_find_copy_cols(
 	ut_ad(que_node_get_type(search_cond) == QUE_NODE_FUNC);
 
 	if (search_cond->func == PARS_AND_TOKEN) {
-		new_cond = search_cond->args;
+		new_cond = static_cast<func_node_t*>(search_cond->args);
 
 		opt_find_copy_cols(sel_node, i, new_cond);
 
-		new_cond = que_node_get_next(new_cond);
+		new_cond = static_cast<func_node_t*>(
+			que_node_get_next(new_cond));
 
 		opt_find_copy_cols(sel_node, i, new_cond);
 
@@ -1004,21 +1040,23 @@ opt_classify_cols(
 	/* All select list columns should be copied: therefore TRUE as the
 	first argument */
 
-	exp = sel_node->select_list;
+	for (exp = sel_node->select_list;
+	     exp != 0;
+	     exp = que_node_get_next(exp)) {
 
-	while (exp) {
-		opt_find_all_cols(TRUE, plan->index, &(plan->columns), plan,
-				  exp);
-		exp = que_node_get_next(exp);
+		opt_find_all_cols(
+			TRUE, plan->index, &(plan->columns), plan, exp);
 	}
 
-	opt_find_copy_cols(sel_node, i, sel_node->search_cond);
+	opt_find_copy_cols(
+		sel_node, i, static_cast<func_node_t*>(sel_node->search_cond));
 
 	/* All remaining columns in the search condition are temporary
 	columns: therefore FALSE */
 
-	opt_find_all_cols(FALSE, plan->index, &(plan->columns), plan,
-			  sel_node->search_cond);
+	opt_find_all_cols(
+		FALSE, plan->index, &plan->columns, plan,
+		static_cast<func_node_t*>(sel_node->search_cond));
 }
 
 /*******************************************************************//**
@@ -1068,7 +1106,8 @@ opt_clust_access(
 
 	dict_index_copy_types(plan->clust_ref, clust_index, n_fields);
 
-	plan->clust_map = mem_heap_alloc(heap, n_fields * sizeof(ulint));
+	plan->clust_map = static_cast<ulint*>(
+		mem_heap_alloc(heap, n_fields * sizeof(ulint)));
 
 	for (i = 0; i < n_fields; i++) {
 		pos = dict_index_get_nth_field_pos(index, clust_index, i);
@@ -1082,7 +1121,7 @@ opt_clust_access(
 		    || dict_index_get_nth_field(clust_index, i)
 		    ->prefix_len != 0) {
 			fprintf(stderr,
-				"InnoDB: Error in pars0opt.c:"
+				"InnoDB: Error in pars0opt.cc:"
 				" table %s has prefix_len != 0\n",
 				index->table_name);
 		}
@@ -1108,8 +1147,10 @@ opt_search_plan(
 	order_node_t*	order_by;
 	ulint		i;
 
-	sel_node->plans = mem_heap_alloc(pars_sym_tab_global->heap,
-					 sel_node->n_tables * sizeof(plan_t));
+	sel_node->plans = static_cast<plan_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap,
+			sel_node->n_tables * sizeof(plan_t)));
 
 	/* Analyze the search condition to find out what we know at each
 	join stage about the conditions that the columns of a table should
@@ -1138,7 +1179,8 @@ opt_search_plan(
 
 		opt_determine_and_normalize_test_conds(sel_node, i);
 
-		table_node = que_node_get_next(table_node);
+		table_node = static_cast<sym_node_t*>(
+			que_node_get_next(table_node));
 	}
 
 	table_node = sel_node->table_list;
@@ -1155,7 +1197,8 @@ opt_search_plan(
 
 		opt_clust_access(sel_node, i);
 
-		table_node = que_node_get_next(table_node);
+		table_node = static_cast<sym_node_t*>(
+			que_node_get_next(table_node));
 	}
 
 	/* Check that the plan obeys a possible order-by clause: if not,
diff --git a/storage/innobase/pars/pars0pars.c b/storage/innobase/pars/pars0pars.cc
index 86f54195682..a4ab85adc36 100644
--- a/storage/innobase/pars/pars0pars.c
+++ b/storage/innobase/pars/pars0pars.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -81,6 +81,7 @@ UNIV_INTERN pars_res_word_t	pars_distinct_token = {PARS_DISTINCT_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_binary_token = {PARS_BINARY_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_blob_token = {PARS_BLOB_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_int_token = {PARS_INT_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_bigint_token = {PARS_BIGINT_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_char_token = {PARS_CHAR_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_float_token = {PARS_FLOAT_TOKEN};
 UNIV_INTERN pars_res_word_t	pars_update_token = {PARS_UPDATE_TOKEN};
@@ -95,6 +96,95 @@ UNIV_INTERN pars_res_word_t	pars_clustered_token = {PARS_CLUSTERED_TOKEN};
 /** Global variable used to denote the '*' in SELECT * FROM.. */
 UNIV_INTERN ulint	pars_star_denoter	= 12345678;
 
+/********************************************************************
+Get user function with the given name.*/
+UNIV_INLINE
+pars_user_func_t*
+pars_info_lookup_user_func(
+/*=======================*/
+					/* out: user func, or NULL if not
+					found */
+	pars_info_t*		info,	/* in: info struct */
+	const char*		name)	/* in: function name to find*/
+{
+	if (info && info->funcs) {
+		ulint		i;
+		ib_vector_t*	vec = info->funcs;
+
+		for (i = 0; i < ib_vector_size(vec); i++) {
+			pars_user_func_t*	puf;
+
+			puf = static_cast<pars_user_func_t*>(
+				ib_vector_get(vec, i));
+
+			if (strcmp(puf->name, name) == 0) {
+				return(puf);
+			}
+		}
+	}
+
+	return(NULL);
+}
+
+/********************************************************************
+Get bound identifier with the given name.*/
+UNIV_INLINE
+pars_bound_id_t*
+pars_info_lookup_bound_id(
+/*======================*/
+					/* out: bound literal, or NULL if
+					not found */
+	pars_info_t*		info,	/* in: info struct */
+	const char*		name)	/* in: bound literal name to find */
+{
+	if (info && info->bound_ids) {
+		ulint		i;
+		ib_vector_t*	vec = info->bound_ids;
+
+		for (i = 0; i < ib_vector_size(vec); i++) {
+			pars_bound_id_t*	bid;
+
+		       	bid = static_cast<pars_bound_id_t*>(
+				ib_vector_get(vec, i));
+
+			if (strcmp(bid->name, name) == 0) {
+				return(bid);
+			}
+		}
+	}
+
+	return(NULL);
+}
+
+/********************************************************************
+Get bound literal with the given name.*/
+UNIV_INLINE
+pars_bound_lit_t*
+pars_info_lookup_bound_lit(
+/*=======================*/
+					/* out: bound literal, or NULL if
+					not found */
+	pars_info_t*		info,	/* in: info struct */
+	const char*		name)	/* in: bound literal name to find */
+{
+	if (info && info->bound_lits) {
+		ulint		i;
+		ib_vector_t*	vec = info->bound_lits;
+
+		for (i = 0; i < ib_vector_size(vec); i++) {
+			pars_bound_lit_t*	pbl;
+
+			pbl = static_cast<pars_bound_lit_t*>(
+				ib_vector_get(vec, i));
+
+			if (strcmp(pbl->name, name) == 0) {
+				return(pbl);
+			}
+		}
+	}
+
+	return(NULL);
+}
 
 /*********************************************************************//**
 Determines the class of a function code.
@@ -153,7 +243,8 @@ pars_func_low(
 {
 	func_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(func_node_t));
+	node = static_cast<func_node_t*>(
+		mem_heap_alloc(pars_sym_tab_global->heap, sizeof(func_node_t)));
 
 	node->common.type = QUE_NODE_FUNC;
 	dfield_set_data(&(node->common.val), NULL, 0);
@@ -161,7 +252,7 @@ pars_func_low(
 
 	node->func = func;
 
-	node->class = pars_func_get_class(func);
+	node->fclass = pars_func_get_class(func);
 
 	node->args = arg;
 
@@ -180,9 +271,183 @@ pars_func(
 	que_node_t*	res_word,/*!< in: function name reserved word */
 	que_node_t*	arg)	/*!< in: first argument in the argument list */
 {
-	return(pars_func_low(((pars_res_word_t*)res_word)->code, arg));
+	return(pars_func_low(((pars_res_word_t*) res_word)->code, arg));
 }
 
+/*************************************************************************
+Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string.*/
+
+int
+pars_like_rebind(
+/*=============*/
+				/* out, own: function node in a query tree */
+	sym_node_t*	node,	/* in: The search string node.*/
+	const byte*	ptr,	/* in: literal to (re) bind */
+	ulint		ptr_len)/* in: length of literal to (re) bind*/
+{
+	dtype_t*	dtype;
+	dfield_t*	dfield;
+	ib_like_t	op_check;
+	sym_node_t*	like_node;
+	sym_node_t*	str_node = NULL;
+	ib_like_t	op = IB_LIKE_EXACT;
+	int		func = PARS_LIKE_TOKEN_EXACT;
+
+	/* Is this a STRING% ? */
+	if (ptr[ptr_len - 1] == '%') {
+		op = IB_LIKE_PREFIX;
+	}
+
+	/* Is this a '%STRING' or %STRING% ?*/
+	if (*ptr == '%') {
+		op = (op == IB_LIKE_PREFIX) ? IB_LIKE_SUBSTR : IB_LIKE_SUFFIX;
+	}
+
+	if (node->like_node == NULL) {
+		/* Add the LIKE operator info node to the node list.
+		This will be used during the comparison phase to determine
+		how to match.*/
+		like_node = sym_tab_add_int_lit(node->sym_table, op);
+		que_node_list_add_last(NULL, like_node);
+		node->like_node = like_node;
+		str_node = sym_tab_add_str_lit(node->sym_table, ptr, ptr_len);
+		que_node_list_add_last(like_node, str_node);
+	} else {
+		like_node = node->like_node;
+
+		/* Change the value of the string in the existing
+		string node of like node */
+		str_node = static_cast<sym_node_t*>(
+			que_node_list_get_last(like_node));
+
+		/* Must find the string node */
+		ut_a(str_node);
+		ut_a(str_node != like_node);
+		ut_a(str_node->token_type == SYM_LIT);
+
+		dfield = que_node_get_val(str_node);
+		dfield_set_data(dfield, ptr, ptr_len);
+	}
+
+	dfield = que_node_get_val(like_node);
+	dtype = dfield_get_type(dfield);
+
+	ut_a(dtype_get_mtype(dtype) == DATA_INT);
+	op_check = static_cast<ib_like_t>(
+		mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield))));
+
+	switch (op_check) {
+	case	IB_LIKE_PREFIX:
+	case	IB_LIKE_SUFFIX:
+	case	IB_LIKE_SUBSTR:
+	case	IB_LIKE_EXACT:
+		break;
+
+	default:
+		ut_error;
+	}
+
+	mach_write_to_4(static_cast<byte*>(dfield_get_data(dfield)), op);
+
+	dfield = que_node_get_val(node);
+
+	/* Adjust the length of the search value so the '%' is not
+	visible. Then create and add a search string node to the
+	search value node. Searching for %SUFFIX and %SUBSTR% requires
+	a full table scan and so we set the search value to ''.
+	For PREFIX% we simply remove the trailing '%'.*/
+
+	switch (op) {
+	case	IB_LIKE_EXACT:
+		dfield = que_node_get_val(str_node);
+		dtype = dfield_get_type(dfield);
+
+		ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+		dfield_set_data(dfield, ptr, ptr_len);
+		break;
+
+	case	IB_LIKE_PREFIX:
+		func = PARS_LIKE_TOKEN_PREFIX;
+
+		/* Modify the original node */
+		dfield_set_len(dfield, ptr_len - 1);
+
+		dfield = que_node_get_val(str_node);
+		dtype = dfield_get_type(dfield);
+
+		ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+		dfield_set_data(dfield, ptr, ptr_len - 1);
+		break;
+
+	case	IB_LIKE_SUFFIX:
+		func = PARS_LIKE_TOKEN_SUFFIX;
+
+		/* Modify the original node */
+		/* Make it an '' empty string */
+		dfield_set_len(dfield, 0);
+
+		dfield = que_node_get_val(str_node);
+		dtype = dfield_get_type(dfield);
+
+		ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+		dfield_set_data(dfield, ptr + 1, ptr_len - 1);
+		break;
+
+	case	IB_LIKE_SUBSTR:
+		func = PARS_LIKE_TOKEN_SUBSTR;
+
+		/* Modify the original node */
+		/* Make it an '' empty string */
+		dfield_set_len(dfield, 0);
+
+		dfield = que_node_get_val(str_node);
+		dtype = dfield_get_type(dfield);
+
+		ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+		dfield_set_data(dfield, ptr + 1, ptr_len - 2);
+		break;
+
+	default:
+		ut_error;
+	}
+
+	return(func);
+}
+
+/*************************************************************************
+Parses a LIKE operator expression. */
+static
+int
+pars_like_op(
+/*=========*/
+				/* out, own: function node in a query tree */
+	que_node_t*	arg)	/* in: LIKE comparison string.*/
+{
+	char*		ptr;
+	ulint		ptr_len;
+	int		func = PARS_LIKE_TOKEN_EXACT;
+	dfield_t*	dfield = que_node_get_val(arg);
+	dtype_t*	dtype = dfield_get_type(dfield);
+
+	ut_a(dtype_get_mtype(dtype) == DATA_CHAR
+	     || dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+	ptr = static_cast<char*>(dfield_get_data(dfield));
+	ptr_len = strlen(ptr);
+
+	if (ptr_len) {
+
+		func = pars_like_rebind(
+			static_cast<sym_node_t*>(arg), (byte*) ptr, ptr_len);
+	}
+
+	return(func);
+}
 /*********************************************************************//**
 Parses an operator expression.
 @return	own: function node in a query tree */
@@ -201,6 +466,20 @@ pars_op(
 		que_node_list_add_last(arg1, arg2);
 	}
 
+	/* We need to parse the string and determine whether it's a
+	PREFIX, SUFFIX or SUBSTRING comparison */
+	if (func == PARS_LIKE_TOKEN) {
+
+		ut_a(que_node_get_type(arg2) == QUE_NODE_SYMBOL);
+
+		func = pars_like_op(arg2);
+
+		ut_a(func == PARS_LIKE_TOKEN_EXACT
+		     || func == PARS_LIKE_TOKEN_PREFIX
+		     || func == PARS_LIKE_TOKEN_SUFFIX
+		     || func == PARS_LIKE_TOKEN_SUBSTR);
+	}
+
 	return(pars_func_low(func, arg1));
 }
 
@@ -216,7 +495,9 @@ pars_order_by(
 {
 	order_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(order_node_t));
+	node = static_cast<order_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(order_node_t)));
 
 	node->common.type = QUE_NODE_ORDER;
 
@@ -339,6 +620,14 @@ pars_resolve_func_data_type(
 		dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
 		break;
 
+	case PARS_LIKE_TOKEN_EXACT:
+	case PARS_LIKE_TOKEN_PREFIX:
+	case PARS_LIKE_TOKEN_SUFFIX:
+	case PARS_LIKE_TOKEN_SUBSTR:
+		dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+			  DATA_ENGLISH, 0);
+		break;
+
 	default:
 		ut_error;
 	}
@@ -365,7 +654,7 @@ pars_resolve_exp_variables_and_types(
 	ut_a(exp_node);
 
 	if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
-		func_node = exp_node;
+		func_node = static_cast<func_node_t*>(exp_node);
 
 		arg = func_node->args;
 
@@ -382,7 +671,7 @@ pars_resolve_exp_variables_and_types(
 
 	ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
 
-	sym_node = exp_node;
+	sym_node = static_cast<sym_node_t*>(exp_node);
 
 	if (sym_node->resolved) {
 
@@ -473,7 +762,7 @@ pars_resolve_exp_columns(
 	ut_a(exp_node);
 
 	if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
-		func_node = exp_node;
+		func_node = static_cast<func_node_t*>(exp_node);
 
 		arg = func_node->args;
 
@@ -488,7 +777,7 @@ pars_resolve_exp_columns(
 
 	ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
 
-	sym_node = exp_node;
+	sym_node = static_cast<sym_node_t*>(exp_node);
 
 	if (sym_node->resolved) {
 
@@ -530,7 +819,7 @@ pars_resolve_exp_columns(
 			}
 		}
 
-		t_node = que_node_get_next(t_node);
+		t_node = static_cast<sym_node_t*>(que_node_get_next(t_node));
 	}
 }
 
@@ -559,19 +848,21 @@ pars_retrieve_table_def(
 /*====================*/
 	sym_node_t*	sym_node)	/*!< in: table node */
 {
-	const char*	table_name;
-
 	ut_a(sym_node);
 	ut_a(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
 
-	sym_node->resolved = TRUE;
-	sym_node->token_type = SYM_TABLE;
+	/* Open the table only if it is not already opened. */
+	if (sym_node->token_type != SYM_TABLE_REF_COUNTED) {
 
-	table_name = (const char*) sym_node->name;
+		ut_a(sym_node->table == NULL);
 
-	sym_node->table = dict_table_get_low(table_name);
+		sym_node->resolved = TRUE;
+		sym_node->token_type = SYM_TABLE_REF_COUNTED;
 
-	ut_a(sym_node->table);
+		sym_node->table = dict_table_open_on_name(sym_node->name, TRUE);
+
+		ut_a(sym_node->table != NULL);
+	}
 }
 
 /*********************************************************************//**
@@ -595,7 +886,8 @@ pars_retrieve_table_list_defs(
 
 		count++;
 
-		sym_node = que_node_get_next(sym_node);
+		sym_node = static_cast<sym_node_t*>(
+			que_node_get_next(sym_node));
 	}
 
 	return(count);
@@ -627,14 +919,15 @@ pars_select_all_columns(
 				table, i);
 
 			col_node = sym_tab_add_id(pars_sym_tab_global,
-						  (byte*)col_name,
+						  (byte*) col_name,
 						  ut_strlen(col_name));
 
 			select_node->select_list = que_node_list_add_last(
 				select_node->select_list, col_node);
 		}
 
-		table_node = que_node_get_next(table_node);
+		table_node = static_cast<sym_node_t*>(
+			que_node_get_next(table_node));
 	}
 }
 
@@ -684,9 +977,9 @@ pars_check_aggregate(
 
 		if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
 
-			func_node = exp_node;
+			func_node = static_cast<func_node_t*>(exp_node);
 
-			if (func_node->class == PARS_FUNC_AGGREGATE) {
+			if (func_node->fclass == PARS_FUNC_AGGREGATE) {
 
 				n_aggregate_nodes++;
 			}
@@ -864,8 +1157,9 @@ pars_column_assignment(
 {
 	col_assign_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap,
-			      sizeof(col_assign_node_t));
+	node = static_cast<col_assign_node_t*>(
+		mem_heap_alloc(pars_sym_tab_global->heap,
+			      sizeof(col_assign_node_t)));
 	node->common.type = QUE_NODE_COL_ASSIGNMENT;
 
 	node->col = column;
@@ -894,7 +1188,8 @@ pars_process_assign_list(
 	ulint			i;
 
 	table_sym = node->table_sym;
-	col_assign_list = node->col_assign_list;
+	col_assign_list = static_cast<col_assign_node_t*>(
+		 node->col_assign_list);
 	clust_index = dict_table_get_first_index(node->table);
 
 	assign_node = col_assign_list;
@@ -920,7 +1215,8 @@ pars_process_assign_list(
 				  assign_node->val);
 		n_assigns++;
 
-		assign_node = que_node_get_next(assign_node);
+		assign_node = static_cast<col_assign_node_t*>(
+				que_node_get_next(assign_node));
 	}
 
 	node->update = upd_create(n_assigns, pars_sym_tab_global->heap);
@@ -946,7 +1242,8 @@ pars_process_assign_list(
 			changes_field_size = 0;
 		}
 
-		assign_node = que_node_get_next(assign_node);
+		assign_node = static_cast<col_assign_node_t*>(
+				que_node_get_next(assign_node));
 	}
 
 	/* Find out if the update can modify an ordering field in any index */
@@ -1129,16 +1426,20 @@ pars_set_dfield_type(
 		flags |= DATA_UNSIGNED;
 	}
 
-	if (type == &pars_int_token) {
+	if (type == &pars_bigint_token) {
+		ut_a(len == 0);
+
+		dtype_set(dfield_get_type(dfield), DATA_INT, flags, 8);
+	} else if (type == &pars_int_token) {
 		ut_a(len == 0);
 
 		dtype_set(dfield_get_type(dfield), DATA_INT, flags, 4);
 
 	} else if (type == &pars_char_token) {
-		ut_a(len == 0);
+		//ut_a(len == 0);
 
 		dtype_set(dfield_get_type(dfield), DATA_VARCHAR,
-			  DATA_ENGLISH | flags, 0);
+			  DATA_ENGLISH | flags, len);
 	} else if (type == &pars_binary_token) {
 		ut_a(len != 0);
 
@@ -1209,12 +1510,12 @@ pars_set_parent_in_list(
 {
 	que_common_t*	common;
 
-	common = node_list;
+	common = static_cast<que_common_t*>(node_list);
 
 	while (common) {
 		common->parent = parent;
 
-		common = que_node_get_next(common);
+		common = static_cast<que_common_t*>(que_node_get_next(common));
 	}
 }
 
@@ -1230,7 +1531,9 @@ pars_elsif_element(
 {
 	elsif_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(elsif_node_t));
+	node = static_cast<elsif_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(elsif_node_t)));
 
 	node->common.type = QUE_NODE_ELSIF;
 
@@ -1258,7 +1561,9 @@ pars_if_statement(
 	if_node_t*	node;
 	elsif_node_t*	elsif_node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(if_node_t));
+	node = static_cast<if_node_t*>(
+		 mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(if_node_t)));
 
 	node->common.type = QUE_NODE_IF;
 
@@ -1273,14 +1578,15 @@ pars_if_statement(
 		/* There is a list of elsif conditions */
 
 		node->else_part = NULL;
-		node->elsif_list = else_part;
+		node->elsif_list = static_cast<elsif_node_t*>(else_part);
 
-		elsif_node = else_part;
+		elsif_node = static_cast<elsif_node_t*>(else_part);
 
 		while (elsif_node) {
 			pars_set_parent_in_list(elsif_node->stat_list, node);
 
-			elsif_node = que_node_get_next(elsif_node);
+			elsif_node = static_cast<elsif_node_t*>(
+				que_node_get_next(elsif_node));
 		}
 	} else {
 		node->else_part = else_part;
@@ -1306,7 +1612,9 @@ pars_while_statement(
 {
 	while_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(while_node_t));
+	node = static_cast<while_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(while_node_t)));
 
 	node->common.type = QUE_NODE_WHILE;
 
@@ -1335,7 +1643,8 @@ pars_for_statement(
 {
 	for_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(for_node_t));
+	node = static_cast<for_node_t*>(
+		mem_heap_alloc(pars_sym_tab_global->heap, sizeof(for_node_t)));
 
 	node->common.type = QUE_NODE_FOR;
 
@@ -1367,7 +1676,8 @@ pars_exit_statement(void)
 {
 	exit_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(exit_node_t));
+	node = static_cast<exit_node_t*>(
+		mem_heap_alloc(pars_sym_tab_global->heap, sizeof(exit_node_t)));
 	node->common.type = QUE_NODE_EXIT;
 
 	return(node);
@@ -1383,8 +1693,9 @@ pars_return_statement(void)
 {
 	return_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap,
-			      sizeof(return_node_t));
+	node = static_cast<return_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(return_node_t)));
 	node->common.type = QUE_NODE_RETURN;
 
 	return(node);
@@ -1402,8 +1713,9 @@ pars_assignment_statement(
 {
 	assign_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap,
-			      sizeof(assign_node_t));
+	node = static_cast<assign_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(assign_node_t)));
 	node->common.type = QUE_NODE_ASSIGNMENT;
 
 	node->var = var;
@@ -1455,7 +1767,9 @@ pars_fetch_statement(
 	/* Logical XOR. */
 	ut_a(!into_list != !user_func);
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(fetch_node_t));
+	node = static_cast<fetch_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(fetch_node_t)));
 
 	node->common.type = QUE_NODE_FETCH;
 
@@ -1503,7 +1817,9 @@ pars_open_statement(
 	sym_node_t*	cursor_decl;
 	open_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(open_node_t));
+	node = static_cast<open_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(open_node_t)));
 
 	node->common.type = QUE_NODE_OPEN;
 
@@ -1513,7 +1829,7 @@ pars_open_statement(
 
 	ut_a(cursor_decl->token_type == SYM_CURSOR);
 
-	node->op_type = type;
+	node->op_type = static_cast<open_node_op>(type);
 	node->cursor_def = cursor_decl->cursor_def;
 
 	return(node);
@@ -1530,8 +1846,9 @@ pars_row_printf_statement(
 {
 	row_printf_node_t*	node;
 
-	node = mem_heap_alloc(pars_sym_tab_global->heap,
-			      sizeof(row_printf_node_t));
+	node = static_cast<row_printf_node_t*>(
+		mem_heap_alloc(
+			pars_sym_tab_global->heap, sizeof(row_printf_node_t)));
 	node->common.type = QUE_NODE_ROW_PRINTF;
 
 	node->sel_node = sel_node;
@@ -1549,7 +1866,7 @@ commit_node_t*
 pars_commit_statement(void)
 /*=======================*/
 {
-	return(commit_node_create(pars_sym_tab_global->heap));
+	return(trx_commit_node_create(pars_sym_tab_global->heap));
 }
 
 /*********************************************************************//**
@@ -1604,6 +1921,8 @@ pars_create_table(
 	sym_node_t*	table_sym,	/*!< in: table name node in the symbol
 					table */
 	sym_node_t*	column_defs,	/*!< in: list of column names */
+	sym_node_t*	compact,	/* in: non-NULL if COMPACT table. */
+	sym_node_t*	block_size,	/* in: block size (can be NULL) */
 	void*		not_fit_in_memory __attribute__((unused)))
 					/*!< in: a non-NULL pointer means that
 					this is a table which in simulations
@@ -1621,13 +1940,44 @@ pars_create_table(
 	tab_node_t*	node;
 	const dtype_t*	dtype;
 	ulint		n_cols;
+	ulint		flags = 0;
+
+	if (compact != NULL) {
+		flags |= DICT_TF_COMPACT;
+	}
+
+	if (block_size != NULL) {
+		ulint		size;
+		dfield_t*	dfield;
+
+		dfield = que_node_get_val(block_size);
+
+		ut_a(dfield_get_len(dfield) == 4);
+		size = mach_read_from_4(static_cast<byte*>(
+			dfield_get_data(dfield)));
+
+
+		switch (size) {
+		case 0:
+			break;
+
+		case 1: case 2: case 4: case 8: case 16:
+			flags |= DICT_TF_COMPACT;
+			/* FTS-FIXME: needs the zip changes */
+			/* flags |= size << DICT_TF_COMPRESSED_SHIFT; */
+			break;
+
+		default:
+			ut_error;
+		}
+	}
 
 	n_cols = que_node_list_get_len(column_defs);
 
 	/* As the InnoDB SQL parser is for internal use only,
 	for creating some system tables, this function will only
 	create tables in the old (not compact) record format. */
-	table = dict_mem_table_create(table_sym->name, 0, n_cols, 0);
+	table = dict_mem_table_create(table_sym->name, 0, n_cols, flags, 0);
 
 #ifdef UNIV_DEBUG
 	if (not_fit_in_memory != NULL) {
@@ -1645,7 +1995,7 @@ pars_create_table(
 		column->resolved = TRUE;
 		column->token_type = SYM_COLUMN;
 
-		column = que_node_get_next(column);
+		column = static_cast<sym_node_t*>(que_node_get_next(column));
 	}
 
 	node = tab_create_graph_create(table, pars_sym_tab_global->heap);
@@ -1699,7 +2049,7 @@ pars_create_index(
 		column->resolved = TRUE;
 		column->token_type = SYM_COLUMN;
 
-		column = que_node_get_next(column);
+		column = static_cast<sym_node_t*>(que_node_get_next(column));
 	}
 
 	node = ind_create_graph_create(index, pars_sym_tab_global->heap);
@@ -1737,7 +2087,8 @@ pars_procedure_definition(
 
 	thr = que_thr_create(fork, heap);
 
-	node = mem_heap_alloc(heap, sizeof(proc_node_t));
+	node = static_cast<proc_node_t*>(
+		mem_heap_alloc(heap, sizeof(proc_node_t)));
 
 	node->common.type = QUE_NODE_PROC;
 	node->common.parent = thr;
@@ -1780,11 +2131,10 @@ pars_stored_procedure_call(
 /*************************************************************//**
 Retrieves characters to the lexical analyzer. */
 UNIV_INTERN
-void
+int
 pars_get_lex_chars(
 /*===============*/
 	char*	buf,		/*!< in/out: buffer where to copy */
-	int*	result,		/*!< out: number of characters copied or EOF */
 	int	max_size)	/*!< in: maximum number of characters which fit
 				in the buffer */
 {
@@ -1796,9 +2146,7 @@ pars_get_lex_chars(
 #ifdef YYDEBUG
 		/* fputs("SQL string ends\n", stderr); */
 #endif
-		*result = 0;
-
-		return;
+		return(0);
 	}
 
 	if (len > max_size) {
@@ -1820,9 +2168,10 @@ pars_get_lex_chars(
 
 	ut_memcpy(buf, pars_sym_tab_global->sql_string
 		  + pars_sym_tab_global->next_char_pos, len);
-	*result = len;
 
 	pars_sym_tab_global->next_char_pos += len;
+
+	return(len);
 }
 
 /*************************************************************//**
@@ -1865,8 +2214,8 @@ pars_sql(
 	pars_sym_tab_global = sym_tab_create(heap);
 
 	pars_sym_tab_global->string_len = strlen(str);
-	pars_sym_tab_global->sql_string = mem_heap_dup(
-		heap, str, pars_sym_tab_global->string_len + 1);
+	pars_sym_tab_global->sql_string = static_cast<char*>(
+		mem_heap_dup(heap, str, pars_sym_tab_global->string_len + 1));
 	pars_sym_tab_global->next_char_pos = 0;
 	pars_sym_tab_global->info = info;
 
@@ -1885,6 +2234,8 @@ pars_sql(
 	graph->sym_tab = pars_sym_tab_global;
 	graph->info = info;
 
+	pars_sym_tab_global = NULL;
+
 	/* fprintf(stderr, "SQL graph size %lu\n", mem_heap_get_size(heap)); */
 
 	return(graph);
@@ -1934,7 +2285,7 @@ pars_info_create(void)
 
 	heap = mem_heap_create(512);
 
-	info = mem_heap_alloc(heap, sizeof(*info));
+	info = static_cast<pars_info_t*>(mem_heap_alloc(heap, sizeof(*info)));
 
 	info->heap = heap;
 	info->funcs = NULL;
@@ -1974,16 +2325,22 @@ pars_info_add_literal(
 
 	ut_ad(!pars_info_get_bound_lit(info, name));
 
-	pbl = mem_heap_alloc(info->heap, sizeof(*pbl));
+	pbl = static_cast<pars_bound_lit_t*>(
+		mem_heap_alloc(info->heap, sizeof(*pbl)));
 
 	pbl->name = name;
+
 	pbl->address = address;
 	pbl->length = length;
 	pbl->type = type;
 	pbl->prtype = prtype;
 
 	if (!info->bound_lits) {
-		info->bound_lits = ib_vector_create(info->heap, 8);
+		ib_alloc_t*     heap_alloc;
+
+		heap_alloc = ib_heap_allocator_create(info->heap);
+
+		info->bound_lits = ib_vector_create(heap_alloc, sizeof(*pbl), 8);
 	}
 
 	ib_vector_push(info->bound_lits, pbl);
@@ -2004,6 +2361,63 @@ pars_info_add_str_literal(
 			      DATA_VARCHAR, DATA_ENGLISH);
 }
 
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry.*/
+UNIV_INTERN
+void
+pars_info_bind_literal(
+/*===================*/
+	pars_info_t*	info,		/* in: info struct */
+	const char*	name,		/* in: name */
+	const void*	address,	/* in: address */
+	ulint		length,		/* in: length of data */
+	ulint		type,		/* in: type, e.g. DATA_FIXBINARY */
+	ulint		prtype)		/* in: precise type, e.g. */
+{
+	pars_bound_lit_t*	pbl;
+
+	pbl = pars_info_lookup_bound_lit(info, name);
+
+	if (!pbl) {
+		pars_info_add_literal(
+			info, name, address, length, type, prtype);
+	} else {
+		pbl->address = address;
+		pbl->length = length;
+
+		sym_tab_rebind_lit(pbl->node, address, length);
+	}
+}
+
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry.*/
+UNIV_INTERN
+void
+pars_info_bind_varchar_literal(
+/*===========================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const byte*	str,		/*!< in: string */
+	ulint		str_len)	/*!< in: string length */
+{
+	pars_bound_lit_t*	pbl;
+
+	pbl = pars_info_lookup_bound_lit(info, name);
+
+	if (!pbl) {
+		pars_info_add_literal(
+			info, name, str, str_len, DATA_VARCHAR, DATA_ENGLISH);
+	} else {
+
+		pbl->address = str;
+		pbl->length = str_len;
+
+		sym_tab_rebind_lit(pbl->node, str, str_len);
+	}
+}
+
 /****************************************************************//**
 Equivalent to:
 
@@ -2021,12 +2435,65 @@ pars_info_add_int4_literal(
 	const char*	name,		/*!< in: name */
 	lint		val)		/*!< in: value */
 {
-	byte*	buf = mem_heap_alloc(info->heap, 4);
+	byte*	buf = static_cast<byte*>(mem_heap_alloc(info->heap, 4));
 
 	mach_write_to_4(buf, val);
 	pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
 }
 
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_int4_literal(
+/*========================*/
+	pars_info_t*		info,   /* in: info struct */
+	const char*		name,   /* in: name */
+	const ib_uint32_t*	val)    /* in: value */
+{
+	pars_bound_lit_t*       pbl;
+
+	pbl = pars_info_lookup_bound_lit(info, name);
+
+	if (!pbl) {
+		pars_info_add_literal(info, name, val, 4, DATA_INT, 0);
+	} else {
+
+		pbl->address = val;
+		pbl->length = sizeof(*val);
+
+		sym_tab_rebind_lit(pbl->node, val, sizeof(*val));
+	}
+}
+
+/********************************************************************
+If the literal value already exists then it rebinds otherwise it
+creates a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_int8_literal(
+/*========================*/
+	pars_info_t*		info,	/* in: info struct */
+	const char*		name,	/* in: name */
+	const ib_uint64_t*	val)	/* in: value */
+{
+        pars_bound_lit_t*	pbl;
+
+	pbl = pars_info_lookup_bound_lit(info, name);
+
+	if (!pbl) {
+		pars_info_add_literal(
+			info, name, val, sizeof(*val), DATA_INT, 0);
+	} else {
+
+		pbl->address = val;
+		pbl->length = sizeof(*val);
+
+		sym_tab_rebind_lit(pbl->node, val, sizeof(*val));
+	}
+}
+
 /****************************************************************//**
 Equivalent to:
 
@@ -2044,7 +2511,7 @@ pars_info_add_ull_literal(
 	const char*	name,		/*!< in: name */
 	ib_uint64_t	val)		/*!< in: value */
 {
-	byte*	buf = mem_heap_alloc(info->heap, 8);
+	byte*	buf = static_cast<byte*>(mem_heap_alloc(info->heap, 8));
 
 	mach_write_to_8(buf, val);
 
@@ -2055,8 +2522,8 @@ pars_info_add_ull_literal(
 Add user function. */
 UNIV_INTERN
 void
-pars_info_add_function(
-/*===================*/
+pars_info_bind_function(
+/*====================*/
 	pars_info_t*		info,	/*!< in: info struct */
 	const char*		name,	/*!< in: function name */
 	pars_user_func_cb_t	func,	/*!< in: function address */
@@ -2064,45 +2531,77 @@ pars_info_add_function(
 {
 	pars_user_func_t*	puf;
 
-	ut_ad(!pars_info_get_user_func(info, name));
+	puf = pars_info_lookup_user_func(info, name);
 
-	puf = mem_heap_alloc(info->heap, sizeof(*puf));
+	if (!puf) {
+		if (!info->funcs) {
+			ib_alloc_t*     heap_alloc;
 
-	puf->name = name;
-	puf->func = func;
-	puf->arg = arg;
+			heap_alloc = ib_heap_allocator_create(info->heap);
+
+			info->funcs = ib_vector_create(
+				heap_alloc, sizeof(*puf), 8);
+		}
 
-	if (!info->funcs) {
-		info->funcs = ib_vector_create(info->heap, 8);
+		/* Create a "new" element */
+		puf = static_cast<pars_user_func_t*>(
+			ib_vector_push(info->funcs, NULL));
+		puf->name = name;
 	}
 
-	ib_vector_push(info->funcs, puf);
+	puf->arg = arg;
+	puf->func = func;
 }
 
-/****************************************************************//**
+/********************************************************************
 Add bound id. */
 UNIV_INTERN
 void
-pars_info_add_id(
-/*=============*/
+pars_info_bind_id(
+/*==============*/
 	pars_info_t*	info,		/*!< in: info struct */
+	ibool		copy_name,	/* in: copy name if TRUE */
 	const char*	name,		/*!< in: name */
 	const char*	id)		/*!< in: id */
 {
 	pars_bound_id_t*	bid;
 
-	ut_ad(!pars_info_get_bound_id(info, name));
+	bid = pars_info_lookup_bound_id(info, name);
 
-	bid = mem_heap_alloc(info->heap, sizeof(*bid));
+	if (!bid) {
 
-	bid->name = name;
-	bid->id = id;
+		if (!info->bound_ids) {
+			ib_alloc_t*     heap_alloc;
+
+			heap_alloc = ib_heap_allocator_create(info->heap);
 
-	if (!info->bound_ids) {
-		info->bound_ids = ib_vector_create(info->heap, 8);
+			info->bound_ids = ib_vector_create(
+				heap_alloc, sizeof(*bid), 8);
+		}
+
+		/* Create a "new" element */
+		bid = static_cast<pars_bound_id_t*>(
+			ib_vector_push(info->bound_ids, NULL));
+
+		bid->name = (copy_name)
+		    ? mem_heap_strdup(info->heap, name) : name;
 	}
 
-	ib_vector_push(info->bound_ids, bid);
+	bid->id = id;
+}
+
+/********************************************************************
+Get bound identifier with the given name.*/
+
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+					/* out: bound id, or NULL if not
+					found */
+	pars_info_t*		info,	/* in: info struct */
+	const char*		name)	/* in: bound id name to find */
+{
+	return(pars_info_lookup_bound_id(info, name));
 }
 
 /****************************************************************//**
@@ -2115,24 +2614,7 @@ pars_info_get_user_func(
 	pars_info_t*		info,	/*!< in: info struct */
 	const char*		name)	/*!< in: function name to find*/
 {
-	ulint		i;
-	ib_vector_t*	vec;
-
-	if (!info || !info->funcs) {
-		return(NULL);
-	}
-
-	vec = info->funcs;
-
-	for (i = 0; i < ib_vector_size(vec); i++) {
-		pars_user_func_t*	puf = ib_vector_get(vec, i);
-
-		if (strcmp(puf->name, name) == 0) {
-			return(puf);
-		}
-	}
-
-	return(NULL);
+	return(pars_info_lookup_user_func(info, name));
 }
 
 /****************************************************************//**
@@ -2145,52 +2627,5 @@ pars_info_get_bound_lit(
 	pars_info_t*		info,	/*!< in: info struct */
 	const char*		name)	/*!< in: bound literal name to find */
 {
-	ulint		i;
-	ib_vector_t*	vec;
-
-	if (!info || !info->bound_lits) {
-		return(NULL);
-	}
-
-	vec = info->bound_lits;
-
-	for (i = 0; i < ib_vector_size(vec); i++) {
-		pars_bound_lit_t*	pbl = ib_vector_get(vec, i);
-
-		if (strcmp(pbl->name, name) == 0) {
-			return(pbl);
-		}
-	}
-
-	return(NULL);
-}
-
-/****************************************************************//**
-Get bound id with the given name.
-@return	bound id, or NULL if not found */
-UNIV_INTERN
-pars_bound_id_t*
-pars_info_get_bound_id(
-/*===================*/
-	pars_info_t*		info,	/*!< in: info struct */
-	const char*		name)	/*!< in: bound id name to find */
-{
-	ulint		i;
-	ib_vector_t*	vec;
-
-	if (!info || !info->bound_ids) {
-		return(NULL);
-	}
-
-	vec = info->bound_ids;
-
-	for (i = 0; i < ib_vector_size(vec); i++) {
-		pars_bound_id_t*	bid = ib_vector_get(vec, i);
-
-		if (strcmp(bid->name, name) == 0) {
-			return(bid);
-		}
-	}
-
-	return(NULL);
+	return(pars_info_lookup_bound_lit(info, name));
 }
diff --git a/storage/innobase/pars/pars0sym.c b/storage/innobase/pars/pars0sym.cc
index b56350116bb..c71ad8a6b39 100644
--- a/storage/innobase/pars/pars0sym.c
+++ b/storage/innobase/pars/pars0sym.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file pars/pars0sym.c
+@file pars/pars0sym.cc
 SQL parser symbol table
 
 Created 12/15/1997 Heikki Tuuri
@@ -49,7 +49,8 @@ sym_tab_create(
 {
 	sym_tab_t*	sym_tab;
 
-	sym_tab = mem_heap_alloc(heap, sizeof(sym_tab_t));
+	sym_tab = static_cast<sym_tab_t*>(
+		mem_heap_alloc(heap, sizeof(sym_tab_t)));
 
 	UT_LIST_INIT(sym_tab->sym_list);
 	UT_LIST_INIT(sym_tab->func_node_list);
@@ -59,6 +60,7 @@ sym_tab_create(
 	return(sym_tab);
 }
 
+
 /******************************************************************//**
 Frees the memory allocated dynamically AFTER parsing phase for variables
 etc. in the symbol table. Does not free the mem heap where the table was
@@ -72,9 +74,23 @@ sym_tab_free_private(
 	sym_node_t*	sym;
 	func_node_t*	func;
 
-	sym = UT_LIST_GET_FIRST(sym_tab->sym_list);
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	for (sym = UT_LIST_GET_FIRST(sym_tab->sym_list);
+	     sym != NULL;
+	     sym = UT_LIST_GET_NEXT(sym_list, sym)) {
+
+		/* Close the tables opened in pars_retrieve_table_def(). */
+
+		if (sym->token_type == SYM_TABLE_REF_COUNTED) {
+
+			dict_table_close(sym->table, TRUE);
+
+			sym->table = NULL;
+			sym->resolved = FALSE;
+			sym->token_type = SYM_UNSET;
+		}
 
-	while (sym) {
 		eval_node_free_val_buf(sym);
 
 		if (sym->prefetch_buf) {
@@ -84,16 +100,13 @@ sym_tab_free_private(
 		if (sym->cursor_def) {
 			que_graph_free_recursive(sym->cursor_def);
 		}
-
-		sym = UT_LIST_GET_NEXT(sym_list, sym);
 	}
 
-	func = UT_LIST_GET_FIRST(sym_tab->func_node_list);
+	for (func = UT_LIST_GET_FIRST(sym_tab->func_node_list);
+	     func != NULL;
+	     func = UT_LIST_GET_NEXT(func_node_list, func)) {
 
-	while (func) {
 		eval_node_free_val_buf(func);
-
-		func = UT_LIST_GET_NEXT(func_node_list, func);
 	}
 }
 
@@ -110,10 +123,12 @@ sym_tab_add_int_lit(
 	sym_node_t*	node;
 	byte*		data;
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
 
 	node->common.type = QUE_NODE_SYMBOL;
 
+	node->table = NULL;
 	node->resolved = TRUE;
 	node->token_type = SYM_LIT;
 
@@ -121,7 +136,7 @@ sym_tab_add_int_lit(
 
 	dtype_set(dfield_get_type(&node->common.val), DATA_INT, 0, 4);
 
-	data = mem_heap_alloc(sym_tab->heap, 4);
+	data = static_cast<byte*>(mem_heap_alloc(sym_tab->heap, 4));
 	mach_write_to_4(data, val);
 
 	dfield_set_data(&(node->common.val), data, 4);
@@ -132,6 +147,8 @@ sym_tab_add_int_lit(
 
 	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
 
+	node->like_node = NULL;
+
 	node->sym_table = sym_tab;
 
 	return(node);
@@ -145,17 +162,19 @@ sym_node_t*
 sym_tab_add_str_lit(
 /*================*/
 	sym_tab_t*	sym_tab,	/*!< in: symbol table */
-	byte*		str,		/*!< in: string with no quotes around
+	const byte*	str,		/*!< in: string with no quotes around
 					it */
 	ulint		len)		/*!< in: string length */
 {
 	sym_node_t*	node;
 	byte*		data;
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
 
 	node->common.type = QUE_NODE_SYMBOL;
 
+	node->table = NULL;
 	node->resolved = TRUE;
 	node->token_type = SYM_LIT;
 
@@ -164,12 +183,8 @@ sym_tab_add_str_lit(
 	dtype_set(dfield_get_type(&node->common.val),
 		  DATA_VARCHAR, DATA_ENGLISH, 0);
 
-	if (len) {
-		data = mem_heap_alloc(sym_tab->heap, len);
-		ut_memcpy(data, str, len);
-	} else {
-		data = NULL;
-	}
+	data = (len) ? static_cast<byte*>(mem_heap_dup(sym_tab->heap, str, len))
+	      	     : NULL;
 
 	dfield_set_data(&(node->common.val), data, len);
 
@@ -179,6 +194,8 @@ sym_tab_add_str_lit(
 
 	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
 
+	node->like_node = NULL;
+
 	node->sym_table = sym_tab;
 
 	return(node);
@@ -202,10 +219,13 @@ sym_tab_add_bound_lit(
 	blit = pars_info_get_bound_lit(sym_tab->info, name);
 	ut_a(blit);
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
 
 	node->common.type = QUE_NODE_SYMBOL;
+	node->common.brother = node->common.parent = NULL;
 
+	node->table = NULL;
 	node->resolved = TRUE;
 	node->token_type = SYM_LIT;
 
@@ -255,11 +275,57 @@ sym_tab_add_bound_lit(
 
 	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
 
+	blit->node = node;
+	node->like_node = NULL;
 	node->sym_table = sym_tab;
 
 	return(node);
 }
 
+/**********************************************************************
+Rebind literal to a node in the symbol table. */
+
+sym_node_t*
+sym_tab_rebind_lit(
+/*===============*/
+					/* out: symbol table node */
+	sym_node_t*	node,		/* in: node that is bound to literal*/
+	const void*	address,	/* in: pointer to data */
+	ulint		length)		/* in: length of data */
+{
+	dfield_t*	dfield = que_node_get_val(node);
+	dtype_t*	dtype = dfield_get_type(dfield);
+
+	ut_a(node->token_type == SYM_LIT);
+
+	dfield_set_data(&node->common.val, address, length);
+
+	if (node->like_node) {
+
+	    ut_a(dtype_get_mtype(dtype) == DATA_CHAR
+		 || dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+		/* Don't force [FALSE] creation of sub-nodes (for LIKE) */
+		pars_like_rebind(
+			node,static_cast<const byte*>(address), length);
+	}
+
+	/* FIXME: What's this ? */
+	node->common.val_buf_size = 0;
+
+	if (node->prefetch_buf) {
+		sel_col_prefetch_buf_free(node->prefetch_buf);
+		node->prefetch_buf = NULL;
+	}
+
+	if (node->cursor_def) {
+		que_graph_free_recursive(node->cursor_def);
+		node->cursor_def = NULL;
+	}
+
+	return(node);
+}
+
 /******************************************************************//**
 Adds an SQL null literal to a symbol table.
 @return	symbol table node */
@@ -271,10 +337,12 @@ sym_tab_add_null_lit(
 {
 	sym_node_t*	node;
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
 
 	node->common.type = QUE_NODE_SYMBOL;
 
+	node->table = NULL;
 	node->resolved = TRUE;
 	node->token_type = SYM_LIT;
 
@@ -290,6 +358,8 @@ sym_tab_add_null_lit(
 
 	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
 
+	node->like_node = NULL;
+
 	node->sym_table = sym_tab;
 
 	return(node);
@@ -308,13 +378,11 @@ sym_tab_add_id(
 {
 	sym_node_t*	node;
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_zalloc(sym_tab->heap, sizeof(*node)));
 
 	node->common.type = QUE_NODE_SYMBOL;
 
-	node->resolved = FALSE;
-	node->indirection = NULL;
-
 	node->name = mem_heap_strdupl(sym_tab->heap, (char*) name, len);
 	node->name_len = len;
 
@@ -322,10 +390,6 @@ sym_tab_add_id(
 
 	dfield_set_null(&node->common.val);
 
-	node->common.val_buf_size = 0;
-	node->prefetch_buf = NULL;
-	node->cursor_def = NULL;
-
 	node->sym_table = sym_tab;
 
 	return(node);
@@ -337,7 +401,7 @@ Add a bound identifier to a symbol table.
 UNIV_INTERN
 sym_node_t*
 sym_tab_add_bound_id(
-/*===========*/
+/*=================*/
 	sym_tab_t*	sym_tab,	/*!< in: symbol table */
 	const char*	name)		/*!< in: name of bound id */
 {
@@ -347,11 +411,14 @@ sym_tab_add_bound_id(
 	bid = pars_info_get_bound_id(sym_tab->info, name);
 	ut_a(bid);
 
-	node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+	node = static_cast<sym_node_t*>(
+		mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
 
 	node->common.type = QUE_NODE_SYMBOL;
 
+	node->table = NULL;
 	node->resolved = FALSE;
+	node->token_type = SYM_UNSET;
 	node->indirection = NULL;
 
 	node->name = mem_heap_strdup(sym_tab->heap, bid->id);
@@ -365,6 +432,8 @@ sym_tab_add_bound_id(
 	node->prefetch_buf = NULL;
 	node->cursor_def = NULL;
 
+	node->like_node = NULL;
+
 	node->sym_table = sym_tab;
 
 	return(node);
diff --git a/storage/innobase/plugin_exports b/storage/innobase/plugin_exports
deleted file mode 100644
index 03b8cf8c217..00000000000
--- a/storage/innobase/plugin_exports
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- global:
- _mysql_plugin_interface_version_;
- _mysql_sizeof_struct_st_plugin_;
- _mysql_plugin_declarations_;
- thd_wait_service;
- my_snprintf_service;
- thd_alloc_service;
- local:
- *;
-};
diff --git a/storage/innobase/que/que0que.c b/storage/innobase/que/que0que.cc
index 384fb490b86..c023723685c 100644
--- a/storage/innobase/que/que0que.c
+++ b/storage/innobase/que/que0que.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file que/que0que.c
+@file que/que0que.cc
 Query graph
 
 Created 5/27/1996 Heikki Tuuri
@@ -40,11 +40,10 @@ Created 5/27/1996 Heikki Tuuri
 #include "dict0crea.h"
 #include "log0log.h"
 #include "eval0proc.h"
+#include "lock0lock.h"
 #include "eval0eval.h"
 #include "pars0types.h"
 
-#define QUE_PARALLELIZE_LIMIT	(64 * 256 * 256 * 256)
-#define QUE_ROUND_ROBIN_LIMIT	(64 * 256 * 256 * 256)
 #define QUE_MAX_LOOPS_WITHOUT_CHECK	16
 
 #ifdef UNIV_DEBUG
@@ -101,22 +100,10 @@ A = assign_node_t, W = while_node_t. */
 is executed?
 
 The commit or rollback can be seen as a subprocedure call.
-The problem is that if there are several query threads
-currently running within the transaction, their action could
-mess the commit or rollback operation. Or, at the least, the
-operation would be difficult to visualize and keep in control.
-
-Therefore the query thread requesting a commit or a rollback
-sends to the transaction a signal, which moves the transaction
-to TRX_QUE_SIGNALED state. All running query threads of the
-transaction will eventually notice that the transaction is now in
-this state and voluntarily suspend themselves. Only the last
-query thread which suspends itself will trigger handling of
-the signal.
-
-When the transaction starts to handle a rollback or commit
-signal, it builds a query graph which, when executed, will
-roll back or commit the incomplete transaction. The transaction
+
+When the transaction starts to handle a rollback or commit.
+It builds a query graph which, when executed, will roll back
+or commit the incomplete transaction. The transaction
 is moved to the TRX_QUE_ROLLING_BACK or TRX_QUE_COMMITTING state.
 If specified, the SQL cursors opened by the transaction are closed.
 When the execution of the graph completes, it is like returning
@@ -135,20 +122,6 @@ que_thr_move_to_run_state(
 	que_thr_t*	thr);	/*!< in: an query thread */
 
 /***********************************************************************//**
-Adds a query graph to the session's list of graphs. */
-UNIV_INTERN
-void
-que_graph_publish(
-/*==============*/
-	que_t*	graph,	/*!< in: graph */
-	sess_t*	sess)	/*!< in: session */
-{
-	ut_ad(mutex_own(&kernel_mutex));
-
-	UT_LIST_ADD_LAST(graphs, sess->graphs, graph);
-}
-
-/***********************************************************************//**
 Creates a query graph fork node.
 @return	own: fork node */
 UNIV_INTERN
@@ -166,30 +139,19 @@ que_fork_create(
 
 	ut_ad(heap);
 
-	fork = mem_heap_alloc(heap, sizeof(que_fork_t));
+	fork = static_cast<que_fork_t*>(mem_heap_zalloc(heap, sizeof(*fork)));
 
-	fork->common.type = QUE_NODE_FORK;
-	fork->n_active_thrs = 0;
-
-	fork->state = QUE_FORK_COMMAND_WAIT;
-
-	if (graph != NULL) {
-		fork->graph = graph;
-	} else {
-		fork->graph = fork;
-	}
+	fork->heap = heap;
 
-	fork->common.parent = parent;
 	fork->fork_type = fork_type;
 
-	fork->caller = NULL;
+	fork->common.parent = parent;
 
-	UT_LIST_INIT(fork->thrs);
+	fork->common.type = QUE_NODE_FORK;
 
-	fork->sym_tab = NULL;
-	fork->info = NULL;
+	fork->state = QUE_FORK_COMMAND_WAIT;
 
-	fork->heap = heap;
+	fork->graph = (graph != NULL) ? graph : fork;
 
 	return(fork);
 }
@@ -208,21 +170,18 @@ que_thr_create(
 
 	ut_ad(parent && heap);
 
-	thr = mem_heap_alloc(heap, sizeof(que_thr_t));
+	thr = static_cast<que_thr_t*>(mem_heap_zalloc(heap, sizeof(*thr)));
+
+	thr->graph = parent->graph;
 
-	thr->common.type = QUE_NODE_THR;
 	thr->common.parent = parent;
 
 	thr->magic_n = QUE_THR_MAGIC_N;
 
-	thr->graph = parent->graph;
+	thr->common.type = QUE_NODE_THR;
 
 	thr->state = QUE_THR_COMMAND_WAIT;
 
-	thr->is_active = FALSE;
-
-	thr->run_node = NULL;
-	thr->resource = 0;
 	thr->lock_state = QUE_THR_LOCK_NOLOCK;
 
 	UT_LIST_ADD_LAST(thrs, parent->thrs, thr);
@@ -232,87 +191,43 @@ que_thr_create(
 
 /**********************************************************************//**
 Moves a suspended query thread to the QUE_THR_RUNNING state and may release
-a single worker thread to execute it. This function should be used to end
+a worker thread to execute it. This function should be used to end
 the wait state of a query thread waiting for a lock or a stored procedure
-completion. */
+completion.
+@return the query thread that needs to be released. */
 UNIV_INTERN
-void
-que_thr_end_wait(
-/*=============*/
-	que_thr_t*	thr,		/*!< in: query thread in the
-					QUE_THR_LOCK_WAIT,
-					or QUE_THR_PROCEDURE_WAIT, or
-					QUE_THR_SIG_REPLY_WAIT state */
-	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread; if NULL is passed
-					as the parameter, it is ignored */
+que_thr_t*
+que_thr_end_lock_wait(
+/*==================*/
+	trx_t*		trx)	/*!< in: transaction with que_state in
+		       		QUE_THR_LOCK_WAIT */
 {
-	ibool	was_active;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(thr);
-	ut_ad((thr->state == QUE_THR_LOCK_WAIT)
-	      || (thr->state == QUE_THR_PROCEDURE_WAIT)
-	      || (thr->state == QUE_THR_SIG_REPLY_WAIT));
-	ut_ad(thr->run_node);
-
-	thr->prev_node = thr->run_node;
-
-	was_active = thr->is_active;
+	que_thr_t*	thr;
+	ibool		was_active;
 
-	que_thr_move_to_run_state(thr);
+	ut_ad(lock_mutex_own());
+	ut_ad(trx_mutex_own(trx));
 
-	if (was_active) {
+	thr = trx->lock.wait_thr;
 
-		return;
-	}
+	ut_ad(thr != NULL);
 
-	if (next_thr && *next_thr == NULL) {
-		*next_thr = thr;
-	} else {
-		ut_a(0);
-		srv_que_task_enqueue_low(thr);
-	}
-}
-
-/**********************************************************************//**
-Same as que_thr_end_wait, but no parameter next_thr available. */
-UNIV_INTERN
-void
-que_thr_end_wait_no_next_thr(
-/*=========================*/
-	que_thr_t*	thr)	/*!< in: query thread in the QUE_THR_LOCK_WAIT,
-				or QUE_THR_PROCEDURE_WAIT, or
-				QUE_THR_SIG_REPLY_WAIT state */
-{
-	ibool	was_active;
-
-	ut_a(thr->state == QUE_THR_LOCK_WAIT);	/* In MySQL this is the
-						only possible state here */
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(thr);
-	ut_ad((thr->state == QUE_THR_LOCK_WAIT)
-	      || (thr->state == QUE_THR_PROCEDURE_WAIT)
-	      || (thr->state == QUE_THR_SIG_REPLY_WAIT));
+	ut_ad(trx->lock.que_state == TRX_QUE_LOCK_WAIT);
+	/* In MySQL this is the only possible state here */
+	ut_a(thr->state == QUE_THR_LOCK_WAIT);
 
 	was_active = thr->is_active;
 
 	que_thr_move_to_run_state(thr);
 
-	if (was_active) {
+	trx->lock.que_state = TRX_QUE_RUNNING;
 
-		return;
-	}
+	trx->lock.wait_thr = NULL;
 
 	/* In MySQL we let the OS thread (not just the query thread) to wait
 	for the lock to be released: */
 
-	srv_release_mysql_thread_if_suspended(thr);
-
-	/* srv_que_task_enqueue_low(thr); */
+	return((!was_active && thr != NULL) ? thr : NULL);
 }
 
 /**********************************************************************//**
@@ -330,6 +245,53 @@ que_thr_init_command(
 }
 
 /**********************************************************************//**
+Round robin scheduler.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_scheduler_round_robin(
+/*===========================*/
+	que_fork_t*	fork,		/*!< in: a query fork */
+	que_thr_t*	thr)		/*!< in: current pos */
+{
+	trx_mutex_enter(fork->trx);
+
+	/* If no current, start first available. */
+	if (thr == NULL) {
+		thr = UT_LIST_GET_FIRST(fork->thrs);
+	} else {
+		thr = UT_LIST_GET_NEXT(thrs, thr);
+	}
+
+	if (thr) {
+
+		fork->state = QUE_FORK_ACTIVE;
+
+		fork->last_sel_node = NULL;
+
+		switch (thr->state) {
+		case QUE_THR_COMMAND_WAIT:
+		case QUE_THR_COMPLETED:
+			ut_a(!thr->is_active);
+			que_thr_init_command(thr);
+			break;
+
+		case QUE_THR_SUSPENDED:
+		case QUE_THR_LOCK_WAIT:
+		default:
+			ut_error;
+
+		}
+	}
+
+	trx_mutex_exit(fork->trx);
+
+	return(thr);
+}
+
+/**********************************************************************//**
 Starts execution of a command in a query fork. Picks a query thread which
 is not in the QUE_THR_RUNNING state and moves it to that state. If none
 can be chosen, a situation which may arise in parallelized fetches, NULL
@@ -363,11 +325,12 @@ que_fork_start_command(
 	state, finally we try to find a query thread in the QUE_THR_COMPLETED
 	state */
 
-	thr = UT_LIST_GET_FIRST(fork->thrs);
-
 	/* We make a single pass over the thr list within which we note which
 	threads are ready to run. */
-	while (thr) {
+	for (thr = UT_LIST_GET_FIRST(fork->thrs);
+	     thr != NULL;
+	     thr = UT_LIST_GET_NEXT(thrs, thr)) {
+
 		switch (thr->state) {
 		case QUE_THR_COMMAND_WAIT:
 
@@ -399,8 +362,6 @@ que_fork_start_command(
 			ut_error;
 
 		}
-
-		thr = UT_LIST_GET_NEXT(thrs, thr);
 	}
 
 	if (suspended_thr) {
@@ -412,52 +373,13 @@ que_fork_start_command(
 
 		thr = completed_thr;
 		que_thr_init_command(thr);
+	} else {
+		ut_error;
 	}
 
 	return(thr);
 }
 
-/**********************************************************************//**
-After signal handling is finished, returns control to a query graph error
-handling routine. (Currently, just returns the control to the root of the
-graph so that the graph can communicate an error message to the client.) */
-UNIV_INTERN
-void
-que_fork_error_handle(
-/*==================*/
-	trx_t*	trx __attribute__((unused)),	/*!< in: trx */
-	que_t*	fork)	/*!< in: query graph which was run before signal
-			handling started, NULL not allowed */
-{
-	que_thr_t*	thr;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(trx->sess->state == SESS_ERROR);
-	ut_ad(UT_LIST_GET_LEN(trx->reply_signals) == 0);
-	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
-
-	thr = UT_LIST_GET_FIRST(fork->thrs);
-
-	while (thr != NULL) {
-		ut_ad(!thr->is_active);
-		ut_ad(thr->state != QUE_THR_SIG_REPLY_WAIT);
-		ut_ad(thr->state != QUE_THR_LOCK_WAIT);
-
-		thr->run_node = thr;
-		thr->prev_node = thr->child;
-		thr->state = QUE_THR_COMPLETED;
-
-		thr = UT_LIST_GET_NEXT(thrs, thr);
-	}
-
-	thr = UT_LIST_GET_FIRST(fork->thrs);
-
-	que_thr_move_to_run_state(thr);
-
-	ut_a(0);
-	srv_que_task_enqueue_low(thr);
-}
-
 /****************************************************************//**
 Tests if all the query threads in the same fork have a given state.
 @return TRUE if all the query threads in the same fork were in the
@@ -471,15 +393,14 @@ que_fork_all_thrs_in_state(
 {
 	que_thr_t*	thr_node;
 
-	thr_node = UT_LIST_GET_FIRST(fork->thrs);
+	for (thr_node = UT_LIST_GET_FIRST(fork->thrs);
+	     thr_node != NULL;
+	     thr_node = UT_LIST_GET_NEXT(thrs, thr_node)) {
 
-	while (thr_node != NULL) {
 		if (thr_node->state != state) {
 
 			return(FALSE);
 		}
-
-		thr_node = UT_LIST_GET_NEXT(thrs, thr_node);
 	}
 
 	return(TRUE);
@@ -527,7 +448,7 @@ que_graph_free_recursive(
 	switch (que_node_get_type(node)) {
 
 	case QUE_NODE_FORK:
-		fork = node;
+		fork = static_cast<que_fork_t*>(node);
 
 		thr = UT_LIST_GET_FIRST(fork->thrs);
 
@@ -540,7 +461,7 @@ que_graph_free_recursive(
 		break;
 	case QUE_NODE_THR:
 
-		thr = node;
+		thr = static_cast<que_thr_t*>(node);
 
 		if (thr->magic_n != QUE_THR_MAGIC_N) {
 			fprintf(stderr,
@@ -558,21 +479,21 @@ que_graph_free_recursive(
 		break;
 	case QUE_NODE_UNDO:
 
-		undo = node;
+		undo = static_cast<undo_node_t*>(node);
 
 		mem_heap_free(undo->heap);
 
 		break;
 	case QUE_NODE_SELECT:
 
-		sel = node;
+		sel = static_cast<sel_node_t*>(node);
 
 		sel_node_free_private(sel);
 
 		break;
 	case QUE_NODE_INSERT:
 
-		ins = node;
+		ins = static_cast<ins_node_t*>(node);
 
 		que_graph_free_recursive(ins->select);
 
@@ -580,7 +501,7 @@ que_graph_free_recursive(
 
 		break;
 	case QUE_NODE_PURGE:
-		purge = node;
+		purge = static_cast<purge_node_t*>(node);
 
 		mem_heap_free(purge->heap);
 
@@ -588,7 +509,7 @@ que_graph_free_recursive(
 
 	case QUE_NODE_UPDATE:
 
-		upd = node;
+		upd = static_cast<upd_node_t*>(node);
 
 		if (upd->in_mysql_interface) {
 
@@ -607,7 +528,7 @@ que_graph_free_recursive(
 
 		break;
 	case QUE_NODE_CREATE_TABLE:
-		cre_tab = node;
+		cre_tab = static_cast<tab_node_t*>(node);
 
 		que_graph_free_recursive(cre_tab->tab_def);
 		que_graph_free_recursive(cre_tab->col_def);
@@ -617,7 +538,7 @@ que_graph_free_recursive(
 
 		break;
 	case QUE_NODE_CREATE_INDEX:
-		cre_ind = node;
+		cre_ind = static_cast<ind_node_t*>(node);
 
 		que_graph_free_recursive(cre_ind->ind_def);
 		que_graph_free_recursive(cre_ind->field_def);
@@ -627,25 +548,25 @@ que_graph_free_recursive(
 
 		break;
 	case QUE_NODE_PROC:
-		que_graph_free_stat_list(((proc_node_t*)node)->stat_list);
+		que_graph_free_stat_list(((proc_node_t*) node)->stat_list);
 
 		break;
 	case QUE_NODE_IF:
-		que_graph_free_stat_list(((if_node_t*)node)->stat_list);
-		que_graph_free_stat_list(((if_node_t*)node)->else_part);
-		que_graph_free_stat_list(((if_node_t*)node)->elsif_list);
+		que_graph_free_stat_list(((if_node_t*) node)->stat_list);
+		que_graph_free_stat_list(((if_node_t*) node)->else_part);
+		que_graph_free_stat_list(((if_node_t*) node)->elsif_list);
 
 		break;
 	case QUE_NODE_ELSIF:
-		que_graph_free_stat_list(((elsif_node_t*)node)->stat_list);
+		que_graph_free_stat_list(((elsif_node_t*) node)->stat_list);
 
 		break;
 	case QUE_NODE_WHILE:
-		que_graph_free_stat_list(((while_node_t*)node)->stat_list);
+		que_graph_free_stat_list(((while_node_t*) node)->stat_list);
 
 		break;
 	case QUE_NODE_FOR:
-		que_graph_free_stat_list(((for_node_t*)node)->stat_list);
+		que_graph_free_stat_list(((for_node_t*) node)->stat_list);
 
 		break;
 
@@ -724,11 +645,11 @@ que_thr_node_step(
 		return(thr);
 	}
 
-	mutex_enter(&kernel_mutex);
+	trx_mutex_enter(thr_get_trx(thr));
 
 	if (que_thr_peek_stop(thr)) {
 
-		mutex_exit(&kernel_mutex);
+		trx_mutex_exit(thr_get_trx(thr));
 
 		return(thr);
 	}
@@ -737,7 +658,7 @@ que_thr_node_step(
 
 	thr->state = QUE_THR_COMPLETED;
 
-	mutex_exit(&kernel_mutex);
+	trx_mutex_exit(thr_get_trx(thr));
 
 	return(NULL);
 }
@@ -754,35 +675,75 @@ que_thr_move_to_run_state(
 /*======================*/
 	que_thr_t*	thr)	/*!< in: an query thread */
 {
-	trx_t*	trx;
-
 	ut_ad(thr->state != QUE_THR_RUNNING);
 
-	trx = thr_get_trx(thr);
-
 	if (!thr->is_active) {
+		trx_t*	trx;
 
-		(thr->graph)->n_active_thrs++;
+		trx = thr_get_trx(thr);
 
-		trx->n_active_thrs++;
+		thr->graph->n_active_thrs++;
 
-		thr->is_active = TRUE;
+		trx->lock.n_active_thrs++;
 
-		ut_ad((thr->graph)->n_active_thrs == 1);
-		ut_ad(trx->n_active_thrs == 1);
+		thr->is_active = TRUE;
 	}
 
 	thr->state = QUE_THR_RUNNING;
 }
 
 /**********************************************************************//**
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx.
+@return	TRUE if stopped */
+UNIV_INTERN
+ibool
+que_thr_stop(
+/*=========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	que_t*		graph;
+	trx_t*		trx = thr_get_trx(thr);
+
+	graph = thr->graph;
+
+	ut_ad(trx_mutex_own(trx));
+
+	if (graph->state == QUE_FORK_COMMAND_WAIT) {
+
+		thr->state = QUE_THR_SUSPENDED;
+
+	} else if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+		trx->lock.wait_thr = thr;
+		thr->state = QUE_THR_LOCK_WAIT;
+
+	} else if (trx->error_state != DB_SUCCESS
+		   && trx->error_state != DB_LOCK_WAIT) {
+
+		/* Error handling built for the MySQL interface */
+		thr->state = QUE_THR_COMPLETED;
+
+	} else if (graph->fork_type == QUE_FORK_ROLLBACK) {
+
+		thr->state = QUE_THR_SUSPENDED;
+	} else {
+		ut_ad(graph->state == QUE_FORK_ACTIVE);
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
 Decrements the query thread reference counts in the query graph and the
-transaction. May start signal handling, e.g., a rollback.
+transaction.
 *** NOTE ***:
 This and que_thr_stop_for_mysql are the only functions where the reference
 count can be decremented and this function may only be called from inside
-que_run_threads or que_thr_check_if_switch! These restrictions exist to make
-the rollback code easier to maintain. */
+que_run_threads! These restrictions exist to make the rollback code easier
+to maintain. */
 static
 void
 que_thr_dec_refer_count(
@@ -794,161 +755,57 @@ que_thr_dec_refer_count(
 					calling function can start running
 					a new query thread */
 {
-	que_fork_t*	fork;
 	trx_t*		trx;
-	ulint		fork_type;
-	ibool		stopped;
+	que_fork_t*	fork;
 
-	fork = thr->common.parent;
 	trx = thr_get_trx(thr);
 
-	mutex_enter(&kernel_mutex);
-
 	ut_a(thr->is_active);
+	ut_ad(trx_mutex_own(trx));
 
 	if (thr->state == QUE_THR_RUNNING) {
 
-		stopped = que_thr_stop(thr);
+		if (!que_thr_stop(thr)) {
+
+			ut_a(next_thr != NULL && *next_thr == NULL);
 
-		if (!stopped) {
 			/* The reason for the thr suspension or wait was
 			already canceled before we came here: continue
-			running the thread */
-
-			/* fputs("!!!!!!!! Wait already ended: continue thr\n",
-			stderr); */
-
-			if (next_thr && *next_thr == NULL) {
-				/* Normally srv_suspend_mysql_thread resets
-				the state to DB_SUCCESS before waiting, but
-				in this case we have to do it here,
-				otherwise nobody does it. */
-				trx->error_state = DB_SUCCESS;
-
-				*next_thr = thr;
-			} else {
-				ut_error;
-				srv_que_task_enqueue_low(thr);
-			}
+			running the thread.
 
-			mutex_exit(&kernel_mutex);
+			This is also possible because in trx_commit_step() we
+			assume a single query thread. We set the query thread
+			state to QUE_THR_RUNNING. */
 
-			return;
-		}
-	}
-
-	ut_ad(fork->n_active_thrs == 1);
-	ut_ad(trx->n_active_thrs == 1);
+			/* fprintf(stderr,
+		       		"Wait already ended: trx: %p\n", trx); */
 
-	fork->n_active_thrs--;
-	trx->n_active_thrs--;
-
-	thr->is_active = FALSE;
+			/* Normally srv_suspend_mysql_thread resets
+			the state to DB_SUCCESS before waiting, but
+			in this case we have to do it here,
+			otherwise nobody does it. */
 
-	if (trx->n_active_thrs > 0) {
-
-		mutex_exit(&kernel_mutex);
-
-		return;
-	}
+			trx->error_state = DB_SUCCESS;
 
-	fork_type = fork->fork_type;
+			*next_thr = thr;
 
-	/* Check if all query threads in the same fork are completed */
-
-	if (que_fork_all_thrs_in_state(fork, QUE_THR_COMPLETED)) {
-
-		switch (fork_type) {
-		case QUE_FORK_ROLLBACK:
-			/* This is really the undo graph used in rollback,
-			no roll_node in this graph */
-
-			ut_ad(UT_LIST_GET_LEN(trx->signals) > 0);
-			ut_ad(trx->handling_signals == TRUE);
-
-			trx_finish_rollback_off_kernel(fork, trx, next_thr);
-			break;
-
-		case QUE_FORK_PURGE:
-		case QUE_FORK_RECOVERY:
-		case QUE_FORK_MYSQL_INTERFACE:
-
-			/* Do nothing */
-			break;
-
-		default:
-			ut_error;	/*!< not used in MySQL */
+			return;
 		}
 	}
 
-	if (UT_LIST_GET_LEN(trx->signals) > 0 && trx->n_active_thrs == 0) {
-
-		/* If the trx is signaled and its query thread count drops to
-		zero, then we start processing a signal; from it we may get
-		a new query thread to run */
-
-		trx_sig_start_handle(trx, next_thr);
-	}
-
-	if (trx->handling_signals && UT_LIST_GET_LEN(trx->signals) == 0) {
+	fork = static_cast<que_fork_t*>(thr->common.parent);
 
-		trx_end_signal_handling(trx);
-	}
+	--trx->lock.n_active_thrs;
 
-	mutex_exit(&kernel_mutex);
-}
+	--fork->n_active_thrs;
 
-/**********************************************************************//**
-Stops a query thread if graph or trx is in a state requiring it. The
-conditions are tested in the order (1) graph, (2) trx. The kernel mutex has
-to be reserved.
-@return	TRUE if stopped */
-UNIV_INTERN
-ibool
-que_thr_stop(
-/*=========*/
-	que_thr_t*	thr)	/*!< in: query thread */
-{
-	trx_t*	trx;
-	que_t*	graph;
-	ibool	ret	= TRUE;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	graph = thr->graph;
-	trx = graph->trx;
-
-	if (graph->state == QUE_FORK_COMMAND_WAIT) {
-		thr->state = QUE_THR_SUSPENDED;
-
-	} else if (trx->que_state == TRX_QUE_LOCK_WAIT) {
-
-		UT_LIST_ADD_FIRST(trx_thrs, trx->wait_thrs, thr);
-		thr->state = QUE_THR_LOCK_WAIT;
-
-	} else if (trx->error_state != DB_SUCCESS
-		   && trx->error_state != DB_LOCK_WAIT) {
-
-		/* Error handling built for the MySQL interface */
-		thr->state = QUE_THR_COMPLETED;
-
-	} else if (UT_LIST_GET_LEN(trx->signals) > 0
-		   && graph->fork_type != QUE_FORK_ROLLBACK) {
-
-		thr->state = QUE_THR_SUSPENDED;
-	} else {
-		ut_ad(graph->state == QUE_FORK_ACTIVE);
-
-		ret = FALSE;
-	}
-
-	return(ret);
+	thr->is_active = FALSE;
 }
 
 /**********************************************************************//**
 A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
 query thread is stopped and made inactive, except in the case where
-it was put to the lock wait state in lock0lock.c, but the lock has already
+it was put to the lock wait state in lock0lock.cc, but the lock has already
 been granted or the transaction chosen as a victim in deadlock resolution. */
 UNIV_INTERN
 void
@@ -960,7 +817,10 @@ que_thr_stop_for_mysql(
 
 	trx = thr_get_trx(thr);
 
-	mutex_enter(&kernel_mutex);
+	/* Can't be the purge transaction. */
+	ut_a(trx->id != 0);
+
+	trx_mutex_enter(trx);
 
 	if (thr->state == QUE_THR_RUNNING) {
 
@@ -974,22 +834,22 @@ que_thr_stop_for_mysql(
 			already released, or this transaction was chosen
 			as a victim in selective deadlock resolution */
 
-			mutex_exit(&kernel_mutex);
+			trx_mutex_exit(trx);
 
 			return;
 		}
 	}
 
 	ut_ad(thr->is_active == TRUE);
-	ut_ad(trx->n_active_thrs == 1);
+	ut_ad(trx->lock.n_active_thrs == 1);
 	ut_ad(thr->graph->n_active_thrs == 1);
 
 	thr->is_active = FALSE;
-	(thr->graph)->n_active_thrs--;
+	thr->graph->n_active_thrs--;
 
-	trx->n_active_thrs--;
+	trx->lock.n_active_thrs--;
 
-	mutex_exit(&kernel_mutex);
+	trx_mutex_exit(trx);
 }
 
 /**********************************************************************//**
@@ -1017,7 +877,7 @@ que_thr_move_to_run_state_for_mysql(
 
 		thr->graph->n_active_thrs++;
 
-		trx->n_active_thrs++;
+		trx->lock.n_active_thrs++;
 
 		thr->is_active = TRUE;
 	}
@@ -1036,8 +896,9 @@ que_thr_stop_for_mysql_no_error(
 	trx_t*		trx)	/*!< in: transaction */
 {
 	ut_ad(thr->state == QUE_THR_RUNNING);
+	ut_ad(thr_get_trx(thr)->id != 0);
 	ut_ad(thr->is_active == TRUE);
-	ut_ad(trx->n_active_thrs == 1);
+	ut_ad(trx->lock.n_active_thrs == 1);
 	ut_ad(thr->graph->n_active_thrs == 1);
 
 	if (thr->magic_n != QUE_THR_MAGIC_N) {
@@ -1053,9 +914,9 @@ que_thr_stop_for_mysql_no_error(
 	thr->state = QUE_THR_COMPLETED;
 
 	thr->is_active = FALSE;
-	(thr->graph)->n_active_thrs--;
+	thr->graph->n_active_thrs--;
 
-	trx->n_active_thrs--;
+	trx->lock.n_active_thrs--;
 }
 
 /****************************************************************//**
@@ -1234,9 +1095,6 @@ que_thr_step(
 	} else if (type == QUE_NODE_LOCK) {
 
 		ut_error;
-		/*
-		thr = que_lock_step(thr);
-		*/
 	} else if (type == QUE_NODE_THR) {
 		thr = que_thr_node_step(thr);
 	} else if (type == QUE_NODE_COMMIT) {
@@ -1282,51 +1140,56 @@ que_run_threads_low(
 /*================*/
 	que_thr_t*	thr)	/*!< in: query thread */
 {
+	trx_t*		trx;
 	que_thr_t*	next_thr;
-	ulint		loop_count;
 
 	ut_ad(thr->state == QUE_THR_RUNNING);
 	ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
-	ut_ad(!mutex_own(&kernel_mutex));
+	ut_ad(!trx_mutex_own(thr_get_trx(thr)));
 
-	loop_count = QUE_MAX_LOOPS_WITHOUT_CHECK;
-loop:
-	/* Check that there is enough space in the log to accommodate
-	possible log entries by this query step; if the operation can touch
-	more than about 4 pages, checks must be made also within the query
-	step! */
+	/* cumul_resource counts how much resources the OS thread (NOT the
+	query thread) has spent in this function */
+
+	trx = thr_get_trx(thr);
 
-	log_free_check();
+	do {
+		/* Check that there is enough space in the log to accommodate
+		possible log entries by this query step; if the operation can
+		touch more than about 4 pages, checks must be made also within
+		the query step! */
 
-	/* Perform the actual query step: note that the query thread
-	may change if, e.g., a subprocedure call is made */
+		log_free_check();
 
-	/*-------------------------*/
-	next_thr = que_thr_step(thr);
-	/*-------------------------*/
+		/* Perform the actual query step: note that the query thread
+		may change if, e.g., a subprocedure call is made */
 
-	ut_a(!next_thr || (thr_get_trx(next_thr)->error_state == DB_SUCCESS));
+		/*-------------------------*/
+		next_thr = que_thr_step(thr);
+		/*-------------------------*/
 
-	loop_count++;
+		trx_mutex_enter(trx);
 
-	if (next_thr != thr) {
-		ut_a(next_thr == NULL);
+		ut_a(next_thr == NULL || trx->error_state == DB_SUCCESS);
 
-		/* This can change next_thr to a non-NULL value if there was
-		a lock wait that already completed. */
-		que_thr_dec_refer_count(thr, &next_thr);
+		if (next_thr != thr) {
+			ut_a(next_thr == NULL);
 
-		if (next_thr == NULL) {
+			/* This can change next_thr to a non-NULL value
+			if there was a lock wait that already completed. */
 
-			return;
+			que_thr_dec_refer_count(thr, &next_thr);
+
+			if (next_thr != NULL) {
+
+				thr = next_thr;
+			}
 		}
 
-		loop_count = QUE_MAX_LOOPS_WITHOUT_CHECK;
+		ut_ad(trx == thr_get_trx(thr));
 
-		thr = next_thr;
-	}
+		trx_mutex_exit(trx);
 
-	goto loop;
+	} while (next_thr != NULL);
 }
 
 /**********************************************************************//**
@@ -1337,11 +1200,12 @@ que_run_threads(
 /*============*/
 	que_thr_t*	thr)	/*!< in: query thread */
 {
+	ut_ad(!trx_mutex_own(thr_get_trx(thr)));
+
 loop:
 	ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
-	que_run_threads_low(thr);
 
-	mutex_enter(&kernel_mutex);
+	que_run_threads_low(thr);
 
 	switch (thr->state) {
 
@@ -1349,27 +1213,25 @@ loop:
 		/* There probably was a lock wait, but it already ended
 		before we came here: continue running thr */
 
-		mutex_exit(&kernel_mutex);
-
 		goto loop;
 
 	case QUE_THR_LOCK_WAIT:
-		mutex_exit(&kernel_mutex);
+		lock_wait_suspend_thread(thr);
 
-		/* The ..._mysql_... function works also for InnoDB's
-		internal threads. Let us wait that the lock wait ends. */
+		trx_mutex_enter(thr_get_trx(thr));
 
-		srv_suspend_mysql_thread(thr);
+		ut_a(thr_get_trx(thr)->id != 0);
 
 		if (thr_get_trx(thr)->error_state != DB_SUCCESS) {
 			/* thr was chosen as a deadlock victim or there was
 			a lock wait timeout */
 
 			que_thr_dec_refer_count(thr, NULL);
-
-			return;
+			trx_mutex_exit(thr_get_trx(thr));
+			break;
 		}
 
+		trx_mutex_exit(thr_get_trx(thr));
 		goto loop;
 
 	case QUE_THR_COMPLETED:
@@ -1380,15 +1242,13 @@ loop:
 	default:
 		ut_error;
 	}
-
-	mutex_exit(&kernel_mutex);
 }
 
 /*********************************************************************//**
 Evaluate the given SQL.
 @return	error code or DB_SUCCESS */
 UNIV_INTERN
-ulint
+enum db_err
 que_eval_sql(
 /*=========*/
 	pars_info_t*	info,	/*!< in: info struct, or NULL */
@@ -1424,7 +1284,35 @@ que_eval_sql(
 
 	que_run_threads(thr);
 
+	if (reserve_dict_mutex) {
+		mutex_enter(&dict_sys->mutex);
+	}
+
 	que_graph_free(graph);
 
+	if (reserve_dict_mutex) {
+		mutex_exit(&dict_sys->mutex);
+	}
+
 	return(trx->error_state);
 }
+
+/*********************************************************************//**
+Initialise the query sub-system. */
+UNIV_INTERN
+void
+que_init(void)
+/*==========*/
+{
+	/* No op */
+}
+
+/*********************************************************************//**
+Close the query sub-system. */
+UNIV_INTERN
+void
+que_close(void)
+/*===========*/
+{
+	/* No op */
+}
diff --git a/storage/innobase/read/read0read.c b/storage/innobase/read/read0read.cc
index 9975b8c2c57..02d78d657c6 100644
--- a/storage/innobase/read/read0read.c
+++ b/storage/innobase/read/read0read.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file read/read0read.c
+@file read/read0read.cc
 Cursor read
 
 Created 2/16/1997 Heikki Tuuri
@@ -70,7 +70,7 @@ cluster record is accessed.  Because trx_id of the creating
 transaction is stored when this view was created to the list of
 trx_ids not seen by this read view previous version of the
 record is requested to be built. This is build using clustered record.
-If the secondary key record is delete  marked it's corresponding
+If the secondary key record is delete-marked, its corresponding
 clustered record can be already be purged only if records
 trx_id < low_limit_no. Purge can't remove any record deleted by a
 transaction which was active when cursor was created. But, we still
@@ -129,13 +129,103 @@ in the view. If this is not true we build based on undo_rec previous
 version of the record. This record is found because purge can't remove
 records accessed by active transaction. Thus we see correct version. Q. E. D.
 -------------------------------------------------------------------------------
-FACT C: Purge does not remove any delete marked row that is visible
+FACT C: Purge does not remove any delete-marked row that is visible
 -------
-to cursor view.
+in any cursor read view.
+
+PROOF: We know that:
+ 1: Currently active read views in trx_sys_t::view_list are ordered by
+    read_view_t::low_limit_no in descending order, that is,
+    newest read view first.
+
+ 2: Purge clones the oldest read view and uses that to determine whether there
+    are any active transactions that can see the to be purged records.
+
+Therefore any joining or active transaction will not have a view older
+than the purge view, according to 1.
+
+When purge needs to remove a delete-marked row from a secondary index,
+it will first check that the DB_TRX_ID value of the corresponding
+record in the clustered index is older than the purge view. It will
+also check if there is a newer version of the row (clustered index
+record) that is not delete-marked in the secondary index. If such a
+row exists and is collation-equal to the delete-marked secondary index
+record then purge will not remove the secondary index record.
+
+Delete-marked clustered index records will be removed by
+row_purge_remove_clust_if_poss(), unless the clustered index record
+(and its DB_ROLL_PTR) has been updated. Every new version of the
+clustered index record will update DB_ROLL_PTR, pointing to a new UNDO
+log entry that allows the old version to be reconstructed. The
+DB_ROLL_PTR in the oldest remaining version in the old-version chain
+may be pointing to garbage (an undo log record discarded by purge),
+but it will never be dereferenced, because the purge view is older
+than any active transaction.
+
+For details see: row_vers_old_has_index_entry() and row_purge_poss_sec()
+
+Some additional issues:
+
+What if trx_sys->view_list == NULL and some transaction T1 and Purge both
+try to open read_view at same time. Only one can acquire trx_sys->mutex.
+In which order will the views be opened? Should it matter? If no, why?
+
+The order does not matter. No new transactions can be created and no running
+transaction can commit or rollback (or free views).
+*/
 
-TODO: proof this
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Validates a read view object. */
+static
+ibool
+read_view_validate(
+/*===============*/
+	const read_view_t*	view)	/*!< in: view to validate */
+{
+	ulint	i;
 
-*/
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	/* Check that the view->trx_ids array is in descending order. */
+	for (i = 1; i < view->n_trx_ids; ++i) {
+
+		ut_a(view->trx_ids[i] < view->trx_ids[i - 1]);
+	}
+
+	return(TRUE);
+}
+
+/** Functor to validate the view list. */
+struct	Check {
+
+	Check() : m_prev_view(0) { }
+
+	void	operator()(const read_view_t* view)
+	{
+		ut_a(m_prev_view == NULL
+		     || m_prev_view->low_limit_no >= view->low_limit_no);
+
+		m_prev_view = view;
+	}
+
+	const read_view_t*	m_prev_view;
+};
+
+/*********************************************************************//**
+Validates a read view list. */
+static
+ibool
+read_view_list_validate(void)
+/*=========================*/
+{
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_list_map(trx_sys->view_list, &read_view_t::view_list, Check());
+
+	return(TRUE);
+}
+#endif
 
 /*********************************************************************//**
 Creates a read view object.
@@ -149,96 +239,186 @@ read_view_create_low(
 {
 	read_view_t*	view;
 
-	view = mem_heap_alloc(heap, sizeof(read_view_t));
+	view = static_cast<read_view_t*>(
+		mem_heap_alloc(
+			heap, sizeof(*view) + n * sizeof(*view->trx_ids)));
 
 	view->n_trx_ids = n;
-	view->trx_ids = mem_heap_alloc(heap, n * sizeof *view->trx_ids);
+	view->trx_ids = (trx_id_t*) &view[1];
 
 	return(view);
 }
 
 /*********************************************************************//**
-Makes a copy of the oldest existing read view, with the exception that also
-the creating trx of the oldest view is set as not visible in the 'copied'
-view. Opens a new view if no views currently exist. The view must be closed
-with ..._close. This is used in purge.
-@return	own: read view struct */
-UNIV_INTERN
+Clones a read view object. This function will allocate space for two read
+views contiguously, one identical in size and content as @param view (starting
+at returned pointer) and another view immediately following the trx_ids array.
+The second view will have space for an extra trx_id_t element.
+@return	read view struct */
+UNIV_INLINE
 read_view_t*
-read_view_oldest_copy_or_open_new(
-/*==============================*/
-	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
-					transaction, or 0 used in purge */
-	mem_heap_t*	heap)		/*!< in: memory heap from which
-					allocated */
+read_view_clone(
+/*============*/
+	const read_view_t*	view,	/*!< in: view to clone */
+	mem_heap_t*		heap)	/*!< in: memory heap
+					from which allocated */
 {
-	read_view_t*	old_view;
-	read_view_t*	view_copy;
-	ibool		needs_insert	= TRUE;
-	ulint		insert_done	= 0;
-	ulint		n;
-	ulint		i;
+	ulint		sz;
+	read_view_t*	clone;
+	read_view_t*	new_view;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(mutex_own(&trx_sys->mutex));
 
-	old_view = UT_LIST_GET_LAST(trx_sys->view_list);
+	/* Allocate space for two views. */
 
-	if (old_view == NULL) {
+	sz = sizeof(*view) + view->n_trx_ids * sizeof(*view->trx_ids);
 
-		return(read_view_open_now(cr_trx_id, heap));
-	}
+	/* Add an extra trx_id_t slot for the new view. */
+
+	clone = static_cast<read_view_t*>(
+		mem_heap_alloc(heap, (sz * 2) + sizeof(trx_id_t)));
+
+	/* Only the contents of the old view are important, the new view
+	will be created from this and so we don't copy that across. */
+
+	memcpy(clone, view, sz);
+
+	clone->trx_ids = (trx_id_t*) &clone[1];
+
+	new_view = (read_view_t*) &clone->trx_ids[clone->n_trx_ids];
+	new_view->trx_ids = (trx_id_t*) &new_view[1];
+	new_view->n_trx_ids = clone->n_trx_ids + 1;
+
+	ut_a(new_view->n_trx_ids == view->n_trx_ids + 1);
 
-	n = old_view->n_trx_ids;
+	return(clone);
+}
+
+/*********************************************************************//**
+Insert the view in the proper order into the trx_sys->view_list. The
+read view list is ordered by read_view_t::low_limit_no in descending order. */
+static
+void
+read_view_add(
+/*==========*/
+	read_view_t*	view)		/*!< in: view to add to */
+{
+	read_view_t*	elem;
+	read_view_t*	prev_elem;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+	ut_ad(read_view_validate(view));
 
-	if (old_view->creator_trx_id) {
-		n++;
+	/* Find the correct slot for insertion. */
+	for (elem = UT_LIST_GET_FIRST(trx_sys->view_list), prev_elem = NULL;
+	     elem != NULL && view->low_limit_no < elem->low_limit_no;
+	     prev_elem = elem, elem = UT_LIST_GET_NEXT(view_list, elem)) {
+		/* No op */
+	}
+
+	if (prev_elem == NULL) {
+		UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);
 	} else {
-		needs_insert = FALSE;
+		UT_LIST_INSERT_AFTER(
+			view_list, trx_sys->view_list, prev_elem, view);
 	}
 
-	view_copy = read_view_create_low(n, heap);
+	ut_ad(read_view_list_validate());
+}
 
-	/* Insert the id of the creator in the right place of the descending
-	array of ids, if needs_insert is TRUE: */
+/** Functor to create thew view trx_ids array. */
+struct	CreateView {
 
-	i = 0;
-	while (i < n) {
-		if (needs_insert
-		    && (i >= old_view->n_trx_ids
-			|| old_view->creator_trx_id
-			> read_view_get_nth_trx_id(old_view, i))) {
+	CreateView(read_view_t*	view)
+		: m_view(view)
+	{
+		  m_n_trx = m_view->n_trx_ids;
+		  m_view->n_trx_ids = 0;
+	}
 
-			read_view_set_nth_trx_id(view_copy, i,
-						 old_view->creator_trx_id);
-			needs_insert = FALSE;
-			insert_done = 1;
-		} else {
-			read_view_set_nth_trx_id(view_copy, i,
-						 read_view_get_nth_trx_id(
-							 old_view,
-							 i - insert_done));
-		}
+	void	operator()(const trx_t* trx)
+	{
+		ut_ad(mutex_own(&trx_sys->mutex));
+		ut_ad(trx->in_rw_trx_list);
+
+		/* trx->state cannot change from or to NOT_STARTED
+		while we are holding the trx_sys->mutex. It may change
+		from ACTIVE to PREPARED or COMMITTED. */
+
+		if (trx->id != m_view->creator_trx_id
+		    && !trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) {
+
+			ut_ad(m_n_trx > m_view->n_trx_ids);
+
+			m_view->trx_ids[m_view->n_trx_ids++] = trx->id;
+
+			/* NOTE that a transaction whose trx number is <
+			trx_sys->max_trx_id can still be active, if it is
+			in the middle of its commit! Note that when a
+			transaction starts, we initialize trx->no to
+			IB_ULONGLONG_MAX. */
+
+			/* trx->no is protected by trx_sys->mutex, which
+			we are holding. It is assigned by trx_commit()
+			before lock_trx_release_locks() assigns
+			trx->state = TRX_STATE_COMMITTED_IN_MEMORY. */
 
-		i++;
+			if (m_view->low_limit_no > trx->no) {
+				m_view->low_limit_no = trx->no;
+			}
+		}
 	}
 
-	view_copy->creator_trx_id = cr_trx_id;
+	read_view_t*	m_view;
+	ulint		m_n_trx;
+};
+
+/*********************************************************************//**
+Opens a read view where exactly the transactions serialized before this
+point in time are seen in the view.
+@return	own: read view struct */
+static
+read_view_t*
+read_view_open_now_low(
+/*===================*/
+	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
+					transaction, or 0 used in purge */
+	mem_heap_t*	heap)		/*!< in: memory heap from which
+					allocated */
+{
+	read_view_t*	view;
+	ulint		n_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	view = read_view_create_low(n_trx, heap);
+
+	view->undo_no = 0;
+	view->type = VIEW_NORMAL;
+	view->creator_trx_id = cr_trx_id;
+
+	/* No future transactions should be visible in the view */
+
+	view->low_limit_no = trx_sys->max_trx_id;
+	view->low_limit_id = view->low_limit_no;
 
-	view_copy->low_limit_no = old_view->low_limit_no;
-	view_copy->low_limit_id = old_view->low_limit_id;
+	/* No active transaction should be visible, except cr_trx */
 
+	ut_list_map(trx_sys->rw_trx_list, &trx_t::trx_list, CreateView(view));
 
-	if (n > 0) {
+	if (view->n_trx_ids > 0) {
 		/* The last active transaction has the smallest id: */
-		view_copy->up_limit_id = read_view_get_nth_trx_id(
-			view_copy, n - 1);
+		view->up_limit_id = view->trx_ids[view->n_trx_ids - 1];
 	} else {
-		view_copy->up_limit_id = old_view->up_limit_id;
+		view->up_limit_id = view->low_limit_id;
 	}
 
-	UT_LIST_ADD_LAST(view_list, trx_sys->view_list, view_copy);
+	/* Purge views are not added to the view list. */
+	if (cr_trx_id > 0) {
+		read_view_add(view);
+	}
 
-	return(view_copy);
+	return(view);
 }
 
 /*********************************************************************//**
@@ -255,77 +435,117 @@ read_view_open_now(
 					allocated */
 {
 	read_view_t*	view;
-	trx_t*		trx;
-	ulint		n;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	mutex_enter(&trx_sys->mutex);
 
-	view = read_view_create_low(UT_LIST_GET_LEN(trx_sys->trx_list), heap);
+	view = read_view_open_now_low(cr_trx_id, heap);
 
-	view->creator_trx_id = cr_trx_id;
-	view->type = VIEW_NORMAL;
-	view->undo_no = 0;
+	mutex_exit(&trx_sys->mutex);
 
-	/* No future transactions should be visible in the view */
+	return(view);
+}
 
-	view->low_limit_no = trx_sys->max_trx_id;
-	view->low_limit_id = view->low_limit_no;
+/*********************************************************************//**
+Makes a copy of the oldest existing read view, with the exception that also
+the creating trx of the oldest view is set as not visible in the 'copied'
+view. Opens a new view if no views currently exist. The view must be closed
+with ..._close. This is used in purge.
+@return	own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_purge_open(
+/*=================*/
+	mem_heap_t*	heap)		/*!< in: memory heap from which
+					allocated */
+{
+	ulint		i;
+	read_view_t*	view;
+	read_view_t*	oldest_view;
+	trx_id_t	creator_trx_id;
+	ulint		insert_done	= 0;
 
-	n = 0;
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	mutex_enter(&trx_sys->mutex);
 
-	/* No active transaction should be visible, except cr_trx */
+	oldest_view = UT_LIST_GET_LAST(trx_sys->view_list);
 
-	while (trx) {
-		if (trx->id != cr_trx_id
-		    && (trx->conc_state == TRX_ACTIVE
-			|| trx->conc_state == TRX_PREPARED)) {
+	if (oldest_view == NULL) {
 
-			read_view_set_nth_trx_id(view, n, trx->id);
+		view = read_view_open_now_low(0, heap);
 
-			n++;
+		mutex_exit(&trx_sys->mutex);
 
-			/* NOTE that a transaction whose trx number is <
-			trx_sys->max_trx_id can still be active, if it is
-			in the middle of its commit! Note that when a
-			transaction starts, we initialize trx->no to
-			IB_ULONGLONG_MAX. */
+		return(view);
+	}
 
-			if (view->low_limit_no > trx->no) {
+	/* Allocate space for both views, the oldest and the new purge view. */
 
-				view->low_limit_no = trx->no;
-			}
+	oldest_view = read_view_clone(oldest_view, heap);
+
+	ut_ad(read_view_validate(oldest_view));
+
+	mutex_exit(&trx_sys->mutex);
+
+	ut_a(oldest_view->creator_trx_id > 0);
+	creator_trx_id = oldest_view->creator_trx_id;
+
+	view = (read_view_t*) &oldest_view->trx_ids[oldest_view->n_trx_ids];
+
+	/* Add the creator transaction id in the trx_ids array in the
+	correct slot. */
+
+	for (i = 0; i < oldest_view->n_trx_ids; ++i) {
+		trx_id_t	id;
+
+		id = oldest_view->trx_ids[i - insert_done];
+
+		if (insert_done == 0 && creator_trx_id > id) {
+			id = creator_trx_id;
+			insert_done = 1;
 		}
 
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
+		view->trx_ids[i] = id;
 	}
 
-	view->n_trx_ids = n;
-
-	if (n > 0) {
-		/* The last active transaction has the smallest id: */
-		view->up_limit_id = read_view_get_nth_trx_id(view, n - 1);
+	if (insert_done == 0) {
+		view->trx_ids[i] = creator_trx_id;
 	} else {
-		view->up_limit_id = view->low_limit_id;
+		ut_a(i > 0);
+		view->trx_ids[i] = oldest_view->trx_ids[i - 1];
 	}
 
+	view->creator_trx_id = 0;
+
+	view->low_limit_no = oldest_view->low_limit_no;
+	view->low_limit_id = oldest_view->low_limit_id;
+
+	if (view->n_trx_ids > 0) {
+		/* The last active transaction has the smallest id: */
 
-	UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);
+		view->up_limit_id = view->trx_ids[view->n_trx_ids - 1];
+	} else {
+		view->up_limit_id = oldest_view->up_limit_id;
+	}
 
 	return(view);
 }
 
 /*********************************************************************//**
-Closes a read view. */
+Remove a read view from the trx_sys->view_list. */
 UNIV_INTERN
 void
-read_view_close(
-/*============*/
+read_view_remove(
+/*=============*/
 	read_view_t*	view)	/*!< in: read view */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	mutex_enter(&trx_sys->mutex);
+
+	ut_ad(read_view_validate(view));
 
 	UT_LIST_REMOVE(view_list, trx_sys->view_list, view);
+
+	ut_ad(read_view_list_validate());
+
+	mutex_exit(&trx_sys->mutex);
 }
 
 /*********************************************************************//**
@@ -335,20 +555,16 @@ UNIV_INTERN
 void
 read_view_close_for_mysql(
 /*======================*/
-	trx_t*	trx)	/*!< in: trx which has a read view */
+	trx_t*		trx)	/*!< in: trx which has a read view */
 {
 	ut_a(trx->global_read_view);
 
-	mutex_enter(&kernel_mutex);
-
-	read_view_close(trx->global_read_view);
+	read_view_remove(trx->global_read_view);
 
 	mem_heap_empty(trx->global_read_view_heap);
 
 	trx->read_view = NULL;
 	trx->global_read_view = NULL;
-
-	mutex_exit(&kernel_mutex);
 }
 
 /*********************************************************************//**
@@ -364,20 +580,20 @@ read_view_print(
 
 	if (view->type == VIEW_HIGH_GRANULARITY) {
 		fprintf(stderr,
-			"High-granularity read view undo_n:o %llu\n",
-			(ullint) view->undo_no);
+			"High-granularity read view undo_n:o " TRX_ID_FMT "\n",
+			view->undo_no);
 	} else {
 		fprintf(stderr, "Normal read view\n");
 	}
 
 	fprintf(stderr, "Read view low limit trx n:o " TRX_ID_FMT "\n",
-		(ullint) view->low_limit_no);
+		view->low_limit_no);
 
 	fprintf(stderr, "Read view up limit trx id " TRX_ID_FMT "\n",
-		(ullint) view->up_limit_id);
+		view->up_limit_id);
 
 	fprintf(stderr, "Read view low limit trx id " TRX_ID_FMT "\n",
-		(ullint) view->low_limit_id);
+		view->low_limit_id);
 
 	fprintf(stderr, "Read view individually stored trx ids:\n");
 
@@ -385,7 +601,7 @@ read_view_print(
 
 	for (i = 0; i < n_ids; i++) {
 		fprintf(stderr, "Read view trx id " TRX_ID_FMT "\n",
-			(ullint) read_view_get_nth_trx_id(view, i));
+			view->trx_ids[i]);
 	}
 }
 
@@ -398,85 +614,62 @@ UNIV_INTERN
 cursor_view_t*
 read_cursor_view_create_for_mysql(
 /*==============================*/
-	trx_t*	cr_trx)	/*!< in: trx where cursor view is created */
+	trx_t*		cr_trx)	/*!< in: trx where cursor view is created */
 {
-	cursor_view_t*	curview;
 	read_view_t*	view;
 	mem_heap_t*	heap;
-	trx_t*		trx;
-	ulint		n;
-
-	ut_a(cr_trx);
+	ulint		n_trx;
+	cursor_view_t*	curview;
 
 	/* Use larger heap than in trx_create when creating a read_view
 	because cursors are quite long. */
 
 	heap = mem_heap_create(512);
 
-	curview = (cursor_view_t*) mem_heap_alloc(heap, sizeof(cursor_view_t));
+	curview = (cursor_view_t*) mem_heap_alloc(heap, sizeof(*curview));
+
 	curview->heap = heap;
 
-	/* Drop cursor tables from consideration when evaluating the need of
-	auto-commit */
+	/* Drop cursor tables from consideration when evaluating the
+	need of auto-commit */
+
 	curview->n_mysql_tables_in_use = cr_trx->n_mysql_tables_in_use;
+
 	cr_trx->n_mysql_tables_in_use = 0;
 
-	mutex_enter(&kernel_mutex);
+	mutex_enter(&trx_sys->mutex);
 
-	curview->read_view = read_view_create_low(
-		UT_LIST_GET_LEN(trx_sys->trx_list), curview->heap);
+	n_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+	curview->read_view = read_view_create_low(n_trx, curview->heap);
 
 	view = curview->read_view;
-	view->creator_trx_id = cr_trx->id;
-	view->type = VIEW_HIGH_GRANULARITY;
 	view->undo_no = cr_trx->undo_no;
+	view->type = VIEW_HIGH_GRANULARITY;
+	view->creator_trx_id = UINT64_UNDEFINED;
 
 	/* No future transactions should be visible in the view */
 
 	view->low_limit_no = trx_sys->max_trx_id;
 	view->low_limit_id = view->low_limit_no;
 
-	n = 0;
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
 	/* No active transaction should be visible */
 
-	while (trx) {
-
-		if (trx->conc_state == TRX_ACTIVE
-		    || trx->conc_state == TRX_PREPARED) {
-
-			read_view_set_nth_trx_id(view, n, trx->id);
-
-			n++;
-
-			/* NOTE that a transaction whose trx number is <
-			trx_sys->max_trx_id can still be active, if it is
-			in the middle of its commit! Note that when a
-			transaction starts, we initialize trx->no to
-			IB_ULONGLONG_MAX. */
+	ut_list_map(trx_sys->rw_trx_list, &trx_t::trx_list, CreateView(view));
 
-			if (view->low_limit_no > trx->no) {
-
-				view->low_limit_no = trx->no;
-			}
-		}
-
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
-	}
-
-	view->n_trx_ids = n;
+	view->creator_trx_id = cr_trx->id;
 
-	if (n > 0) {
+	if (view->n_trx_ids > 0) {
 		/* The last active transaction has the smallest id: */
-		view->up_limit_id = read_view_get_nth_trx_id(view, n - 1);
+
+		view->up_limit_id = view->trx_ids[view->n_trx_ids - 1];
 	} else {
 		view->up_limit_id = view->low_limit_id;
 	}
 
-	UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);
+	read_view_add(view);
 
-	mutex_exit(&kernel_mutex);
+	mutex_exit(&trx_sys->mutex);
 
 	return(curview);
 }
@@ -499,13 +692,10 @@ read_cursor_view_close_for_mysql(
 	belong to this transaction */
 	trx->n_mysql_tables_in_use += curview->n_mysql_tables_in_use;
 
-	mutex_enter(&kernel_mutex);
+	read_view_remove(curview->read_view);
 
-	read_view_close(curview->read_view);
 	trx->read_view = trx->global_read_view;
 
-	mutex_exit(&kernel_mutex);
-
 	mem_heap_free(curview->heap);
 }
 
@@ -522,7 +712,7 @@ read_cursor_set_for_mysql(
 {
 	ut_a(trx);
 
-	mutex_enter(&kernel_mutex);
+	mutex_enter(&trx_sys->mutex);
 
 	if (UNIV_LIKELY(curview != NULL)) {
 		trx->read_view = curview->read_view;
@@ -530,5 +720,7 @@ read_cursor_set_for_mysql(
 		trx->read_view = trx->global_read_view;
 	}
 
-	mutex_exit(&kernel_mutex);
+	ut_ad(read_view_validate(trx->read_view));
+
+	mutex_exit(&trx_sys->mutex);
 }
diff --git a/storage/innobase/rem/rem0cmp.c b/storage/innobase/rem/rem0cmp.cc
index 04d2c15437b..19f5633953a 100644
--- a/storage/innobase/rem/rem0cmp.c
+++ b/storage/innobase/rem/rem0cmp.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /*******************************************************************//**
-@file rem/rem0cmp.c
+@file rem/rem0cmp.cc
 Comparison services for records
 
 Created 7/1/1994 Heikki Tuuri
@@ -29,6 +29,7 @@ Created 7/1/1994 Heikki Tuuri
 #include "rem0cmp.ic"
 #endif
 
+#include "ha_prototypes.h"
 #include "srv0srv.h"
 
 /*		ALPHABETICAL ORDER
@@ -90,6 +91,23 @@ innobase_mysql_cmp(
 	const unsigned char* b,		/*!< in: data field */
 	unsigned int	b_length);	/*!< in: data field length,
 					not UNIV_SQL_NULL */
+/*************************************************************//**
+This function is used to compare two data fields for which the data type
+is such that we must use MySQL code to compare them. The prototype here
+must be a copy of the one in ha_innobase.cc!
+@return	1, 0, -1, if a is greater, equal, less than b, respectively */
+extern
+int
+innobase_mysql_cmp_prefix(
+/*======================*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number,	/*!< in: number of the charset */
+	const unsigned char* a,		/*!< in: data field */
+	unsigned int	a_length,	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+	const unsigned char* b,		/*!< in: data field */
+	unsigned int	b_length);	/*!< in: data field length,
+					not UNIV_SQL_NULL */
 /*********************************************************************//**
 Transforms the character code so that it is ordered appropriately for the
 language. This is only used for the latin1 char set. MySQL does the
@@ -184,8 +202,8 @@ cmp_whole_field(
 
 	case DATA_DECIMAL:
 		/* Remove preceding spaces */
-		for (; a_length && *a == ' '; a++, a_length--);
-		for (; b_length && *b == ' '; b++, b_length--);
+		for (; a_length && *a == ' '; a++, a_length--) { }
+		for (; b_length && *b == ' '; b++, b_length--) { }
 
 		if (*a == '-') {
 			if (*b != '-') {
@@ -271,7 +289,7 @@ cmp_whole_field(
 	case DATA_MYSQL:
 		return(innobase_mysql_cmp(
 			       (int)(prtype & DATA_MYSQL_TYPE_MASK),
-			       (uint)dtype_get_charset_coll(prtype),
+			       (uint) dtype_get_charset_coll(prtype),
 			       a, a_length, b, b_length));
 	default:
 		fprintf(stderr,
@@ -283,6 +301,44 @@ cmp_whole_field(
 	return(0);
 }
 
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INTERN
+int
+cmp_dfield_dfield_like_prefix(
+/*==========================*/
+				/* out: 1, 0, -1, if dfield1 is greater, equal,
+				less than dfield2, respectively */
+	dfield_t*	dfield1,/* in: data field; must have type field set */
+	dfield_t*	dfield2)/* in: data field */
+{
+	const dtype_t*  type;
+	ulint           ret;
+
+	ut_ad(dfield_check_typed(dfield1));
+
+	type = dfield_get_type(dfield1);
+
+	if (type->mtype >= DATA_FLOAT) {
+		ret = innobase_mysql_cmp_prefix(
+			(int)(type->prtype & DATA_MYSQL_TYPE_MASK),
+			(uint) dtype_get_charset_coll(type->prtype),
+			static_cast<byte*>(dfield_get_data(dfield1)),
+			dfield_get_len(dfield1),
+                        static_cast<byte*>(dfield_get_data(dfield2)),
+                        dfield_get_len(dfield2));
+        } else {
+                ret = (cmp_data_data_like_prefix(
+                        static_cast<byte*>(dfield_get_data(dfield1)),
+                        dfield_get_len(dfield1),
+                        static_cast<byte*>(dfield_get_data(dfield2)),
+                        dfield_get_len(dfield2)));
+        }
+
+        return(ret);
+}
+
 /*************************************************************//**
 This function is used to compare two data fields for which we know the
 data type.
@@ -396,6 +452,162 @@ next_byte:
 	return(0);		/* Not reached */
 }
 
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type to be VARCHAR */
+
+int
+cmp_data_data_slow_varchar(
+/*=======================*/
+				/* out: 1, 0, -1, if lhs is greater, equal,
+				less than rhs, respectively */
+	const byte*	lhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		lhs_len,/* in: data field length or UNIV_SQL_NULL */
+	const byte*	rhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		rhs_len)/* in: data field length or UNIV_SQL_NULL */
+{
+	ulint	i;
+
+	ut_a(rhs_len != UNIV_SQL_NULL);
+
+	if (lhs_len == UNIV_SQL_NULL) {
+
+		/* We define the SQL null to be the smallest possible
+		value of a field in the alphabetical order */
+
+		return(-1);
+	}
+
+	/* Compare the values.*/
+
+	for (i = 0; i < lhs_len && i < rhs_len; ++i, ++rhs, ++lhs) {
+		ulint	lhs_byte = *lhs;
+		ulint	rhs_byte = *rhs;
+
+		if (lhs_byte != rhs_byte) {
+			/* If the bytes are equal, they will remain such even
+			after the collation transformation below */
+
+			lhs_byte = cmp_collate(lhs_byte);
+			rhs_byte = cmp_collate(rhs_byte);
+
+			if (lhs_byte > rhs_byte) {
+
+				return(1);
+			} else if (lhs_byte < rhs_byte) {
+
+				return(-1);
+			}
+		}
+	}
+
+	return(i == lhs_len && i == rhs_len) ? 0 : rhs_len - lhs_len;
+}
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type. The comparison is done for the LIKE operator.*/
+
+int
+cmp_data_data_slow_like_prefix(
+/*===========================*/
+				/* out: 1, 0, -1, if lhs is greater, equal,
+				less than rhs, respectively */
+	const byte*	lhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/* in: data field length or UNIV_SQL_NULL */
+	const byte*	rhs,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2)	/* in: data field length or UNIV_SQL_NULL */
+{
+	ulint	i;
+
+	ut_a(len2 != UNIV_SQL_NULL);
+
+	if (len1 == UNIV_SQL_NULL) {
+
+		/* We define the SQL null to be the smallest possible
+		value of a field in the alphabetical order */
+
+		return(-1);
+	}
+
+	/* Compare the values.*/
+
+	for (i = 0; i < len1 && i < len2; ++i, ++rhs, ++lhs) {
+		ulint	lhs_byte = *lhs;
+		ulint	rhs_byte = *rhs;
+
+		if (lhs_byte != rhs_byte) {
+			/* If the bytes are equal, they will remain such even
+			after the collation transformation below */
+
+			lhs_byte = cmp_collate(lhs_byte);
+			rhs_byte = cmp_collate(rhs_byte);
+
+			if (lhs_byte > rhs_byte) {
+
+				return(1);
+			} else if (lhs_byte < rhs_byte) {
+
+				return(-1);
+			}
+		}
+	}
+
+	return(i == len2 ? 0 : 1);
+}
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type. The comparison is done for the LIKE operator.*/
+
+int
+cmp_data_data_slow_like_suffix(
+/*===========================*/
+				/* out: 1, 0, -1, if data1 is greater, equal,
+				less than data2, respectively */
+				/* in: data field (== a pointer to a
+				memory buffer) */
+	const byte*	data1 UNIV_UNUSED,
+				/* in: data field length or UNIV_SQL_NULL */
+	ulint		len1 UNIV_UNUSED,
+				/* in: data field (== a pointer to a memory
+				buffer) */
+	const byte*	data2 UNIV_UNUSED,
+				/* in: data field length or UNIV_SQL_NULL */
+	ulint		len2 UNIV_UNUSED)
+
+{
+	ut_error;	// FIXME:
+	return(1);
+}
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type. The comparison is done for the LIKE operator.*/
+
+int
+cmp_data_data_slow_like_substr(
+/*===========================*/
+				/* out: 1, 0, -1, if data1 is greater, equal,
+				less than data2, respectively */
+				/* in: data field (== a pointer to a
+				memory buffer) */
+	const byte*	data1 UNIV_UNUSED,
+				/* in: data field length or UNIV_SQL_NULL */
+	ulint		len1 UNIV_UNUSED,
+				/* in: data field (== a pointer to a memory
+				buffer) */
+	const byte*	data2 UNIV_UNUSED,
+				/* in: data field length or UNIV_SQL_NULL */
+	ulint		len2 UNIV_UNUSED)
+{
+	ut_error;	// FIXME:
+	return(1);
+}
 /*************************************************************//**
 This function is used to compare a data tuple to a physical record.
 Only dtuple->n_fields_cmp first fields are taken into account for
@@ -527,10 +739,12 @@ cmp_dtuple_rec_with_match(
 			&& dtype_get_charset_coll(prtype)
 			!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
 
-			ret = cmp_whole_field(mtype, prtype,
-					      dfield_get_data(dtuple_field),
-					      (unsigned) dtuple_f_len,
-					      rec_b_ptr, (unsigned) rec_f_len);
+			ret = cmp_whole_field(
+				mtype, prtype,
+				static_cast<const byte*>(
+					dfield_get_data(dtuple_field)),
+				(unsigned) dtuple_f_len,
+				rec_b_ptr, (unsigned) rec_f_len);
 
 			if (ret != 0) {
 				cur_bytes = 0;
@@ -544,7 +758,7 @@ cmp_dtuple_rec_with_match(
 		/* Set the pointers at the current byte */
 
 		rec_b_ptr = rec_b_ptr + cur_bytes;
-		dtuple_b_ptr = (byte*)dfield_get_data(dtuple_field)
+		dtuple_b_ptr = (byte*) dfield_get_data(dtuple_field)
 			+ cur_bytes;
 		/* Compare then the fields */
 
@@ -912,7 +1126,7 @@ cmp_rec_rec_with_match(
 		ulint	mtype;
 		ulint	prtype;
 
-		if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+		if (dict_index_is_univ(index)) {
 			/* This is for the insert buffer B-tree. */
 			mtype = DATA_BINARY;
 			prtype = 0;
@@ -1172,7 +1386,9 @@ cmp_debug_dtuple_rec_with_match(
 			prtype = type->prtype;
 		}
 
-		dtuple_f_data = dfield_get_data(dtuple_field);
+		dtuple_f_data = static_cast<const byte*>(
+			dfield_get_data(dtuple_field));
+
 		dtuple_f_len = dfield_get_len(dtuple_field);
 
 		rec_f_data = rec_get_nth_field(rec, offsets,
diff --git a/storage/innobase/rem/rem0rec.c b/storage/innobase/rem/rem0rec.cc
index 30fc28561fa..5a864f122a3 100644
--- a/storage/innobase/rem/rem0rec.c
+++ b/storage/innobase/rem/rem0rec.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file rem/rem0rec.c
+@file rem/rem0rec.cc
 Record manager
 
 Created 5/30/1994 Heikki Tuuri
@@ -31,6 +31,7 @@ Created 5/30/1994 Heikki Tuuri
 
 #include "mtr0mtr.h"
 #include "mtr0log.h"
+#include "fts0fts.h"
 
 /*			PHYSICAL RECORD (OLD STYLE)
 			===========================
@@ -550,6 +551,9 @@ rec_get_offsets_func(
 			n = dict_index_get_n_fields(index);
 			break;
 		case REC_STATUS_NODE_PTR:
+			/* Node pointer records consist of the
+			uniquely identifying fields of the record
+			followed by a child page number field. */
 			n = dict_index_get_n_unique_in_tree(index) + 1;
 			break;
 		case REC_STATUS_INFIMUM:
@@ -569,6 +573,8 @@ rec_get_offsets_func(
 		n = n_fields;
 	}
 
+	/* The offsets header consists of the allocation size at
+	offsets[0] and the REC_OFFS_HEADER_SIZE bytes. */
 	size = n + (1 + REC_OFFS_HEADER_SIZE);
 
 	if (UNIV_UNLIKELY(!offsets)
@@ -578,7 +584,9 @@ rec_get_offsets_func(
 						     MEM_HEAP_DYNAMIC,
 						     file, line);
 		}
-		offsets = mem_heap_alloc(*heap, size * sizeof(ulint));
+		offsets = static_cast<ulint*>(
+			mem_heap_alloc(*heap, size * sizeof(ulint)));
+
 		rec_offs_set_n_alloc(offsets, size);
 	}
 
@@ -809,7 +817,8 @@ rec_get_converted_size_comp_prefix(
 			continue;
 		}
 
-		ut_ad(len <= col->len || col->mtype == DATA_BLOB);
+		ut_ad(len <= col->len || col->mtype == DATA_BLOB
+		      || (col->len == 0 && col->mtype == DATA_VARCHAR));
 
 		/* If the maximum length of a variable-length field
 		is up to 255 bytes, the actual length is always stored
@@ -1180,7 +1189,9 @@ rec_convert_dtuple_to_rec_comp(
 			*lens-- = (byte) len;
 		} else {
 			ut_ad(len <= dtype_get_len(type)
-			      || dtype_get_mtype(type) == DATA_BLOB);
+			      || dtype_get_mtype(type) == DATA_BLOB
+			      || !strcmp(index->name,
+					 FTS_INDEX_TABLE_IND_NAME));
 			if (len < 128
 			    || (dtype_get_len(type) < 256
 				&& dtype_get_mtype(type) != DATA_BLOB)) {
@@ -1362,7 +1373,7 @@ rec_copy_prefix_to_buf_old(
 			mem_free(*buf);
 		}
 
-		*buf = mem_alloc2(prefix_len, buf_size);
+		*buf = static_cast<byte*>(mem_alloc2(prefix_len, buf_size));
 	}
 
 	ut_memcpy(*buf, rec - area_start, prefix_len);
@@ -1488,7 +1499,7 @@ rec_copy_prefix_to_buf(
 			mem_free(*buf);
 		}
 
-		*buf = mem_alloc2(prefix_len, buf_size);
+		*buf = static_cast<byte*>(mem_alloc2(prefix_len, buf_size));
 	}
 
 	memcpy(*buf, lens + 1, prefix_len);
diff --git a/storage/innobase/row/row0ext.c b/storage/innobase/row/row0ext.cc
index 07e970cf485..8d4da9f034b 100644
--- a/storage/innobase/row/row0ext.c
+++ b/storage/innobase/row/row0ext.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0ext.c
+@file row/row0ext.cc
 Caching of externally stored column prefixes
 
 Created September 2006 Marko Makela
@@ -42,7 +42,8 @@ row_ext_cache_fill(
 	ulint		zip_size,/*!< compressed page size in bytes, or 0 */
 	const dfield_t*	dfield)	/*!< in: data field */
 {
-	const byte*	field	= dfield_get_data(dfield);
+	const byte*	field	= static_cast<const byte*>(
+					dfield_get_data(dfield));
 	ulint		f_len	= dfield_get_len(dfield);
 	byte*		buf	= ext->buf + i * ext->max_len;
 
@@ -90,19 +91,24 @@ row_ext_create(
 	mem_heap_t*	heap)	/*!< in: heap where created */
 {
 	ulint		i;
-	ulint		zip_size = dict_table_flags_to_zip_size(flags);
+	ulint		zip_size = dict_tf_get_zip_size(flags);
 
-	row_ext_t*	ret = mem_heap_alloc(heap, (sizeof *ret)
-					     + (n_ext - 1) * sizeof ret->len);
+	row_ext_t*	ret;
+
+	ret = static_cast<row_ext_t*>(
+		mem_heap_alloc(heap,
+			       (sizeof *ret) + (n_ext - 1) * sizeof ret->len));
 
 	ut_ad(ut_is_2pow(zip_size));
-	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
 
 	ret->n_ext = n_ext;
 	ret->ext = ext;
 	ret->max_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags);
 
-	ret->buf = mem_heap_alloc(heap, n_ext * ret->max_len);
+	ret->buf = static_cast<byte*>(
+		mem_heap_alloc(heap, n_ext * ret->max_len));
+
 #ifdef UNIV_DEBUG
 	memset(ret->buf, 0xaa, n_ext * ret->max_len);
 	UNIV_MEM_ALLOC(ret->buf, n_ext * ret->max_len);
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
new file mode 100644
index 00000000000..50b681361d8
--- /dev/null
+++ b/storage/innobase/row/row0ftsort.cc
@@ -0,0 +1,1463 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ftsort.cc
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#include "row0merge.h"
+#include "pars0pars.h"
+#include "row0ftsort.h"
+#include "row0merge.h"
+#include "row0row.h"
+#include "btr0cur.h"
+
+/** Read the next record to buffer N.
+@param N	index into array of merge info structure */
+#define ROW_MERGE_READ_GET_NEXT(N)					\
+	do {								\
+		b[N] = row_merge_read_rec(				\
+			block[N], buf[N], b[N], index,			\
+			fd[N], &foffs[N], &mrec[N], offsets[N]);	\
+		if (UNIV_UNLIKELY(!b[N])) {				\
+			if (mrec[N]) {					\
+				goto exit;				\
+			}						\
+		}							\
+	} while (0)
+
+/** Parallel sort degree */
+UNIV_INTERN ulong	fts_sort_pll_degree	= 2;
+
+/** Parallel sort buffer size */
+UNIV_INTERN ulong	srv_sort_buf_size 	= 1048576;
+
+/*********************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID (depend on number of records to sort, it can be a 4 bytes or 8 bytes
+integer value)
+3) Word's position in original doc.
+
+@return dict_index_t structure for the fts sort index */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+	dict_index_t*		index,	/*!< in: Original FTS index
+					based on which this sort index
+					is created */
+	const dict_table_t*	table,	/*!< in: table that FTS index
+					is being created on */
+	ibool*			opt_doc_id_size)
+					/*!< out: whether to use 4 bytes
+					instead of 8 bytes integer to
+					store Doc ID during sort */
+{
+	dict_index_t*   new_index;
+	dict_field_t*   field;
+	dict_field_t*   idx_field;
+	CHARSET_INFO*	charset;
+
+	// FIXME: This name shouldn't be hard coded here.
+	new_index = dict_mem_index_create(
+		index->table->name, "tmp_fts_idx", 0, DICT_FTS, 3);
+
+	new_index->id = index->id;
+	new_index->table = (dict_table_t*) table;
+	new_index->n_uniq = FTS_NUM_FIELDS_SORT;
+	new_index->n_def = FTS_NUM_FIELDS_SORT;
+	new_index->cached = TRUE;
+
+	idx_field = dict_index_get_nth_field(index, 0);
+	charset = fts_index_get_charset(index);
+
+	/* The first field is on the Tokenized Word */
+	field = dict_index_get_nth_field(new_index, 0);
+	field->name = NULL;
+	field->prefix_len = 0;
+	field->col = static_cast<dict_col_t*>(
+		mem_heap_alloc(new_index->heap, sizeof(dict_col_t)));
+	field->col->len = fts_max_token_size;
+
+	if (strcmp(charset->name, "latin1_swedish_ci") == 0) {
+		field->col->mtype = DATA_VARCHAR;
+	} else {
+		field->col->mtype = DATA_VARMYSQL;
+	}
+
+	field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL;
+	field->col->mbminmaxlen = idx_field->col->mbminmaxlen;
+	field->fixed_len = 0;
+
+	/* Doc ID */
+	field = dict_index_get_nth_field(new_index, 1);
+	field->name = NULL;
+	field->prefix_len = 0;
+	field->col = static_cast<dict_col_t*>(
+		mem_heap_alloc(new_index->heap, sizeof(dict_col_t)));
+	field->col->mtype = DATA_INT;
+	*opt_doc_id_size = FALSE;
+
+	/* Check whether we can use 4 bytes instead of 8 bytes integer
+	field to hold the Doc ID, thus reduce the overall sort size */
+	if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		/* If Doc ID column is being added by this create
+		index, then just check the number of rows in the table */
+		if (table->stat_n_rows < MAX_DOC_ID_OPT_VAL) {
+			*opt_doc_id_size = TRUE;
+		}
+	} else {
+		doc_id_t	max_doc_id;
+
+		/* If the Doc ID column is supplied by user, then
+		check the maximum Doc ID in the table */
+		max_doc_id = fts_get_max_doc_id((dict_table_t*) table);
+
+		if (max_doc_id && max_doc_id < MAX_DOC_ID_OPT_VAL) {
+			*opt_doc_id_size = TRUE;
+		}
+	}
+
+	if (*opt_doc_id_size) {
+		field->col->len = sizeof(ib_uint32_t);
+		field->fixed_len = sizeof(ib_uint32_t);
+	} else {
+		field->col->len = FTS_DOC_ID_LEN;
+		field->fixed_len = FTS_DOC_ID_LEN;
+	}
+
+	field->col->prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
+
+	field->col->mbminmaxlen = 0;
+
+	/* The third field is on the word's position in the original doc */
+	field = dict_index_get_nth_field(new_index, 2);
+	field->name = NULL;
+	field->prefix_len = 0;
+	field->col = static_cast<dict_col_t*>(
+		mem_heap_alloc(new_index->heap, sizeof(dict_col_t)));
+	field->col->mtype = DATA_INT;
+	field->col->len = 4 ;
+	field->fixed_len = 4;
+	field->col->prtype = DATA_NOT_NULL;
+	field->col->mbminmaxlen = 0;
+
+	return(new_index);
+}
+/*********************************************************************//**
+Initialize FTS parallel sort structures.
+@return TRUE if all successful */
+UNIV_INTERN
+ibool
+row_fts_psort_info_init(
+/*====================*/
+	trx_t*			trx,	/*!< in: transaction */
+	struct TABLE*		table,	/*!< in: MySQL table object */
+	const dict_table_t*	new_table,/*!< in: table on which indexes are
+					created */
+	dict_index_t*		index,	/*!< in: FTS index to be created */
+	ibool			opt_doc_id_size,
+					/*!< in: whether to use 4 bytes
+					instead of 8 bytes integer to
+					store Doc ID during sort */
+	fts_psort_t**		psort,	/*!< out: parallel sort info to be
+					instantiated */
+	fts_psort_t**		merge)	/*!< out: parallel merge info
+					to be instantiated */
+{
+	ulint			i;
+	ulint			j;
+	fts_psort_common_t*	common_info = NULL;
+	fts_psort_t*		psort_info = NULL;
+	fts_psort_t*		merge_info = NULL;
+	ulint			block_size;
+	os_event_t		sort_event;
+	ibool			ret = TRUE;
+
+	block_size = 3 * srv_sort_buf_size;
+
+	*psort = psort_info = static_cast<fts_psort_t*>(mem_zalloc(
+		 fts_sort_pll_degree * sizeof *psort_info));
+
+	if (!psort_info) {
+		return FALSE;
+	}
+
+	sort_event = os_event_create(NULL);
+
+	/* Common Info for all sort threads */
+	common_info = static_cast<fts_psort_common_t*>(
+		mem_alloc(sizeof *common_info));
+
+	common_info->table = table;
+	common_info->new_table = (dict_table_t*) new_table;
+	common_info->trx = trx;
+	common_info->sort_index = index;
+	common_info->all_info = psort_info;
+	common_info->sort_event = sort_event;
+	common_info->opt_doc_id_size = opt_doc_id_size;
+
+	if (!common_info) {
+		mem_free(psort_info);
+		return FALSE;
+	}
+
+	/* There will be FTS_NUM_AUX_INDEX number of "sort buckets" for
+	each parallel sort thread. Each "sort bucket" holds records for
+	a particular "FTS index partition" */
+	for (j = 0; j < fts_sort_pll_degree; j++) {
+
+		UT_LIST_INIT(psort_info[j].fts_doc_list);
+
+		for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+
+			psort_info[j].merge_file[i] =
+				 static_cast<merge_file_t*>(
+					mem_zalloc(sizeof(merge_file_t)));
+
+			if (!psort_info[j].merge_file[i]) {
+				ret = FALSE;
+				goto func_exit;
+			}
+
+			psort_info[j].merge_buf[i] = row_merge_buf_create(
+				index);
+
+			row_merge_file_create(psort_info[j].merge_file[i]);
+
+			/* Need to align memory for O_DIRECT write */
+			psort_info[j].block_alloc[i] =
+				static_cast<row_merge_block_t*>(ut_malloc(
+					block_size + 1024));
+
+			psort_info[j].merge_block[i] =
+				static_cast<row_merge_block_t*>(
+					ut_align(
+					psort_info[j].block_alloc[i], 1024));
+
+			if (!psort_info[j].merge_block[i]) {
+				ret = FALSE;
+				goto func_exit;
+			}
+		}
+
+		psort_info[j].child_status = 0;
+		psort_info[j].state = 0;
+		psort_info[j].psort_common = common_info;
+	}
+
+	/* Initialize merge_info structures parallel merge and insert
+	into auxiliary FTS tables (FTS_INDEX_TABLE) */
+	*merge = merge_info = static_cast<fts_psort_t*>(
+		mem_alloc(FTS_NUM_AUX_INDEX * sizeof *merge_info));
+
+	for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
+
+		merge_info[j].child_status = 0;
+		merge_info[j].state = 0;
+		merge_info[j].psort_common = common_info;
+	}
+
+func_exit:
+	if (!ret) {
+		row_fts_psort_info_destroy(psort_info, merge_info);
+	}
+
+	return(ret);
+}
+/*********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close the
+merge sort files  */
+UNIV_INTERN
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+	fts_psort_t*	psort_info,	/*!< parallel sort info */
+	fts_psort_t*	merge_info)	/*!< parallel merge info */
+{
+	ulint	i;
+	ulint	j;
+
+	if (psort_info) {
+		for (j = 0; j < fts_sort_pll_degree; j++) {
+			for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+				if (psort_info[j].merge_file[i]) {
+					row_merge_file_destroy(
+						psort_info[j].merge_file[i]);
+				}
+
+				if (psort_info[j].block_alloc[i]) {
+					ut_free(psort_info[j].block_alloc[i]);
+				}
+				mem_free(psort_info[j].merge_file[i]);
+			}
+		}
+
+		mem_free(merge_info[0].psort_common);
+		mem_free(psort_info);
+	}
+
+	if (merge_info) {
+		mem_free(merge_info);
+	}
+}
+/*********************************************************************//**
+Free up merge buffers when merge sort is done */
+UNIV_INTERN
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+	fts_psort_t*	psort_info)	/*!< in: parallel sort info */
+{
+	ulint	j;
+	ulint	i;
+
+	if (!psort_info) {
+		return;
+	}
+
+	for (j = 0; j < fts_sort_pll_degree; j++) {
+		for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+			row_merge_buf_free(psort_info[j].merge_buf[i]);
+		}
+	}
+
+	return;
+}
+
+/*********************************************************************//**
+Tokenize incoming text data and add to the sort buffer.
+@return	TRUE if the record passed, FALSE if out of space */
+static
+ibool
+row_merge_fts_doc_tokenize(
+/*=======================*/
+	row_merge_buf_t**	sort_buf,	/*!< in/out: sort buffer */
+	doc_id_t		doc_id,		/*!< in: Doc ID */
+	fts_doc_t*		doc,		/*!< in: Doc to be tokenized */
+	dtype_t*		word_dtype,	/*!< in: data structure for
+						word col */
+	merge_file_t**		merge_file,	/*!< in/out: merge file */
+	ibool			opt_doc_id_size,/*!< in: whether to use 4 bytes
+						instead of 8 bytes integer to
+						store Doc ID during sort*/
+	fts_tokenize_ctx_t*	t_ctx)          /*!< in/out: tokenize context */
+{
+	ulint		i;
+	ulint		inc;
+	fts_string_t	str;
+	ulint		len;
+	row_merge_buf_t* buf;
+	dfield_t*	field;
+	fts_string_t	t_str;
+	ibool		buf_full = FALSE;
+	byte		str_buf[FTS_MAX_WORD_LEN + 1];
+	ulint		data_size[FTS_NUM_AUX_INDEX];
+	ulint		n_tuple[FTS_NUM_AUX_INDEX];
+
+	t_str.f_n_char = 0;
+	t_ctx->buf_used = 0;
+
+	memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+	memset(data_size, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+
+	/* Tokenize the data and add each word string, its corresponding
+	doc id and position to sort buffer */
+	for (i = t_ctx->processed_len; i < doc->text.f_len; i += inc) {
+		ib_rbt_bound_t	parent;
+		ulint		idx = 0;
+		ib_uint32_t	position;
+		ulint           offset = 0;
+		ulint		cur_len = 0;
+		doc_id_t	write_doc_id;
+
+		inc = innobase_mysql_fts_get_token(
+			doc->charset, doc->text.f_str + i,
+			doc->text.f_str + doc->text.f_len, &str, &offset);
+
+		ut_a(inc > 0);
+
+		/* Ignore string whose character number is less than
+		"fts_min_token_size" or more than "fts_max_token_size" */
+		if (str.f_n_char < fts_min_token_size
+		    || str.f_n_char > fts_max_token_size) {
+
+			t_ctx->processed_len += inc;
+			continue;
+		}
+
+		t_str.f_len = innobase_fts_casedn_str(
+			doc->charset, (char*) str.f_str, str.f_len,
+			(char*) &str_buf, FTS_MAX_WORD_LEN + 1);
+
+		t_str.f_str = (byte*) &str_buf;
+
+		/* if "cached_stopword" is defined, ingore words in the
+		stopword list */
+		if (t_ctx->cached_stopword
+		    && rbt_search(t_ctx->cached_stopword,
+				  &parent, &t_str) == 0) {
+
+			t_ctx->processed_len += inc;
+			continue;
+		}
+
+		/* There are FTS_NUM_AUX_INDEX auxiliary tables, find
+		out which sort buffer to put this word record in */
+		t_ctx->buf_used = fts_select_index(
+			doc->charset, t_str.f_str, t_str.f_len);
+
+		buf = sort_buf[t_ctx->buf_used];
+
+		ut_a(t_ctx->buf_used < FTS_NUM_AUX_INDEX);
+		idx = t_ctx->buf_used;
+
+		buf->tuples[buf->n_tuples + n_tuple[idx]] = field =
+			static_cast<dfield_t*>(mem_heap_alloc(
+				buf->heap,
+				FTS_NUM_FIELDS_SORT * sizeof *field));
+
+		ut_a(field);
+
+		/* The first field is the tokenized word */
+		dfield_set_data(field, t_str.f_str, t_str.f_len);
+		len = dfield_get_len(field);
+
+		field->type.mtype = word_dtype->mtype;
+		field->type.prtype = word_dtype->prtype | DATA_NOT_NULL;
+
+		/* Variable length field, set to max size. */
+		field->type.len = fts_max_token_size;
+		field->type.mbminmaxlen = word_dtype->mbminmaxlen;
+
+		cur_len += len;
+		dfield_dup(field, buf->heap);
+		field++;
+
+		/* The second field is the Doc ID */
+
+		ib_uint32_t	doc_id_32_bit;
+
+		if (!opt_doc_id_size) {
+			fts_write_doc_id((byte*) &write_doc_id, doc_id);
+
+			dfield_set_data(
+				field, &write_doc_id, sizeof(write_doc_id));
+		} else {
+			mach_write_to_4(
+				(byte*) &doc_id_32_bit, (ib_uint32_t) doc_id);
+
+			dfield_set_data(
+				field, &doc_id_32_bit, sizeof(doc_id_32_bit));
+		}
+
+		len = field->len;
+		ut_ad(len == FTS_DOC_ID_LEN || len == sizeof(ib_uint32_t));
+
+		field->type.mtype = DATA_INT;
+		field->type.prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
+		field->type.len = len;
+		field->type.mbminmaxlen = 0;
+
+		cur_len += len;
+		dfield_dup(field, buf->heap);
+
+		++field;
+
+		/* The third field is the position */
+		mach_write_to_4(
+			(byte*) &position,
+			(i + offset + inc - str.f_len + t_ctx->init_pos));
+
+		dfield_set_data(field, &position, sizeof(position));
+		len = dfield_get_len(field);
+		ut_ad(len == sizeof(ib_uint32_t));
+
+		field->type.mtype = DATA_INT;
+		field->type.prtype = DATA_NOT_NULL;
+		field->type.len = len;
+		field->type.mbminmaxlen = 0;
+		cur_len += len;
+		dfield_dup(field, buf->heap);
+
+		/* One variable length column, word with its lenght less than
+		fts_max_token_size, add one extra size and one extra byte */
+		cur_len += 2;
+
+		/* Reserve one byte for the end marker of row_merge_block_t. */
+		if (buf->total_size + data_size[idx] + cur_len
+		    >= srv_sort_buf_size - 1) {
+
+			buf_full = TRUE;
+			break;
+		}
+
+		/* Increment the number of tuples */
+		n_tuple[idx]++;
+		t_ctx->processed_len += inc;
+		data_size[idx] += cur_len;
+	}
+
+	/* Update the data length and the number of new word tuples
+	added in this round of tokenization */
+	for (i = 0; i <  FTS_NUM_AUX_INDEX; i++) {
+		sort_buf[i]->total_size += data_size[i];
+
+		sort_buf[i]->n_tuples += n_tuple[i];
+
+		merge_file[i]->n_rec += n_tuple[i];
+		t_ctx->rows_added[i] += n_tuple[i];
+	}
+
+	if (!buf_full) {
+		/* we pad one byte between text accross two fields */
+		t_ctx->init_pos += doc->text.f_len + 1;
+	}
+
+	return(!buf_full);
+}
+
+/*********************************************************************//**
+Function performs parallel tokenization of the incoming doc strings.
+It also performs the initial in memory sort of the parsed records.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_tokenization(
+/*======================*/
+	void*		arg)	/*!< in: psort_info for the thread */
+{
+	fts_psort_t*		psort_info = (fts_psort_t*) arg;
+	ulint			i;
+	fts_doc_item_t*		doc_item = NULL;
+	fts_doc_item_t*		prev_doc_item = NULL;
+	row_merge_buf_t**	buf;
+	ibool			processed = FALSE;
+	merge_file_t**		merge_file;
+	row_merge_block_t**	block;
+	int			tmpfd[FTS_NUM_AUX_INDEX];
+	ulint			mycount[FTS_NUM_AUX_INDEX];
+	ib_uint64_t		total_rec = 0;
+	ulint			num_doc_processed = 0;
+	doc_id_t		last_doc_id;
+	ulint			zip_size;
+	mem_heap_t*		blob_heap = NULL;
+	fts_doc_t		doc;
+	dict_table_t*		table = psort_info->psort_common->new_table;
+	dtype_t			word_dtype;
+	dict_field_t*		idx_field;
+	fts_tokenize_ctx_t	t_ctx;
+	ulint			retried = 0;
+	ut_ad(psort_info);
+
+	ut_ad(psort_info);
+
+	buf = psort_info->merge_buf;
+	merge_file = psort_info->merge_file;
+	blob_heap = mem_heap_create(512);
+	memset(&doc, 0, sizeof(doc));
+	memset(&t_ctx, 0, sizeof(t_ctx));
+	memset(mycount, 0, FTS_NUM_AUX_INDEX * sizeof(int));
+
+	doc.charset = fts_index_get_charset(
+		psort_info->psort_common->sort_index);
+
+	idx_field = dict_index_get_nth_field(
+		psort_info->psort_common->sort_index, 0);
+	word_dtype.prtype = idx_field->col->prtype;
+	word_dtype.mbminmaxlen = idx_field->col->mbminmaxlen;
+	word_dtype.mtype = (strcmp(doc.charset->name, "latin1_swedish_ci") == 0)
+				? DATA_VARCHAR : DATA_VARMYSQL;
+
+	block = psort_info->merge_block;
+	zip_size = dict_table_zip_size(table);
+
+	doc_item = UT_LIST_GET_FIRST(psort_info->fts_doc_list);
+
+	if (doc_item) {
+		prev_doc_item = doc_item;
+	}
+
+	t_ctx.cached_stopword = table->fts->cache->stopword_info.cached_stopword;
+	processed = TRUE;
+loop:
+	while (doc_item) {
+		dfield_t*	dfield = doc_item->field;
+
+		last_doc_id = doc_item->doc_id;
+
+		if (!(dfield->data)
+		    || dfield_get_len(dfield) == UNIV_SQL_NULL) {
+			num_doc_processed++;
+			doc_item = UT_LIST_GET_NEXT(doc_list, doc_item);
+
+			/* Always remember the last doc_item we processed */
+			if (doc_item) {
+				prev_doc_item = doc_item;
+			}
+			continue;
+		}
+
+		/* If finish processing the last item, update "doc" with
+		strings in the doc_item, otherwise continue processing last
+		item */
+		if (processed) {
+			byte*		data;
+			ulint		data_len;
+
+			dfield = doc_item->field;
+			data = static_cast<byte*>(dfield_get_data(dfield));
+			data_len = dfield_get_len(dfield);
+
+			if (dfield_is_ext(dfield)) {
+				doc.text.f_str =
+					btr_copy_externally_stored_field(
+						&doc.text.f_len, data,
+						zip_size, data_len, blob_heap);
+			} else {
+				doc.text.f_str = data;
+				doc.text.f_len = data_len;
+			}
+
+			doc.tokens = 0;
+			t_ctx.processed_len = 0;
+		} else {
+			/* Not yet finish processing the "doc" on hand,
+			continue processing it */
+			ut_ad(doc.text.f_str);
+			ut_ad(t_ctx.processed_len < doc.text.f_len);
+		}
+
+		processed = row_merge_fts_doc_tokenize(
+			buf, doc_item->doc_id, &doc,
+			&word_dtype,
+			merge_file, psort_info->psort_common->opt_doc_id_size,
+			&t_ctx);
+
+		/* Current sort buffer full, need to recycle */
+		if (!processed) {
+			ut_ad(t_ctx.processed_len < doc.text.f_len);
+			ut_ad(t_ctx.rows_added[t_ctx.buf_used]);
+			break;
+		}
+
+		num_doc_processed++;
+
+		if (fts_enable_diag_print && num_doc_processed % 10000 == 1) {
+			fprintf(stderr, "number of doc processed %d\n",
+				(int) num_doc_processed);
+#ifdef FTS_INTERNAL_DIAG_PRINT
+			for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+				fprintf(stderr, "ID %d, partition %d, word "
+					"%d\n",(int) psort_info->psort_id,
+					(int) i, (int) mycount[i]);
+			}
+#endif
+		}
+
+		mem_heap_empty(blob_heap);
+
+		if (doc_item->field->data) {
+			ut_free(doc_item->field->data);
+			doc_item->field->data = NULL;
+		}
+
+		doc_item = UT_LIST_GET_NEXT(doc_list, doc_item);
+
+		/* Always remember the last doc_item we processed */
+		if (doc_item) {
+			prev_doc_item = doc_item;
+			if (last_doc_id != doc_item->doc_id) {
+				t_ctx.init_pos = 0;
+			}
+		}
+	}
+
+	/* If we run out of current sort buffer, need to sort
+	and flush the sort buffer to disk */
+	if (t_ctx.rows_added[t_ctx.buf_used] && !processed) {
+		row_merge_buf_sort(buf[t_ctx.buf_used], NULL);
+		row_merge_buf_write(buf[t_ctx.buf_used],
+				    merge_file[t_ctx.buf_used],
+				    block[t_ctx.buf_used]);
+		row_merge_write(merge_file[t_ctx.buf_used]->fd,
+				merge_file[t_ctx.buf_used]->offset++,
+				block[t_ctx.buf_used]);
+		UNIV_MEM_INVALID(block[t_ctx.buf_used][0], srv_sort_buf_size);
+		buf[t_ctx.buf_used] = row_merge_buf_empty(buf[t_ctx.buf_used]);
+		mycount[t_ctx.buf_used] += t_ctx.rows_added[t_ctx.buf_used];
+		t_ctx.rows_added[t_ctx.buf_used] = 0;
+
+		ut_a(doc_item);
+		goto loop;
+	}
+
+	/* Parent done scanning, and if finish processing all the docs, exit */
+	if (psort_info->state == FTS_PARENT_COMPLETE) {
+	    	if (num_doc_processed >= UT_LIST_GET_LEN(
+			psort_info->fts_doc_list)) {
+			goto exit;
+		} else if (retried > 10000) {
+			ut_ad(!doc_item);
+			/* retied too many times and cannot get new record */
+			fprintf(stderr, "InnoDB: FTS parallel sort processed "
+					"%lu records, the sort queue has "
+					"%lu records. But sort cannot get "
+					"the next records", num_doc_processed,
+					UT_LIST_GET_LEN(
+						psort_info->fts_doc_list));
+			goto exit;
+		}
+	}
+
+	if (doc_item) {
+		doc_item = UT_LIST_GET_NEXT(doc_list, doc_item);
+	} else if (prev_doc_item) {
+		os_thread_yield();
+		doc_item = UT_LIST_GET_NEXT(doc_list, prev_doc_item);
+	} else {
+		os_thread_yield();
+		doc_item = UT_LIST_GET_FIRST(psort_info->fts_doc_list);
+	}
+
+	if (doc_item) {
+		 prev_doc_item = doc_item;
+		retried = 0;
+	} else if (psort_info->state == FTS_PARENT_COMPLETE) {
+		retried++;
+	}
+
+	goto loop;
+
+exit:
+	for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+		if (t_ctx.rows_added[i]) {
+			row_merge_buf_sort(buf[i], NULL);
+			row_merge_buf_write(
+				buf[i], (const merge_file_t*) merge_file[i],
+				block[i]);
+			row_merge_write(merge_file[i]->fd,
+					merge_file[i]->offset++, block[i]);
+
+			UNIV_MEM_INVALID(block[i][0], srv_sort_buf_size);
+			buf[i] = row_merge_buf_empty(buf[i]);
+			t_ctx.rows_added[i] = 0;
+		}
+	}
+
+	if (fts_enable_diag_print) {
+		DEBUG_FTS_SORT_PRINT("  InnoDB_FTS: start merge sort\n");
+	}
+
+	for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+
+		if (!merge_file[i]->offset) {
+			continue;
+		}
+
+		tmpfd[i] = innobase_mysql_tmpfile();
+		row_merge_sort(psort_info->psort_common->trx,
+				       psort_info->psort_common->sort_index,
+				       merge_file[i],
+				       (row_merge_block_t*) block[i], &tmpfd[i],
+				       psort_info->psort_common->table);
+		total_rec += merge_file[i]->n_rec;
+		close(tmpfd[i]);
+	}
+
+	if (fts_enable_diag_print) {
+		DEBUG_FTS_SORT_PRINT("  InnoDB_FTS: complete merge sort\n");
+	}
+
+	mem_heap_free(blob_heap);
+
+	psort_info->child_status = FTS_CHILD_COMPLETE;
+	os_event_set(psort_info->psort_common->sort_event);
+
+	os_thread_exit(NULL);
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+Start the parallel tokenization and parallel merge sort */
+UNIV_INTERN
+void
+row_fts_start_psort(
+/*================*/
+	fts_psort_t*	psort_info)	/*!< parallel sort structure */
+{
+	ulint		i = 0;
+	os_thread_id_t	thd_id;
+
+	for (i = 0; i < fts_sort_pll_degree; i++) {
+		psort_info[i].psort_id = i;
+		os_thread_create(fts_parallel_tokenization,
+				 (void*) &psort_info[i], &thd_id);
+	}
+}
+
+/*********************************************************************//**
+Function performs the merge and insertion of the sorted records.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_merge(
+/*===============*/
+	void*		arg)		/*!< in: parallel merge info */
+{
+	fts_psort_t*	psort_info = (fts_psort_t*) arg;
+	ulint		id;
+
+	ut_ad(psort_info);
+
+	id = psort_info->psort_id;
+
+	row_fts_merge_insert(psort_info->psort_common->sort_index,
+			     psort_info->psort_common->new_table,
+			     psort_info->psort_common->all_info, id);
+
+	psort_info->child_status = FTS_CHILD_COMPLETE;
+	os_event_set(psort_info->psort_common->sort_event);
+
+	os_thread_exit(NULL);
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+Kick off the parallel merge and insert thread */
+UNIV_INTERN
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+	fts_psort_t*	merge_info)	/*!< in: parallel sort info */
+{
+	int		i = 0;
+	os_thread_id_t	thd_id;
+
+	/* Kick off merge/insert threads */
+	for (i = 0; i <  FTS_NUM_AUX_INDEX; i++) {
+		merge_info[i].psort_id = i;
+		merge_info[i].child_status = 0;
+
+		os_thread_create(fts_parallel_merge,
+				 (void*) &merge_info[i], &thd_id);
+	}
+}
+
+/********************************************************************//**
+Insert processed FTS data to auxillary index tables.
+@return	DB_SUCCESS if insertion runs fine */
+UNIV_INTERN
+ulint
+row_merge_write_fts_word(
+/*=====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		ins_graph,	/*!< in: Insert query graphs */
+	fts_tokenizer_word_t* word,	/*!< in: sorted and tokenized
+					word */
+	fts_table_t*	fts_table,	/*!< in: fts aux table instance */
+	CHARSET_INFO*	charset)	/*!< in: charset */
+{
+	ulint	selected;
+	ulint	ret = DB_SUCCESS;
+
+	selected = fts_select_index(
+		charset, word->text.f_str, word->text.f_len);
+	fts_table->suffix = fts_get_suffix(selected);
+
+	/* Pop out each fts_node in word->nodes write them to auxiliary table */
+	while(ib_vector_size(word->nodes) > 0) {
+		ulint		error;
+		fts_node_t*	fts_node;
+
+		fts_node = static_cast<fts_node_t*>(ib_vector_pop(word->nodes));
+
+		error = fts_write_node(
+			trx, &ins_graph[selected], fts_table, &word->text,
+			fts_node);
+
+		if (error != DB_SUCCESS) {
+			fprintf(stderr, "InnoDB: failed to write"
+				" word %s to FTS auxiliary index"
+				" table, error (%lu) \n",
+				word->text.f_str, error);
+			ret = error;
+		}
+
+		ut_free(fts_node->ilist);
+		fts_node->ilist = NULL;
+	}
+
+	return(ret);
+}
+
+/*********************************************************************//**
+Read sorted FTS data files and insert data tuples to auxillary tables.
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+void
+row_fts_insert_tuple(
+/*=================*/
+	fts_psort_insert_t*
+			ins_ctx,	/*!< in: insert context */
+	fts_tokenizer_word_t* word,	/*!< in: last processed
+					tokenized word */
+	ib_vector_t*	positions,	/*!< in: word position */
+	doc_id_t*	in_doc_id,	/*!< in: last item doc id */
+	dtuple_t*	dtuple)		/*!< in: entry to insert */
+{
+	fts_node_t*	fts_node = NULL;
+	dfield_t*	dfield;
+	doc_id_t	doc_id;
+	ulint		position;
+	fts_string_t	token_word;
+	ulint		i;
+
+	/* Get fts_node for the FTS auxillary INDEX table */
+	if (ib_vector_size(word->nodes) > 0) {
+		fts_node = static_cast<fts_node_t*>(
+			ib_vector_last(word->nodes));
+	}
+
+	if (fts_node == NULL
+	    || fts_node->ilist_size > FTS_ILIST_MAX_SIZE) {
+
+		fts_node = static_cast<fts_node_t*>(
+			ib_vector_push(word->nodes, NULL));
+
+		memset(fts_node, 0x0, sizeof(*fts_node));
+	}
+
+	/* If dtuple == NULL, this is the last word to be processed */
+	if (!dtuple) {
+		if (fts_node && ib_vector_size(positions) > 0) {
+			fts_cache_node_add_positions(
+				NULL, fts_node, *in_doc_id,
+				positions);
+
+			/* Write out the current word */
+			row_merge_write_fts_word(ins_ctx->trx,
+						 ins_ctx->ins_graph, word,
+						 &ins_ctx->fts_table,
+						 ins_ctx->charset);
+
+		}
+
+		return;
+	}
+
+	/* Get the first field for the tokenized word */
+	dfield = dtuple_get_nth_field(dtuple, 0);
+
+	token_word.f_n_char = 0;
+	token_word.f_len = dfield->len;
+	token_word.f_str = static_cast<byte*>(dfield_get_data(dfield));
+
+	if (!word->text.f_str) {
+		fts_utf8_string_dup(&word->text, &token_word, ins_ctx->heap);
+	}
+
+	/* compare to the last word, to see if they are the same
+	word */
+	if (innobase_fts_text_cmp(ins_ctx->charset,
+				  &word->text, &token_word) != 0) {
+		ulint	num_item;
+
+		/* Getting a new word, flush the last position info
+		for the currnt word in fts_node */
+		if (ib_vector_size(positions) > 0) {
+			fts_cache_node_add_positions(
+				NULL, fts_node, *in_doc_id, positions);
+		}
+
+		/* Write out the current word */
+		row_merge_write_fts_word(ins_ctx->trx, ins_ctx->ins_graph,
+					 word, &ins_ctx->fts_table,
+					 ins_ctx->charset);
+
+		/* Copy the new word */
+		fts_utf8_string_dup(&word->text, &token_word, ins_ctx->heap);
+
+		num_item = ib_vector_size(positions);
+
+		/* Clean up position queue */
+		for (i = 0; i < num_item; i++) {
+			ib_vector_pop(positions);
+		}
+
+		/* Reset Doc ID */
+		*in_doc_id = 0;
+		memset(fts_node, 0x0, sizeof(*fts_node));
+	}
+
+	/* Get the word's Doc ID */
+	dfield = dtuple_get_nth_field(dtuple, 1);
+
+	if (!ins_ctx->opt_doc_id_size) {
+		doc_id = fts_read_doc_id(
+			static_cast<byte*>(dfield_get_data(dfield)));
+	} else {
+		doc_id = (doc_id_t) mach_read_from_4(
+			static_cast<byte*>(dfield_get_data(dfield)));
+	}
+
+	/* Get the word's position info */
+	dfield = dtuple_get_nth_field(dtuple, 2);
+	position = mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield)));
+
+	/* If this is the same word as the last word, and they
+	have the same Doc ID, we just need to add its position
+	info. Otherwise, we will flush position info to the
+	fts_node and initiate a new position vector  */
+	if (!(*in_doc_id) || *in_doc_id == doc_id) {
+		ib_vector_push(positions, &position);
+	} else {
+		ulint	num_pos = ib_vector_size(positions);
+
+		fts_cache_node_add_positions(NULL, fts_node,
+					     *in_doc_id, positions);
+		for (i = 0; i < num_pos; i++) {
+			ib_vector_pop(positions);
+		}
+		ib_vector_push(positions, &position);
+	}
+
+	/* record the current Doc ID */
+	*in_doc_id = doc_id;
+}
+
+/*********************************************************************//**
+Propagate a newly added record up one level in the selection tree
+@return parent where this value propagated to */
+static
+int
+row_fts_sel_tree_propagate(
+/*=======================*/
+	int		propogated,	/*<! in: tree node propagated */
+	int*		sel_tree,	/*<! in: selection tree */
+	const mrec_t**	mrec,		/*<! in: sort record */
+	ulint**		offsets,	/*<! in: record offsets */
+	dict_index_t*	index)		/*<! in/out: FTS index */
+{
+	ulint	parent;
+	int	child_left;
+	int	child_right;
+	int	selected;
+	ibool	null_eq = FALSE;
+
+	/* Find which parent this value will be propagated to */
+	parent = (propogated - 1) / 2;
+
+	/* Find out which value is smaller, and to propagate */
+	child_left = sel_tree[parent * 2 + 1];
+	child_right = sel_tree[parent * 2 + 2];
+
+	if (child_left == -1 || mrec[child_left] == NULL) {
+		if (child_right == -1
+		    || mrec[child_right] == NULL) {
+			selected = -1;
+		} else {
+			selected = child_right ;
+		}
+	} else if (child_right == -1
+		   || mrec[child_right] == NULL) {
+		selected = child_left;
+	} else if (row_merge_cmp(mrec[child_left], mrec[child_right],
+				 offsets[child_left],
+				 offsets[child_right],
+				 index, &null_eq) < 0) {
+		selected = child_left;
+	} else {
+		selected = child_right;
+	}
+
+	sel_tree[parent] = selected;
+
+	return(parent);
+}
+
+/*********************************************************************//**
+Readjust selection tree after popping the root and read a new value
+@return the new root */
+static
+int
+row_fts_sel_tree_update(
+/*====================*/
+	int*		sel_tree,	/*<! in/out: selection tree */
+	ulint		propagated,	/*<! in: node to propagate up */
+	ulint		height,		/*<! in: tree height */
+	const mrec_t**	mrec,		/*<! in: sort record */
+	ulint**		offsets,	/*<! in: record offsets */
+	dict_index_t*	index)		/*<! in: index dictionary */
+{
+	ulint	i;
+
+	for (i = 1; i <= height; i++) {
+		propagated = row_fts_sel_tree_propagate(
+			propagated, sel_tree, mrec, offsets, index);
+	}
+
+	return(sel_tree[0]);
+}
+
+/*********************************************************************//**
+Build selection tree at a specified level */
+static
+void
+row_fts_build_sel_tree_level(
+/*=========================*/
+	int*		sel_tree,	/*<! in/out: selection tree */
+	ulint		level,		/*<! in: selection tree level */
+	const mrec_t**	mrec,		/*<! in: sort record */
+	ulint**		offsets,	/*<! in: record offsets */
+	dict_index_t*	index)		/*<! in: index dictionary */
+{
+	ulint	start;
+	int	child_left;
+	int	child_right;
+	ulint	i;
+	ulint	num_item;
+
+	start = (1 << level) - 1;
+	num_item = (1 << level);
+
+	for (i = 0; i < num_item;  i++) {
+		ibool	null_eq = FALSE;
+
+		child_left = sel_tree[(start + i) * 2 + 1];
+		child_right = sel_tree[(start + i) * 2 + 2];
+
+		if (child_left == -1) {
+			if (child_right == -1) {
+				sel_tree[start + i] = -1;
+			} else {
+				sel_tree[start + i] =  child_right;
+			}
+			continue;
+		} else if (child_right == -1) {
+			sel_tree[start + i] = child_left;
+			continue;
+		}
+
+		/* Deal with NULL child conditions */
+		if (!mrec[child_left]) {
+			if (!mrec[child_right]) {
+				sel_tree[start + i] = -1;
+			} else {
+				sel_tree[start + i] = child_right;
+			}
+			continue;
+		} else if (!mrec[child_right]) {
+			sel_tree[start + i] = child_left;
+			continue;
+		}
+
+		/* Select the smaller one to set parent pointer */
+		if (row_merge_cmp(mrec[child_left], mrec[child_right],
+				  offsets[child_left],
+				  offsets[child_right],
+				  index, &null_eq) < 0) {
+			sel_tree[start + i] = child_left;
+		} else {
+			sel_tree[start + i] = child_right;
+		}
+	}
+}
+
+/*********************************************************************//**
+Build a selection tree for merge. The selection tree is a binary tree
+and should have fts_sort_pll_degree / 2 levels. With root as level 0
+@return number of tree levels */
+static
+ulint
+row_fts_build_sel_tree(
+/*===================*/
+	int*		sel_tree,	/*<! in/out: selection tree */
+	const mrec_t**	mrec,		/*<! in: sort record */
+	ulint**		offsets,	/*<! in: record offsets */
+	dict_index_t*	index)		/*<! in: index dictionary */
+{
+	ulint	treelevel = 1;
+	ulint	num = 2;
+	int	i = 0;
+	ulint	start;
+
+	/* No need to build selection tree if we only have two merge threads */
+	if (fts_sort_pll_degree <= 2) {
+		return(0);
+	}
+
+	while (num < fts_sort_pll_degree) {
+		num = num << 1;
+		treelevel++;
+	}
+
+	start = (1 << treelevel) - 1;
+
+	for (i = 0; i < (int) fts_sort_pll_degree; i++) {
+		sel_tree[i + start] = i;
+	}
+
+	for (i = treelevel - 1; i >=0; i--) {
+		row_fts_build_sel_tree_level(sel_tree, i, mrec, offsets, index);
+	}
+
+	return(treelevel);
+}
+
+/*********************************************************************//**
+Read sorted file containing index data tuples and insert these data
+tuples to the index
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+row_fts_merge_insert(
+/*=================*/
+	dict_index_t*		index,	/*!< in: index */
+	dict_table_t*		table,	/*!< in: new table */
+	fts_psort_t*		psort_info, /*!< parallel sort info */
+	ulint			id)	/* !< in: which auxiliary table's data
+					to insert to */
+{
+	const byte**		b;
+	mem_heap_t*		tuple_heap;
+	mem_heap_t*		heap;
+	ulint			error = DB_SUCCESS;
+	ulint*			foffs;
+	ulint**			offsets;
+	fts_tokenizer_word_t	new_word;
+	ib_vector_t*		positions;
+	doc_id_t		last_doc_id;
+	ib_alloc_t*		heap_alloc;
+	ulint			n_bytes;
+	ulint			i;
+	mrec_buf_t**		buf;
+	int*			fd;
+	byte**			block;
+	const mrec_t**		mrec;
+	ulint			count = 0;
+	int*			sel_tree;
+	ulint			height;
+	ulint			start;
+	fts_psort_insert_t	ins_ctx;
+	ulint			count_diag = 0;
+
+	ut_ad(index);
+	ut_ad(table);
+
+	/* We use the insert query graph as the dummy graph
+	needed in the row module call */
+
+	ins_ctx.trx = trx_allocate_for_background();
+
+	ins_ctx.trx->op_info = "inserting index entries";
+
+	ins_ctx.opt_doc_id_size = psort_info[0].psort_common->opt_doc_id_size;
+
+	heap = mem_heap_create(500 + sizeof(mrec_buf_t));
+
+	b = (const byte**) mem_heap_alloc(
+		heap, sizeof (*b) * fts_sort_pll_degree);
+	foffs = (ulint*) mem_heap_alloc(
+		heap, sizeof(*foffs) * fts_sort_pll_degree);
+	offsets = (ulint**) mem_heap_alloc(
+		heap, sizeof(*offsets) * fts_sort_pll_degree);
+	buf = (mrec_buf_t**) mem_heap_alloc(
+		heap, sizeof(*buf) * fts_sort_pll_degree);
+	fd = (int*) mem_heap_alloc(heap, sizeof(*fd) * fts_sort_pll_degree);
+	block = (byte**) mem_heap_alloc(
+		heap, sizeof(*block) * fts_sort_pll_degree);
+	mrec = (const mrec_t**) mem_heap_alloc(
+		heap, sizeof(*mrec) * fts_sort_pll_degree);
+	sel_tree = (int*) mem_heap_alloc(
+		heap, sizeof(*sel_tree) * (fts_sort_pll_degree * 2));
+
+	tuple_heap = mem_heap_create(1000);
+
+	ins_ctx.charset = fts_index_get_charset(index);
+	ins_ctx.heap = heap;
+
+	for (i = 0; i < fts_sort_pll_degree; i++) {
+		ulint	num;
+
+		num = 1 + REC_OFFS_HEADER_SIZE
+			+ dict_index_get_n_fields(index);
+		offsets[i] = static_cast<ulint*>(mem_heap_zalloc(
+			heap, num * sizeof *offsets[i]));
+		offsets[i][0] = num;
+		offsets[i][1] = dict_index_get_n_fields(index);
+		block[i] = psort_info[i].merge_block[id];
+		b[i] = psort_info[i].merge_block[id];
+		fd[i] = psort_info[i].merge_file[id]->fd;
+		foffs[i] = 0;
+
+		buf[i] = static_cast<unsigned char (*)[16384]>(
+			mem_heap_alloc(heap, sizeof *buf[i]));
+		count_diag += (int) psort_info[i].merge_file[id]->n_rec;
+	}
+
+	if (fts_enable_diag_print) { 
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB_FTS: to inserted %lu records\n",
+			(ulong) count_diag);
+	}
+
+	/* Initialize related variables if creating FTS indexes */
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	memset(&new_word, 0, sizeof(new_word));
+
+	new_word.nodes = ib_vector_create(heap_alloc, sizeof(fts_node_t), 4);
+	positions = ib_vector_create(heap_alloc, sizeof(ulint), 32);
+	last_doc_id = 0;
+
+	/* Allocate insert query graphs for FTS auxillary
+	Index Table, note we have FTS_NUM_AUX_INDEX such index tables */
+	n_bytes = sizeof(que_t*) * (FTS_NUM_AUX_INDEX + 1);
+	ins_ctx.ins_graph = static_cast<que_t**>(mem_heap_alloc(heap, n_bytes));
+	memset(ins_ctx.ins_graph, 0x0, n_bytes);
+
+	ins_ctx.fts_table.type = FTS_INDEX_TABLE;
+	ins_ctx.fts_table.index_id = index->id;
+	ins_ctx.fts_table.table_id = table->id;
+	ins_ctx.fts_table.parent = index->table->name;
+	ins_ctx.fts_table.table = NULL;
+
+	for (i = 0; i < fts_sort_pll_degree; i++) {
+		if (psort_info[i].merge_file[id]->n_rec == 0) {
+			/* No Rows to read */
+			mrec[i] = b[i] = NULL;
+		} else {
+			if (!row_merge_read(fd[i], foffs[i],
+			    (row_merge_block_t*) block[i])) {
+				error = DB_CORRUPTION;
+				goto exit;
+			}
+
+			ROW_MERGE_READ_GET_NEXT(i);
+		}
+	}
+
+	height = row_fts_build_sel_tree(sel_tree, (const mrec_t **) mrec,
+					offsets, index);
+
+	start = (1 << height) - 1;
+
+	/* Fetch sorted records from sort buffer and insert them into
+	corresponding FTS index auxiliary tables */
+	for (;;) {
+		dtuple_t*	dtuple;
+		ulint		n_ext;
+		int		min_rec = 0;
+
+		if (fts_sort_pll_degree <= 2) {
+			while (!mrec[min_rec]) {
+				min_rec++;
+
+				if (min_rec >= (int) fts_sort_pll_degree) {
+					row_fts_insert_tuple(
+						&ins_ctx, &new_word,
+						positions, &last_doc_id,
+						NULL);
+
+					goto exit;
+				}
+			}
+
+			for (i = min_rec + 1; i < fts_sort_pll_degree; i++) {
+				ibool           null_eq = FALSE;
+				if (!mrec[i]) {
+					continue;
+				}
+
+				if (row_merge_cmp(mrec[i], mrec[min_rec],
+						  offsets[i], offsets[min_rec],
+						  index, &null_eq) < 0) {
+					min_rec = i;
+				}
+			}
+		} else {
+			min_rec = sel_tree[0];
+
+			if (min_rec ==  -1) {
+				row_fts_insert_tuple(
+					&ins_ctx, &new_word,
+					positions, &last_doc_id,
+					NULL);
+
+				goto exit;
+			}
+		}
+
+		dtuple = row_rec_to_index_entry_low(
+			mrec[min_rec], index, offsets[min_rec], &n_ext,
+			tuple_heap);
+
+		row_fts_insert_tuple(
+			&ins_ctx, &new_word, positions,
+			&last_doc_id, dtuple);
+
+
+		ROW_MERGE_READ_GET_NEXT(min_rec);
+
+		if (fts_sort_pll_degree > 2) {
+			if (!mrec[min_rec]) {
+				sel_tree[start + min_rec] = -1;
+			}
+
+			row_fts_sel_tree_update(sel_tree, start + min_rec,
+						height, mrec,
+						offsets, index);
+		}
+
+		count++;
+
+		mem_heap_empty(tuple_heap);
+	}
+
+exit:
+	fts_sql_commit(ins_ctx.trx);
+
+	ins_ctx.trx->op_info = "";
+
+	mem_heap_free(tuple_heap);
+
+	for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+		if (ins_ctx.ins_graph[i]) {
+			fts_que_graph_free(ins_ctx.ins_graph[i]);
+		}
+	}
+
+	trx_free_for_background(ins_ctx.trx);
+
+	mem_heap_free(heap);
+
+	if (fts_enable_diag_print) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB_FTS: inserted %lu records\n",
+			(ulong) count);
+	}
+
+	return(error);
+}
diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.cc
index 2b77c6f929d..8476adfcfca 100644
--- a/storage/innobase/row/row0ins.c
+++ b/storage/innobase/row/row0ins.cc
@@ -17,14 +17,14 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0ins.c
+@file row/row0ins.cc
 Insert into a table
 
 Created 4/20/1996 Heikki Tuuri
 *******************************************************/
 
 #include "m_string.h" /* for my_sys.h */
-#include "my_sys.h" /* DEBUG_SYNC_C */
+#include "my_sys.h" /* DEBUG_SYNC_C_IF_THD */
 #include "row0ins.h"
 
 #ifdef UNIV_NONINL
@@ -49,11 +49,8 @@ Created 4/20/1996 Heikki Tuuri
 #include "data0data.h"
 #include "usr0sess.h"
 #include "buf0lru.h"
-#include "m_string.h"
-#include "my_sys.h"
-
-#define	ROW_INS_PREV	1
-#define	ROW_INS_NEXT	2
+#include "fts0fts.h"
+#include "fts0types.h"
 
 /*************************************************************************
 IMPORTANT NOTE: Any operation that generates redo MUST check that there
@@ -78,7 +75,8 @@ ins_node_create(
 {
 	ins_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(ins_node_t));
+	node = static_cast<ins_node_t*>(
+		mem_heap_alloc(heap, sizeof(ins_node_t)));
 
 	node->common.type = QUE_NODE_INSERT;
 
@@ -115,17 +113,18 @@ ins_node_create_entry_list(
 
 	UT_LIST_INIT(node->entry_list);
 
-	index = dict_table_get_first_index(node->table);
+	/* We will include all indexes (include those corrupted
+	secondary indexes) in the entry list. Filteration of
+	these corrupted index will be done in row_ins() */
 
-	while (index != NULL) {
-		entry = row_build_index_entry(node->row, NULL, index,
-					      node->entry_sys_heap);
-		UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
+	for (index = dict_table_get_first_index(node->table);
+	     index != 0;
+	     index = dict_table_get_next_index(index)) {
 
-		/* We will include all indexes (include those corrupted
-		secondary indexes) in the entry list. Filteration of
-		these corrupted index will be done in row_ins() */
-		index = dict_table_get_next_index(index);
+		entry = row_build_index_entry(
+			node->row, NULL, index, node->entry_sys_heap);
+
+		UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
 	}
 }
 
@@ -157,7 +156,7 @@ row_ins_alloc_sys_fields(
 
 	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
 
-	ptr = mem_heap_zalloc(heap, DATA_ROW_ID_LEN);
+	ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_ROW_ID_LEN));
 
 	dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
 
@@ -168,7 +167,7 @@ row_ins_alloc_sys_fields(
 	col = dict_table_get_sys_col(table, DATA_TRX_ID);
 
 	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
-	ptr = mem_heap_zalloc(heap, DATA_TRX_ID_LEN);
+	ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_TRX_ID_LEN));
 
 	dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
 
@@ -179,7 +178,7 @@ row_ins_alloc_sys_fields(
 	col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
 
 	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
-	ptr = mem_heap_zalloc(heap, DATA_ROLL_PTR_LEN);
+	ptr = static_cast<byte*>(mem_heap_zalloc(heap, DATA_ROLL_PTR_LEN));
 
 	dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
 }
@@ -372,22 +371,19 @@ row_ins_cascade_ancestor_updates_table(
 	dict_table_t*	table)	/*!< in: table */
 {
 	que_node_t*	parent;
-	upd_node_t*	upd_node;
 
-	parent = que_node_get_parent(node);
+	for (parent = que_node_get_parent(node);
+	     que_node_get_type(parent) == QUE_NODE_UPDATE;
+	     parent = que_node_get_parent(parent)) {
 
-	while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
+		upd_node_t*	upd_node;
 
-		upd_node = parent;
+		upd_node = static_cast<upd_node_t*>(parent);
 
 		if (upd_node->table == table && upd_node->is_delete == FALSE) {
 
 			return(TRUE);
 		}
-
-		parent = que_node_get_parent(parent);
-
-		ut_a(parent);
 	}
 
 	return(FALSE);
@@ -406,14 +402,11 @@ row_ins_cascade_n_ancestors(
 	que_node_t*	parent;
 	ulint		n_ancestors = 0;
 
-	parent = que_node_get_parent(node);
+	for (parent = que_node_get_parent(node);
+	     que_node_get_type(parent) == QUE_NODE_UPDATE;
+	     parent = que_node_get_parent(parent)) {
 
-	while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
 		n_ancestors++;
-
-		parent = que_node_get_parent(parent);
-
-		ut_a(parent);
 	}
 
 	return(n_ancestors);
@@ -434,8 +427,10 @@ row_ins_cascade_calc_update_vec(
 					table */
 	dict_foreign_t*	foreign,	/*!< in: foreign key constraint whose
 					type is != 0 */
-	mem_heap_t*	heap)		/*!< in: memory heap to use as
+	mem_heap_t*	heap,		/*!< in: memory heap to use as
 					temporary storage */
+	trx_t*		trx,		/*!< in: update transaction */
+	ibool*		fts_col_affected)/*!< out: is FTS column affected */
 {
 	upd_node_t*	cascade		= node->cascade_node;
 	dict_table_t*	table		= foreign->foreign_table;
@@ -448,6 +443,9 @@ row_ins_cascade_calc_update_vec(
 	ulint		parent_field_no;
 	ulint		i;
 	ulint		j;
+	ibool		doc_id_updated = FALSE;
+	ulint		doc_id_pos = 0;
+	doc_id_t	new_doc_id = FTS_NULL_DOC_ID;
 
 	ut_a(node);
 	ut_a(foreign);
@@ -472,6 +470,13 @@ row_ins_cascade_calc_update_vec(
 
 	n_fields_updated = 0;
 
+	*fts_col_affected = FALSE;
+
+	if (table->fts) {
+		doc_id_pos = dict_table_get_nth_col_pos(
+			table, table->fts->doc_col);
+	}
+
 	for (i = 0; i < foreign->n_fields; i++) {
 
 		parent_field_no = dict_table_get_nth_col_pos(
@@ -527,7 +532,9 @@ row_ins_cascade_calc_update_vec(
 					col->prtype, col->mbminmaxlen,
 					col->len,
 					ufield_len,
-					dfield_get_data(&ufield->new_val))
+					static_cast<char*>(
+						dfield_get_data(
+							&ufield->new_val)))
 				    < ufield_len) {
 
 					return(ULINT_UNDEFINED);
@@ -552,8 +559,9 @@ row_ins_cascade_calc_update_vec(
 					byte*	padded_data;
 					ulint	mbminlen;
 
-					padded_data = mem_heap_alloc(
-						heap, min_size);
+					padded_data = static_cast<byte*>(
+						mem_heap_alloc(
+							heap, min_size));
 
 					pad = padded_data + ufield_len;
 					pad_len = min_size - ufield_len;
@@ -582,11 +590,91 @@ row_ins_cascade_calc_update_vec(
 							padded_data, min_size);
 				}
 
+				/* Check whether the current column has
+				FTS index on it */
+				if (table->fts
+				    && dict_table_is_fts_column(
+					table->fts->indexes,
+					dict_col_get_no(col))
+					!= ULINT_UNDEFINED) {
+					*fts_col_affected = TRUE;
+				}
+
+				/* If Doc ID is updated, check whether the
+				Doc ID is valid */
+				if (table->fts
+				    && ufield->field_no == doc_id_pos) {
+					doc_id_t	n_doc_id;
+
+					n_doc_id =
+						table->fts->cache->next_doc_id;
+
+					new_doc_id = fts_read_doc_id(
+						static_cast<const byte*>(
+							dfield_get_data(
+							&ufield->new_val)));
+
+					if (new_doc_id <= 0) {
+						fprintf(stderr,
+							"InnoDB: FTS Doc ID "
+							"must be larger than "
+							"0 \n");
+						return(ULINT_UNDEFINED);
+					}
+
+					if (new_doc_id < n_doc_id) {
+						fprintf(stderr,
+						       "InnoDB: FTS Doc ID "
+						       "must be larger than "
+						       IB_ID_FMT" for table",
+						       n_doc_id -1);
+
+						ut_print_name(stderr, trx,
+							      TRUE,
+							      table->name);
+
+						putc('\n', stderr);
+						return(ULINT_UNDEFINED);
+					}
+
+					*fts_col_affected = TRUE;
+					doc_id_updated = TRUE;
+				}
+
 				n_fields_updated++;
 			}
 		}
 	}
 
+	/* Generate a new Doc ID if FTS index columns get updated */
+	if (table->fts && *fts_col_affected) {
+		if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+			doc_id_t	doc_id;
+                        upd_field_t*	ufield;
+
+			ut_ad(!doc_id_updated);
+			ufield = update->fields + n_fields_updated;
+			fts_get_next_doc_id(table, &trx->fts_next_doc_id);
+			doc_id = fts_update_doc_id(table, ufield,
+						   &trx->fts_next_doc_id);
+			n_fields_updated++;
+			fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL);
+		} else  {
+			if (doc_id_updated) {
+				ut_ad(new_doc_id);
+				fts_trx_add_op(trx, table, new_doc_id,
+					       FTS_INSERT, NULL);
+			} else {
+				fprintf(stderr, "InnoDB: FTS Doc ID must be "
+					"updated along with FTS indexed "
+					"column for table ");
+				ut_print_name(stderr, trx, TRUE, table->name);
+				putc('\n', stderr);
+				return(ULINT_UNDEFINED);
+			}
+		}
+	}
+
 	update->n_fields = n_fields_updated;
 
 	return(n_fields_updated);
@@ -619,6 +707,41 @@ row_ins_set_detailed(
 }
 
 /*********************************************************************//**
+Acquires dict_foreign_err_mutex, rewinds dict_foreign_err_file
+and displays information about the given transaction.
+The caller must release dict_foreign_err_mutex. */
+static
+void
+row_ins_foreign_trx_print(
+/*======================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	ulint	n_lock_rec;
+	ulint	n_lock_struct;
+	ulint	heap_size;
+
+	lock_mutex_enter();
+	n_lock_rec = lock_number_of_rows_locked(&trx->lock);
+	n_lock_struct = UT_LIST_GET_LEN(trx->lock.trx_locks);
+	heap_size = mem_heap_get_size(trx->lock.lock_heap);
+	lock_mutex_exit();
+
+	mutex_enter(&trx_sys->mutex);
+
+	mutex_enter(&dict_foreign_err_mutex);
+	rewind(dict_foreign_err_file);
+	ut_print_timestamp(dict_foreign_err_file);
+	fputs(" Transaction:\n", dict_foreign_err_file);
+
+	trx_print_low(dict_foreign_err_file, trx, 600,
+		      n_lock_rec, n_lock_struct, heap_size);
+
+	mutex_exit(&trx_sys->mutex);
+
+	ut_ad(mutex_own(&dict_foreign_err_mutex));
+}
+
+/*********************************************************************//**
 Reports a foreign key error associated with an update or a delete of a
 parent table index entry. */
 static
@@ -640,11 +763,7 @@ row_ins_foreign_report_err(
 
 	row_ins_set_detailed(trx, foreign);
 
-	mutex_enter(&dict_foreign_err_mutex);
-	rewind(ef);
-	ut_print_timestamp(ef);
-	fputs(" Transaction:\n", ef);
-	trx_print(ef, trx, 600);
+	row_ins_foreign_trx_print(trx);
 
 	fputs("Foreign key constraint fails for table ", ef);
 	ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
@@ -694,11 +813,8 @@ row_ins_foreign_report_add_err(
 
 	row_ins_set_detailed(trx, foreign);
 
-	mutex_enter(&dict_foreign_err_mutex);
-	rewind(ef);
-	ut_print_timestamp(ef);
-	fputs(" Transaction:\n", ef);
-	trx_print(ef, trx, 600);
+	row_ins_foreign_trx_print(trx);
+
 	fputs("Foreign key constraint fails for table ", ef);
 	ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
 	fputs(":\n", ef);
@@ -793,6 +909,8 @@ row_ins_foreign_check_on_constraint(
 	ulint		i;
 	trx_t*		trx;
 	mem_heap_t*	tmp_heap	= NULL;
+	doc_id_t	doc_id = FTS_NULL_DOC_ID;
+	ibool		fts_col_affacted = FALSE;
 
 	ut_a(thr);
 	ut_a(foreign);
@@ -804,12 +922,12 @@ row_ins_foreign_check_on_constraint(
 	/* Since we are going to delete or update a row, we have to invalidate
 	the MySQL query cache for table. A deadlock of threads is not possible
 	here because the caller of this function does not hold any latches with
-	the sync0sync.h rank above the kernel mutex. The query cache mutex has
-	a rank just above the kernel mutex. */
+	the sync0sync.h rank above the lock_sys_t::mutex. The query cache mutex
+       	has a rank just above the lock_sys_t::mutex. */
 
 	row_ins_invalidate_query_cache(thr, table->name);
 
-	node = thr->run_node;
+	node = static_cast<upd_node_t*>(thr->run_node);
 
 	if (node->is_delete && 0 == (foreign->type
 				     & (DICT_FOREIGN_ON_DELETE_CASCADE
@@ -913,6 +1031,8 @@ row_ins_foreign_check_on_constraint(
 
 	rec = btr_pcur_get_rec(pcur);
 
+	tmp_heap = mem_heap_create(256);
+
 	if (dict_index_is_clust(index)) {
 		/* pcur is already positioned in the clustered index of
 		the child table */
@@ -926,8 +1046,6 @@ row_ins_foreign_check_on_constraint(
 
 		clust_index = dict_table_get_first_index(table);
 
-		tmp_heap = mem_heap_create(256);
-
 		ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec,
 					tmp_heap);
 		btr_pcur_open_with_no_init(clust_index, ref,
@@ -989,6 +1107,10 @@ row_ins_foreign_check_on_constraint(
 		goto nonstandard_exit_func;
 	}
 
+	if (table->fts) {
+		doc_id = fts_get_doc_id_from_rec(table, clust_rec, tmp_heap);
+	}
+
 	if (node->is_delete
 	    ? (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)
 	    : (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL)) {
@@ -1012,6 +1134,31 @@ row_ins_foreign_check_on_constraint(
 			ufield->orig_len = 0;
 			ufield->exp = NULL;
 			dfield_set_null(&ufield->new_val);
+
+			if (table->fts && dict_table_is_fts_column(
+				table->fts->indexes,
+				dict_index_get_nth_col_no(index, i))
+				!= ULINT_UNDEFINED) {
+				fts_col_affacted = TRUE;
+			}
+		}
+
+		if (fts_col_affacted) {
+			fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+		}
+	} else if (table->fts && cascade->is_delete) {
+		/* DICT_FOREIGN_ON_DELETE_CASCADE case */
+		for (i = 0; i < foreign->n_fields; i++) {
+			if (table->fts && dict_table_is_fts_column(
+				table->fts->indexes,
+				dict_index_get_nth_col_no(index, i))
+				!= ULINT_UNDEFINED) {
+				fts_col_affacted = TRUE;
+			}
+		}
+
+		if (fts_col_affacted) {
+			fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
 		}
 	}
 
@@ -1023,8 +1170,9 @@ row_ins_foreign_check_on_constraint(
 
 		upd_vec_heap = mem_heap_create(256);
 
-		n_to_update = row_ins_cascade_calc_update_vec(node, foreign,
-							      upd_vec_heap);
+		n_to_update = row_ins_cascade_calc_update_vec(
+			node, foreign, upd_vec_heap, trx, &fts_col_affacted);
+
 		if (n_to_update == ULINT_UNDEFINED) {
 			err = DB_ROW_IS_REFERENCED;
 
@@ -1050,6 +1198,12 @@ row_ins_foreign_check_on_constraint(
 
 			goto nonstandard_exit_func;
 		}
+
+		/* Mark the old Doc ID as deleted */
+		if (fts_col_affacted) {
+			ut_ad(table->fts);
+			fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+		}
 	}
 
 	/* Store pcur position and initialize or store the cascade node
@@ -1087,9 +1241,6 @@ row_ins_foreign_check_on_constraint(
 	release the latch. */
 
 	row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
-
-	DEBUG_SYNC_C("innodb_dml_cascade_dict_unfreeze");
-
 	row_mysql_freeze_data_dictionary(thr_get_trx(thr));
 
 	mtr_start(mtr);
@@ -1209,13 +1360,13 @@ row_ins_check_foreign_constraint(
 	dtuple_t*	entry,	/*!< in: index entry for index */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
+	ulint		err;
 	upd_node_t*	upd_node;
 	dict_table_t*	check_table;
 	dict_index_t*	check_index;
 	ulint		n_fields_cmp;
 	btr_pcur_t	pcur;
 	int		cmp;
-	ulint		err;
 	ulint		i;
 	mtr_t		mtr;
 	trx_t*		trx		= thr_get_trx(thr);
@@ -1250,7 +1401,7 @@ run_again:
 	}
 
 	if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) {
-		upd_node = thr->run_node;
+		upd_node = static_cast<upd_node_t*>(thr->run_node);
 
 		if (!(upd_node->is_delete) && upd_node->foreign == foreign) {
 			/* If a cascaded update is done as defined by a
@@ -1287,11 +1438,8 @@ run_again:
 
 			row_ins_set_detailed(trx, foreign);
 
-			mutex_enter(&dict_foreign_err_mutex);
-			rewind(ef);
-			ut_print_timestamp(ef);
-			fputs(" Transaction:\n", ef);
-			trx_print(ef, trx, 600);
+			row_ins_foreign_trx_print(trx);
+
 			fputs("Foreign key constraint fails for table ", ef);
 			ut_print_name(ef, trx, TRUE,
 				      foreign->foreign_table_name);
@@ -1488,11 +1636,11 @@ end_scan:
 
 do_possible_lock_wait:
 	if (err == DB_LOCK_WAIT) {
-		trx->error_state = err;
+		trx->error_state = static_cast<enum db_err>(err);
 
 		que_thr_stop_for_mysql(thr);
 
-		srv_suspend_mysql_thread(thr);
+		lock_wait_suspend_thread(thr);
 
 		if (trx->error_state == DB_SUCCESS) {
 
@@ -1536,10 +1684,12 @@ row_ins_check_foreign_constraints(
 
 	while (foreign) {
 		if (foreign->foreign_index == index) {
+			dict_table_t*	ref_table = NULL;
 
 			if (foreign->referenced_table == NULL) {
-				dict_table_get(foreign->referenced_table_name_lookup,
-					       FALSE);
+
+				ref_table = dict_table_open_on_name(
+					foreign->referenced_table_name_lookup, FALSE);
 			}
 
 			if (0 == trx->dict_operation_lock_mode) {
@@ -1549,12 +1699,9 @@ row_ins_check_foreign_constraints(
 			}
 
 			if (foreign->referenced_table) {
-				mutex_enter(&(dict_sys->mutex));
-
-				(foreign->referenced_table
-				 ->n_foreign_key_checks_running)++;
-
-				mutex_exit(&(dict_sys->mutex));
+				os_inc_counter(dict_sys->mutex,
+					       foreign->foreign_table
+					       ->n_foreign_key_checks_running);
 			}
 
 			/* NOTE that if the thread ends up waiting for a lock
@@ -1566,21 +1713,21 @@ row_ins_check_foreign_constraints(
 				TRUE, foreign, table, entry, thr);
 
 			if (foreign->referenced_table) {
-				mutex_enter(&(dict_sys->mutex));
-
-				ut_a(foreign->referenced_table
-				     ->n_foreign_key_checks_running > 0);
-				(foreign->referenced_table
-				 ->n_foreign_key_checks_running)--;
-
-				mutex_exit(&(dict_sys->mutex));
+				os_dec_counter(dict_sys->mutex,
+					       foreign->foreign_table
+					       ->n_foreign_key_checks_running);
 			}
 
 			if (got_s_lock) {
 				row_mysql_unfreeze_data_dictionary(trx);
 			}
 
+			if (ref_table != NULL) {
+				dict_table_close(ref_table, FALSE);
+			}
+
 			if (err != DB_SUCCESS) {
+
 				return(err);
 			}
 		}
@@ -1919,23 +2066,22 @@ func_exit:
 }
 
 /***************************************************************//**
-Checks if an index entry has long enough common prefix with an existing
-record so that the intended insert of the entry must be changed to a modify of
-the existing record. In the case of a clustered index, the prefix must be
-n_unique fields long, and in the case of a secondary index, all fields must be
-equal.
-@return 0 if no update, ROW_INS_PREV if previous should be updated;
-currently we do the search so that only the low_match record can match
-enough to the search tuple, not the next record */
+Checks if an index entry has long enough common prefix with an
+existing record so that the intended insert of the entry must be
+changed to a modify of the existing record. In the case of a clustered
+index, the prefix must be n_unique fields long. In the case of a
+secondary index, all fields must be equal.  InnoDB never updates
+secondary index records in place, other than clearing or setting the
+delete-mark flag. We could be able to update the non-unique fields
+of a unique secondary index record by checking the cursor->up_match,
+but we do not do so, because it could have some locking implications.
+@return TRUE if the existing record should be updated; FALSE if not */
 UNIV_INLINE
-ulint
-row_ins_must_modify(
-/*================*/
-	btr_cur_t*	cursor)	/*!< in: B-tree cursor */
+ibool
+row_ins_must_modify_rec(
+/*====================*/
+	const btr_cur_t*	cursor)	/*!< in: B-tree cursor */
 {
-	ulint	enough_match;
-	rec_t*	rec;
-
 	/* NOTE: (compare to the note in row_ins_duplicate_error) Because node
 	pointers on upper levels of the B-tree may match more to entry than
 	to actual user records on the leaf level, we have to check if the
@@ -1943,19 +2089,9 @@ row_ins_must_modify(
 	node pointers contain index->n_unique first fields, and in the case
 	of a secondary index, all fields of the index. */
 
-	enough_match = dict_index_get_n_unique_in_tree(cursor->index);
-
-	if (cursor->low_match >= enough_match) {
-
-		rec = btr_cur_get_rec(cursor);
-
-		if (!page_rec_is_infimum(rec)) {
-
-			return(ROW_INS_PREV);
-		}
-	}
-
-	return(0);
+	return(cursor->low_match
+	       >= dict_index_get_n_unique_in_tree(cursor->index)
+	       && !page_rec_is_infimum(btr_cur_get_rec(cursor)));
 }
 
 /***************************************************************//**
@@ -1983,7 +2119,7 @@ row_ins_index_entry_low(
 {
 	btr_cur_t	cursor;
 	ulint		search_mode;
-	ulint		modify = 0; /* remove warning */
+	ibool		modify			= FALSE;
 	rec_t*		insert_rec;
 	rec_t*		rec;
 	ulint*		offsets;
@@ -2076,20 +2212,13 @@ row_ins_index_entry_low(
 		}
 	}
 
-	modify = row_ins_must_modify(&cursor);
+	modify = row_ins_must_modify_rec(&cursor);
 
-	if (modify != 0) {
+	if (modify) {
 		/* There is already an index entry with a long enough common
 		prefix, we must convert the insert into a modify of an
 		existing record */
 
-		if (modify == ROW_INS_NEXT) {
-			rec = page_rec_get_next(btr_cur_get_rec(&cursor));
-
-			btr_cur_position(index, rec,
-					 btr_cur_get_block(&cursor),&cursor);
-		}
-
 		if (dict_index_is_clust(index)) {
 			err = row_ins_clust_index_entry_by_modify(
 				mode, &cursor, &heap, &big_rec, entry,
@@ -2126,12 +2255,16 @@ row_ins_index_entry_low(
 					rec, index, NULL,
 					ULINT_UNDEFINED, &heap);
 
-				DEBUG_SYNC_C("before_row_ins_upd_extern");
+				DEBUG_SYNC_C_IF_THD((THD*)
+					thr_get_trx(thr)->mysql_thd,
+					"before_row_ins_upd_extern");
 				err = btr_store_big_rec_extern_fields(
 					index, btr_cur_get_block(&cursor),
 					rec, offsets, big_rec, &mtr,
 					BTR_STORE_INSERT_UPDATE);
-				DEBUG_SYNC_C("after_row_ins_upd_extern");
+				DEBUG_SYNC_C_IF_THD((THD*)
+					thr_get_trx(thr)->mysql_thd,
+					"after_row_ins_upd_extern");
 				/* If writing big_rec fails (for
 				example, because of DB_OUT_OF_FILE_SPACE),
 				the record will be corrupted. Even if
@@ -2186,7 +2319,9 @@ function_exit:
 
 		mtr_start(&mtr);
 
-		DEBUG_SYNC_C("before_row_ins_extern_latch");
+		DEBUG_SYNC_C_IF_THD((THD*)
+			thr_get_trx(thr)->mysql_thd,
+			"before_row_ins_extern_latch");
 		btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
 					    BTR_MODIFY_TREE, &cursor, 0,
 					    __FILE__, __LINE__, &mtr);
@@ -2194,11 +2329,15 @@ function_exit:
 		offsets = rec_get_offsets(rec, index, NULL,
 					  ULINT_UNDEFINED, &heap);
 
-		DEBUG_SYNC_C("before_row_ins_extern");
+		DEBUG_SYNC_C_IF_THD((THD*)
+			thr_get_trx(thr)->mysql_thd,
+			"before_row_ins_extern");
 		err = btr_store_big_rec_extern_fields(
 			index, btr_cur_get_block(&cursor),
 			rec, offsets, big_rec, &mtr, BTR_STORE_INSERT);
-		DEBUG_SYNC_C("after_row_ins_extern");
+		DEBUG_SYNC_C_IF_THD((THD*)
+			thr_get_trx(thr)->mysql_thd,
+			"after_row_ins_extern");
 
 stored_big_rec:
 		if (modify) {
@@ -2299,7 +2438,9 @@ row_ins_index_entry_set_vals(
 			len = dtype_get_at_most_n_mbchars(
 				col->prtype, col->mbminmaxlen,
 				ind_field->prefix_len,
-				len, dfield_get_data(row_field));
+				len,
+				static_cast<const char*>(
+					dfield_get_data(row_field)));
 
 			ut_ad(!dfield_is_ext(row_field));
 		}
@@ -2463,17 +2604,19 @@ row_ins(
 	ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
 
 	while (node->index != NULL) {
-		err = row_ins_index_entry_step(node, thr);
+		if (node->index->type != DICT_FTS) {
+			err = row_ins_index_entry_step(node, thr);
 
-		if (err != DB_SUCCESS) {
+			if (err != DB_SUCCESS) {
 
-			return(err);
+				return(err);
+			}
 		}
 
 		node->index = dict_table_get_next_index(node->index);
 		node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry);
 
-		/* Skip corrupted secondar index and its entry */
+		/* Skip corrupted secondary index and its entry */
 		while (node->index && dict_index_is_corrupted(node->index)) {
 
 			node->index = dict_table_get_next_index(node->index);
@@ -2508,9 +2651,9 @@ row_ins_step(
 
 	trx = thr_get_trx(thr);
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
-	node = thr->run_node;
+	node = static_cast<ins_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
 
@@ -2583,7 +2726,7 @@ same_trx:
 	err = row_ins(node, thr);
 
 error_handling:
-	trx->error_state = err;
+	trx->error_state = static_cast<enum db_err>(err);
 
 	if (err != DB_SUCCESS) {
 		/* err == DB_LOCK_WAIT or SQL error detected */
diff --git a/storage/innobase/row/row0merge.c b/storage/innobase/row/row0merge.cc
index 750f9d72cf2..0acd7933c19 100644
--- a/storage/innobase/row/row0merge.c
+++ b/storage/innobase/row/row0merge.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0merge.c
+@file row/row0merge.cc
 New index creation routines using a merge sort
 
 Created 12/4/2005 Jan Lindstrom
@@ -56,6 +56,10 @@ Completed by Sunny Bains and Marko Makela
 #include "log0log.h"
 #include "ut0sort.h"
 #include "handler0alter.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "fts0priv.h"
+#include "row0ftsort.h"
 
 /* Ignore posix_fadvise() on those platforms where it does not exist */
 #if defined __WIN__
@@ -81,56 +85,24 @@ static ibool	row_merge_print_block_write;
 /* @} */
 #endif /* UNIV_DEBUG */
 
-/** @brief Block size for I/O operations in merge sort.
-
-The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
-rounded to a power of 2.
-
-When not creating a PRIMARY KEY that contains column prefixes, this
-can be set as small as UNIV_PAGE_SIZE / 2.  See the comment above
-ut_ad(data_size < sizeof(row_merge_block_t)). */
-typedef byte	row_merge_block_t[1048576];
-
-/** @brief Secondary buffer for I/O operations of merge records.
-
-This buffer is used for writing or reading a record that spans two
-row_merge_block_t.  Thus, it must be able to hold one merge record,
-whose maximum size is the same as the minimum size of
-row_merge_block_t. */
-typedef byte	mrec_buf_t[UNIV_PAGE_SIZE];
-
-/** @brief Merge record in row_merge_block_t.
-
-The format is the same as a record in ROW_FORMAT=COMPACT with the
-exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
-typedef byte	mrec_t;
-
-/** Buffer for sorting in main memory. */
-struct row_merge_buf_struct {
-	mem_heap_t*	heap;		/*!< memory heap where allocated */
-	dict_index_t*	index;		/*!< the index the tuples belong to */
-	ulint		total_size;	/*!< total amount of data bytes */
-	ulint		n_tuples;	/*!< number of data tuples */
-	ulint		max_tuples;	/*!< maximum number of data tuples */
-	const dfield_t**tuples;		/*!< array of pointers to
-					arrays of fields that form
-					the data tuples */
-	const dfield_t**tmp_tuples;	/*!< temporary copy of tuples,
-					for sorting */
-};
-
-/** Buffer for sorting in main memory. */
-typedef struct row_merge_buf_struct row_merge_buf_t;
-
-/** Information about temporary files used in merge sort */
-struct merge_file_struct {
-	int		fd;		/*!< file descriptor */
-	ulint		offset;		/*!< file offset (end of file) */
-	ib_uint64_t	n_rec;		/*!< number of records in the file */
-};
-
-/** Information about temporary files used in merge sort */
-typedef struct merge_file_struct merge_file_t;
+/* Whether to disable file system cache */
+UNIV_INTERN char        srv_disable_sort_file_cache;
+
+/********************************************************************//**
+Read sorted file containing index data tuples and insert these data
+tuples to the index
+@return	DB_SUCCESS or error number */
+static
+ulint
+row_merge_insert_index_tuples(
+/*==========================*/
+	trx_t*			trx,	/*!< in: transaction */
+	dict_index_t*		index,	/*!< in: index */
+	dict_table_t*		table,	/*!< in: new table */
+	ulint			zip_size,/*!< in: compressed page size of
+					 the old table, or 0 if uncompressed */
+	int			fd,	/*!< in: file descriptor */
+	row_merge_block_t*	block);	/*!< in/out: file buffer */
 
 #ifdef UNIV_DEBUG
 /******************************************************//**
@@ -183,15 +155,15 @@ row_merge_buf_create_low(
 	row_merge_buf_t*	buf;
 
 	ut_ad(max_tuples > 0);
-	ut_ad(max_tuples <= sizeof(row_merge_block_t));
-	ut_ad(max_tuples < buf_size);
 
-	buf = mem_heap_zalloc(heap, buf_size);
+	ut_ad(max_tuples <= srv_sort_buf_size);
+
+	buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size));
 	buf->heap = heap;
 	buf->index = index;
 	buf->max_tuples = max_tuples;
-	buf->tuples = mem_heap_alloc(heap,
-				     2 * max_tuples * sizeof *buf->tuples);
+	buf->tuples = static_cast<const dfield_t**>(
+		ut_malloc(2 * max_tuples * sizeof *buf->tuples));
 	buf->tmp_tuples = buf->tuples + max_tuples;
 
 	return(buf);
@@ -200,7 +172,7 @@ row_merge_buf_create_low(
 /******************************************************//**
 Allocate a sort buffer.
 @return	own: sort buffer */
-static
+UNIV_INTERN
 row_merge_buf_t*
 row_merge_buf_create(
 /*=================*/
@@ -211,12 +183,12 @@ row_merge_buf_create(
 	ulint			buf_size;
 	mem_heap_t*		heap;
 
-	max_tuples = sizeof(row_merge_block_t)
+	max_tuples = srv_sort_buf_size
 		/ ut_max(1, dict_index_get_min_size(index));
 
-	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
+	buf_size = (sizeof *buf);
 
-	heap = mem_heap_create(buf_size + sizeof(row_merge_block_t));
+	heap = mem_heap_create(buf_size);
 
 	buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
 
@@ -226,7 +198,7 @@ row_merge_buf_create(
 /******************************************************//**
 Empty a sort buffer.
 @return	sort buffer */
-static
+UNIV_INTERN
 row_merge_buf_t*
 row_merge_buf_empty(
 /*================*/
@@ -236,45 +208,63 @@ row_merge_buf_empty(
 	ulint		max_tuples	= buf->max_tuples;
 	mem_heap_t*	heap		= buf->heap;
 	dict_index_t*	index		= buf->index;
+	void*		tuple		= buf->tuples;
 
-	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
+	buf_size = (sizeof *buf);;
 
 	mem_heap_empty(heap);
 
-	return(row_merge_buf_create_low(heap, index, max_tuples, buf_size));
+	buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size));
+	buf->heap = heap;
+	buf->index = index;
+	buf->max_tuples = max_tuples;
+	buf->tuples = static_cast<const dfield_t**>(tuple);
+	buf->tmp_tuples = buf->tuples + max_tuples;
+
+	return(buf);
 }
 
 /******************************************************//**
 Deallocate a sort buffer. */
-static
+UNIV_INTERN
 void
 row_merge_buf_free(
 /*===============*/
 	row_merge_buf_t*	buf)	/*!< in,own: sort buffer, to be freed */
 {
+	ut_free(buf->tuples);
 	mem_heap_free(buf->heap);
 }
 
 /******************************************************//**
 Insert a data tuple into a sort buffer.
-@return	TRUE if added, FALSE if out of space */
+@return	number of rows added, 0 if out of space */
 static
-ibool
+ulint
 row_merge_buf_add(
 /*==============*/
 	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
+	dict_index_t*		fts_index,/*!< fts index to be
+					created */
+	fts_psort_t*		psort_info, /*!< in: parallel sort info */
 	const dtuple_t*		row,	/*!< in: row in clustered index */
-	const row_ext_t*	ext)	/*!< in: cache of externally stored
+	const row_ext_t*	ext,	/*!< in: cache of externally stored
 					column prefixes, or NULL */
+	doc_id_t*		doc_id)	/*!< in/out: Doc ID if we are
+					creating FTS index */
+
 {
 	ulint			i;
-	ulint			n_fields;
-	ulint			data_size;
-	ulint			extra_size;
 	const dict_index_t*	index;
 	dfield_t*		entry;
 	dfield_t*		field;
 	const dict_field_t*	ifield;
+	ulint			n_fields;
+	ulint			data_size;
+	ulint			extra_size;
+	ulint			bucket = 0;
+	doc_id_t		write_doc_id;
+	ulint			n_row_added = 0;
 
 	if (buf->n_tuples >= buf->max_tuples) {
 		return(FALSE);
@@ -282,11 +272,16 @@ row_merge_buf_add(
 
 	UNIV_PREFETCH_R(row->fields);
 
-	index = buf->index;
+	/* If we are building FTS index, buf->index points to
+	the 'fts_sort_idx', and real FTS index is stored in
+	fts_index */
+	index = (buf->index->type & DICT_FTS) ? fts_index : buf->index;
 
 	n_fields = dict_index_get_n_fields(index);
 
-	entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
+	entry = static_cast<dfield_t*>(
+		mem_heap_alloc(buf->heap, n_fields * sizeof *entry));
+
 	buf->tuples[buf->n_tuples] = entry;
 	field = entry;
 
@@ -296,21 +291,114 @@ row_merge_buf_add(
 	ifield = dict_index_get_nth_field(index, 0);
 
 	for (i = 0; i < n_fields; i++, field++, ifield++) {
+		ulint			len;
 		const dict_col_t*	col;
 		ulint			col_no;
 		const dfield_t*		row_field;
-		ulint			len;
+		ibool			col_adjusted;
 
 		col = ifield->col;
 		col_no = dict_col_get_no(col);
-		row_field = dtuple_get_nth_field(row, col_no);
-		dfield_copy(field, row_field);
+		col_adjusted = FALSE;
+
+		/* If we are creating a FTS index, a new Doc
+		ID column is being added, so we need to adjust
+		any column number positioned after this Doc ID */
+		if (*doc_id > 0
+		    && DICT_TF2_FLAG_IS_SET(index->table,
+                    			    DICT_TF2_FTS_ADD_DOC_ID)
+		    && col_no > index->table->fts->doc_col) {
+
+			ut_ad(index->table->fts);
+
+			col_no--;
+			col_adjusted = TRUE;
+		}
+
+		/* Process the Doc ID column */
+		if (*doc_id > 0
+		    && col_no == index->table->fts->doc_col
+		    && !col_adjusted) {
+			fts_write_doc_id((byte*) &write_doc_id, *doc_id);
+
+			/* Note: field->data now points to a value on the
+			stack: &write_doc_id after dfield_set_data(). Because
+			there is only one doc_id per row, it shouldn't matter.
+			We allocate a new buffer before we leave the function
+			later below. */
+
+			dfield_set_data(
+				field, &write_doc_id, sizeof(write_doc_id));
+
+			field->type.mtype = ifield->col->mtype;
+			field->type.prtype = ifield->col->prtype;
+			field->type.mbminmaxlen = DATA_MBMINMAXLEN(0, 0);
+			field->type.len = ifield->col->len;
+		} else {
+			row_field = dtuple_get_nth_field(row, col_no);
+
+			dfield_copy(field, row_field);
+
+			/* Tokenize and process data for FTS */
+			if (index->type & DICT_FTS) {
+				fts_doc_item_t*	doc_item;
+				byte*		value;
+
+				if (dfield_is_null(field)) {
+					n_row_added = 1;
+					continue;
+				}
+
+				doc_item = static_cast<fts_doc_item_t*>(
+					mem_heap_alloc(
+						buf->heap,
+						sizeof(fts_doc_item_t)));
+
+				/* fetch Doc ID if it already exists
+				in the row, and not supplied by the caller */
+				if (*doc_id == 0) {
+					const dfield_t*	doc_field;
+					doc_field = dtuple_get_nth_field(
+						row,
+						index->table->fts->doc_col);
+					*doc_id = (doc_id_t) mach_read_from_8(
+						static_cast<byte*>(
+						dfield_get_data(doc_field)));
+
+					if (*doc_id == 0) {
+						fprintf(stderr, "InnoDB FTS: "
+							"User supplied Doc ID "
+							"is zero. Record "
+							"Skipped\n");
+						return(0);
+					}
+				}
+
+				value = static_cast<byte*>(
+					ut_malloc(field->len));
+				memcpy(value, field->data, field->len);
+				field->data = value;
+
+				doc_item->field = field;
+				doc_item->doc_id = *doc_id;
+
+				bucket = *doc_id % fts_sort_pll_degree;
+
+				UT_LIST_ADD_LAST(
+					doc_list,
+					psort_info[bucket].fts_doc_list,
+					doc_item);
+				n_row_added = 1;
+				continue;
+			}
+		}
+
 		len = dfield_get_len(field);
 
 		if (dfield_is_null(field)) {
 			ut_ad(!(col->prtype & DATA_NOT_NULL));
 			continue;
-		} else if (UNIV_LIKELY(!ext)) {
+		} else if (!ext) {
 		} else if (dict_index_is_clust(index)) {
 			/* Flag externally stored fields. */
 			const byte*	buf = row_ext_lookup(ext, col_no,
@@ -340,7 +428,8 @@ row_merge_buf_add(
 				col->prtype,
 				col->mbminmaxlen,
 				ifield->prefix_len,
-				len, dfield_get_data(field));
+				len,
+				static_cast<char*>(dfield_get_data(field)));
 			dfield_set_len(field, len);
 		}
 
@@ -364,6 +453,12 @@ row_merge_buf_add(
 		data_size += len;
 	}
 
+	/* If this is FTS index, we already populated the sort buffer, return
+	here */
+	if (index->type & DICT_FTS) {
+		return(n_row_added);
+	}
+
 #ifdef UNIV_DEBUG
 	{
 		ulint	size;
@@ -390,15 +485,16 @@ row_merge_buf_add(
 	page_zip_rec_needs_ext() limit.  However, no further columns
 	will be moved to external storage until the record is inserted
 	to the clustered index B-tree. */
-	ut_ad(data_size < sizeof(row_merge_block_t));
+	ut_ad(data_size < srv_sort_buf_size);
 
 	/* Reserve one byte for the end marker of row_merge_block_t. */
-	if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) {
-		return(FALSE);
+	if (buf->total_size + data_size >= srv_sort_buf_size - 1) {
+		return(0);
 	}
 
 	buf->total_size += data_size;
 	buf->n_tuples++;
+	n_row_added++;
 
 	field = entry;
 
@@ -408,19 +504,9 @@ row_merge_buf_add(
 		dfield_dup(field++, buf->heap);
 	} while (--n_fields);
 
-	return(TRUE);
+	return(n_row_added);
 }
 
-/** Structure for reporting duplicate records. */
-struct row_merge_dup_struct {
-	const dict_index_t*	index;		/*!< index being sorted */
-	struct TABLE*		table;		/*!< MySQL table object */
-	ulint			n_dup;		/*!< number of duplicates */
-};
-
-/** Structure for reporting duplicate records. */
-typedef struct row_merge_dup_struct row_merge_dup_t;
-
 /*************************************************************//**
 Report a duplicate key. */
 static
@@ -451,7 +537,7 @@ row_merge_dup_report(
 			       * sizeof *offsets
 			       + sizeof *buf);
 
-	buf = mem_heap_alloc(heap, sizeof *buf);
+	buf = static_cast<mrec_buf_t*>(mem_heap_alloc(heap, sizeof *buf));
 
 	tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
 	n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
@@ -541,7 +627,7 @@ row_merge_tuple_sort(
 
 /******************************************************//**
 Sort a buffer. */
-static
+UNIV_INTERN
 void
 row_merge_buf_sort(
 /*===============*/
@@ -554,22 +640,18 @@ row_merge_buf_sort(
 
 /******************************************************//**
 Write a buffer to a block. */
-static
+UNIV_INTERN
 void
 row_merge_buf_write(
 /*================*/
 	const row_merge_buf_t*	buf,	/*!< in: sorted buffer */
-#ifdef UNIV_DEBUG
-	const merge_file_t*	of,	/*!< in: output file */
-#endif /* UNIV_DEBUG */
+	const merge_file_t*	of UNIV_UNUSED,
+					/*!< in: output file */
 	row_merge_block_t*	block)	/*!< out: buffer for writing to file */
-#ifndef UNIV_DEBUG
-# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block)
-#endif /* !UNIV_DEBUG */
 {
 	const dict_index_t*	index	= buf->index;
 	ulint			n_fields= dict_index_get_n_fields(index);
-	byte*			b	= &(*block)[0];
+	byte*			b	= &block[0];
 
 	ulint		i;
 
@@ -596,7 +678,7 @@ row_merge_buf_write(
 			*b++ = (byte) (extra_size + 1);
 		}
 
-		ut_ad(b + size < block[1]);
+		ut_ad(b + size < &block[srv_sort_buf_size]);
 
 		rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index,
 					       REC_STATUS_ORDINARY,
@@ -615,13 +697,13 @@ row_merge_buf_write(
 	}
 
 	/* Write an "end-of-chunk" marker. */
-	ut_a(b < block[1]);
-	ut_a(b == block[0] + buf->total_size);
+	ut_a(b < &block[srv_sort_buf_size]);
+	ut_a(b == &block[0] + buf->total_size);
 	*b++ = 0;
 #ifdef UNIV_DEBUG_VALGRIND
 	/* The rest of the block is uninitialized.  Initialize it
 	to avoid bogus warnings. */
-	memset(b, 0xff, block[1] - b);
+	memset(b, 0xff, &block[srv_sort_buf_size] - b);
 #endif /* UNIV_DEBUG_VALGRIND */
 #ifdef UNIV_DEBUG
 	if (row_merge_print_write) {
@@ -649,9 +731,12 @@ row_merge_heap_create(
 	mem_heap_t*	heap	= mem_heap_create(2 * i * sizeof **offsets1
 						  + 3 * sizeof **buf);
 
-	*buf = mem_heap_alloc(heap, 3 * sizeof **buf);
-	*offsets1 = mem_heap_alloc(heap, i * sizeof **offsets1);
-	*offsets2 = mem_heap_alloc(heap, i * sizeof **offsets2);
+	*buf = static_cast<mrec_buf_t*>(
+		mem_heap_alloc(heap, 3 * sizeof **buf));
+	*offsets1 = static_cast<ulint*>(
+		mem_heap_alloc(heap, i * sizeof **offsets1));
+	*offsets2 = static_cast<ulint*>(
+		mem_heap_alloc(heap, i * sizeof **offsets2));
 
 	(*offsets1)[0] = (*offsets2)[0] = i;
 	(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
@@ -674,7 +759,8 @@ row_merge_dict_table_get_index(
 	dict_index_t*	index;
 	const char**	column_names;
 
-	column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
+	column_names = static_cast<const char**>(
+		mem_alloc(index_def->n_fields * sizeof *column_names));
 
 	for (i = 0; i < index_def->n_fields; ++i) {
 		column_names[i] = index_def->fields[i].field_name;
@@ -691,7 +777,7 @@ row_merge_dict_table_get_index(
 /********************************************************************//**
 Read a merge block from the file system.
 @return	TRUE if request was successful, FALSE if fail */
-static
+UNIV_INTERN
 ibool
 row_merge_read(
 /*===========*/
@@ -701,7 +787,7 @@ row_merge_read(
 					elements */
 	row_merge_block_t*	buf)	/*!< out: data */
 {
-	ib_uint64_t	ofs = ((ib_uint64_t) offset) * sizeof *buf;
+	os_offset_t	ofs = ((os_offset_t) offset) * srv_sort_buf_size;
 	ibool		success;
 
 #ifdef UNIV_DEBUG
@@ -711,19 +797,25 @@ row_merge_read(
 	}
 #endif /* UNIV_DEBUG */
 
+#ifdef UNIV_DEBUG
+	if (row_merge_print_block_read) {
+		fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n",
+			fd, (ulong) offset);
+	}
+#endif /* UNIV_DEBUG */
+
 	success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
-						 (ulint) (ofs & 0xFFFFFFFF),
-						 (ulint) (ofs >> 32),
-						 sizeof *buf);
+						 ofs, srv_sort_buf_size);
 #ifdef POSIX_FADV_DONTNEED
 	/* Each block is read exactly once.  Free up the file cache. */
-	posix_fadvise(fd, ofs, sizeof *buf, POSIX_FADV_DONTNEED);
+	posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
 #endif /* POSIX_FADV_DONTNEED */
 
 	if (UNIV_UNLIKELY(!success)) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: failed to read merge block at %llu\n", ofs);
+			"  InnoDB: failed to read merge block at "UINT64PF"\n",
+			ofs);
 	}
 
 	return(UNIV_LIKELY(success));
@@ -732,7 +824,7 @@ row_merge_read(
 /********************************************************************//**
 Write a merge block to the file system.
 @return	TRUE if request was successful, FALSE if fail */
-static
+UNIV_INTERN
 ibool
 row_merge_write(
 /*============*/
@@ -741,14 +833,11 @@ row_merge_write(
 				in number of row_merge_block_t elements */
 	const void*	buf)	/*!< in: data */
 {
-	size_t		buf_len = sizeof(row_merge_block_t);
-	ib_uint64_t	ofs = buf_len * (ib_uint64_t) offset;
+	size_t		buf_len = srv_sort_buf_size;
+	os_offset_t	ofs = buf_len * (os_offset_t) offset;
 	ibool		ret;
 
-	ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
-			    (ulint) (ofs & 0xFFFFFFFF),
-			    (ulint) (ofs >> 32),
-			    buf_len);
+	ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf, ofs, buf_len);
 
 #ifdef UNIV_DEBUG
 	if (row_merge_print_block_write) {
@@ -769,7 +858,7 @@ row_merge_write(
 /********************************************************************//**
 Read a merge record.
 @return	pointer to next record, or NULL on I/O error or end of list */
-static __attribute__((nonnull))
+UNIV_INTERN __attribute__((nonnull))
 const byte*
 row_merge_read_rec(
 /*===============*/
@@ -790,8 +879,8 @@ row_merge_read_rec(
 
 	ut_ad(block);
 	ut_ad(buf);
-	ut_ad(b >= block[0]);
-	ut_ad(b < block[1]);
+	ut_ad(b >= &block[0]);
+	ut_ad(b < &block[srv_sort_buf_size]);
 	ut_ad(index);
 	ut_ad(foffs);
 	ut_ad(mrec);
@@ -818,7 +907,7 @@ row_merge_read_rec(
 	if (extra_size >= 0x80) {
 		/* Read another byte of extra_size. */
 
-		if (UNIV_UNLIKELY(b >= block[1])) {
+		if (UNIV_UNLIKELY(b >= &block[srv_sort_buf_size])) {
 			if (!row_merge_read(fd, ++(*foffs), block)) {
 err_exit:
 				/* Signal I/O error. */
@@ -827,7 +916,7 @@ err_exit:
 			}
 
 			/* Wrap around to the beginning of the buffer. */
-			b = block[0];
+			b = &block[0];
 		}
 
 		extra_size = (extra_size & 0x7f) << 8;
@@ -839,12 +928,12 @@ err_exit:
 
 	/* Read the extra bytes. */
 
-	if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
+	if (UNIV_UNLIKELY(b + extra_size >= &block[srv_sort_buf_size])) {
 		/* The record spans two blocks.  Copy the entire record
 		to the auxiliary buffer and handle this as a special
 		case. */
 
-		avail_size = block[1] - b;
+		avail_size = &block[srv_sort_buf_size] - b;
 
 		memcpy(*buf, b, avail_size);
 
@@ -854,7 +943,7 @@ err_exit:
 		}
 
 		/* Wrap around to the beginning of the buffer. */
-		b = block[0];
+		b = &block[0];
 
 		/* Copy the record. */
 		memcpy(*buf + avail_size, b, extra_size - avail_size);
@@ -870,7 +959,7 @@ err_exit:
 		records are much smaller than either buffer, and
 		the record starts near the beginning of each buffer. */
 		ut_a(extra_size + data_size < sizeof *buf);
-		ut_a(b + data_size < block[1]);
+		ut_a(b + data_size < &block[srv_sort_buf_size]);
 
 		/* Copy the data bytes. */
 		memcpy(*buf + extra_size, b, data_size);
@@ -888,7 +977,7 @@ err_exit:
 
 	b += extra_size + data_size;
 
-	if (UNIV_LIKELY(b < block[1])) {
+	if (UNIV_LIKELY(b < &block[srv_sort_buf_size])) {
 		/* The record fits entirely in the block.
 		This is the normal case. */
 		goto func_exit;
@@ -897,7 +986,7 @@ err_exit:
 	/* The record spans two blocks.  Copy it to buf. */
 
 	b -= extra_size + data_size;
-	avail_size = block[1] - b;
+	avail_size = &block[srv_sort_buf_size] - b;
 	memcpy(*buf, b, avail_size);
 	*mrec = *buf + extra_size;
 #ifdef UNIV_DEBUG
@@ -915,7 +1004,7 @@ err_exit:
 	}
 
 	/* Wrap around to the beginning of the buffer. */
-	b = block[0];
+	b = &block[0];
 
 	/* Copy the rest of the record. */
 	memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
@@ -999,11 +1088,11 @@ row_merge_write_rec(
 
 	ut_ad(block);
 	ut_ad(buf);
-	ut_ad(b >= block[0]);
-	ut_ad(b < block[1]);
+	ut_ad(b >= &block[0]);
+	ut_ad(b < &block[srv_sort_buf_size]);
 	ut_ad(mrec);
 	ut_ad(foffs);
-	ut_ad(mrec < block[0] || mrec > block[1]);
+	ut_ad(mrec < &block[0] || mrec > &block[srv_sort_buf_size]);
 	ut_ad(mrec < buf[0] || mrec > buf[1]);
 
 	/* Normalize extra_size.  Value 0 signals "end of list". */
@@ -1012,10 +1101,10 @@ row_merge_write_rec(
 	size = extra_size + (extra_size >= 0x80)
 		+ rec_offs_data_size(offsets);
 
-	if (UNIV_UNLIKELY(b + size >= block[1])) {
+	if (UNIV_UNLIKELY(b + size >= &block[srv_sort_buf_size])) {
 		/* The record spans two blocks.
 		Copy it to the temporary buffer first. */
-		avail_size = block[1] - b;
+		avail_size = &block[srv_sort_buf_size] - b;
 
 		row_merge_write_rec_low(buf[0],
 					extra_size, size, fd, *foffs,
@@ -1030,10 +1119,10 @@ row_merge_write_rec(
 			return(NULL);
 		}
 
-		UNIV_MEM_INVALID(block[0], sizeof block[0]);
+		UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
 
 		/* Copy the rest. */
-		b = block[0];
+		b = &block[0];
 		memcpy(b, buf[0] + avail_size, size - avail_size);
 		b += size - avail_size;
 	} else {
@@ -1058,8 +1147,8 @@ row_merge_write_eof(
 	ulint*			foffs)	/*!< in/out: file offset */
 {
 	ut_ad(block);
-	ut_ad(b >= block[0]);
-	ut_ad(b < block[1]);
+	ut_ad(b >= &block[0]);
+	ut_ad(b < &block[srv_sort_buf_size]);
 	ut_ad(foffs);
 #ifdef UNIV_DEBUG
 	if (row_merge_print_write) {
@@ -1069,26 +1158,26 @@ row_merge_write_eof(
 #endif /* UNIV_DEBUG */
 
 	*b++ = 0;
-	UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
-	UNIV_MEM_ASSERT_W(block[0], sizeof block[0]);
+	UNIV_MEM_ASSERT_RW(&block[0], b - &block[0]);
+	UNIV_MEM_ASSERT_W(&block[0], srv_sort_buf_size);
 #ifdef UNIV_DEBUG_VALGRIND
 	/* The rest of the block is uninitialized.  Initialize it
 	to avoid bogus warnings. */
-	memset(b, 0xff, block[1] - b);
+	memset(b, 0xff, &block[srv_sort_buf_size] - b);
 #endif /* UNIV_DEBUG_VALGRIND */
 
 	if (!row_merge_write(fd, (*foffs)++, block)) {
 		return(NULL);
 	}
 
-	UNIV_MEM_INVALID(block[0], sizeof block[0]);
-	return(block[0]);
+	UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
+	return(&block[0]);
 }
 
 /*************************************************************//**
 Compare two merge records.
 @return	1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */
-static
+UNIV_INTERN
 int
 row_merge_cmp(
 /*==========*/
@@ -1119,7 +1208,6 @@ row_merge_cmp(
 
 	return(cmp);
 }
-
 /********************************************************************//**
 Reads clustered index of the table and create temporary files
 containing the index entries for the indexes to be built.
@@ -1137,6 +1225,9 @@ row_merge_read_clustered_index(
 					created; identical to old_table
 					unless creating a PRIMARY KEY */
 	dict_index_t**		index,	/*!< in: indexes to be created */
+	dict_index_t*		fts_sort_idx,
+					/*!< in: indexes to be created */
+	fts_psort_t*		psort_info, /*!< in: parallel sort info */
 	merge_file_t*		files,	/*!< in: temporary files */
 	ulint			n_index,/*!< in: number of indexes to create */
 	row_merge_block_t*	block)	/*!< in/out: file buffer */
@@ -1153,6 +1244,13 @@ row_merge_read_clustered_index(
 	ulint			n_nonnull = 0;	/* number of columns
 						changed to NOT NULL */
 	ulint*			nonnull = NULL;	/* NOT NULL columns */
+	dict_index_t*		fts_index = NULL;/* FTS index */
+	doc_id_t		doc_id = 0;
+	doc_id_t		max_doc_id = 0;
+	ibool			add_doc_id = FALSE;
+	os_event_t		fts_parallel_sort_event = NULL;
+	ibool			fts_pll_sort = FALSE;
+	ib_int64_t		sig_count = 0;
 
 	trx->op_info = "reading clustered index";
 
@@ -1162,12 +1260,46 @@ row_merge_read_clustered_index(
 	ut_ad(index);
 	ut_ad(files);
 
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n");
+#endif
+
 	/* Create and initialize memory for record buffers */
 
-	merge_buf = mem_alloc(n_index * sizeof *merge_buf);
+	merge_buf = static_cast<row_merge_buf_t**>(
+		mem_alloc(n_index * sizeof *merge_buf));
+
 
 	for (i = 0; i < n_index; i++) {
-		merge_buf[i] = row_merge_buf_create(index[i]);
+		if (index[i]->type & DICT_FTS) {
+
+			/* We are building a FT index, make sure
+			we have the temporary 'fts_sort_idx' */
+			ut_a(fts_sort_idx);
+
+			fts_index = index[i];
+
+			merge_buf[i] = row_merge_buf_create(fts_sort_idx);
+
+			add_doc_id = DICT_TF2_FLAG_IS_SET(
+				old_table, DICT_TF2_FTS_ADD_DOC_ID);
+
+			/* If Doc ID does not exist in the table itself,
+			fetch the first FTS Doc ID */
+			if (add_doc_id) {
+				fts_get_next_doc_id(
+					(dict_table_t*) new_table,
+					 &doc_id);
+				ut_ad(doc_id > 0);
+			}
+
+			fts_pll_sort = TRUE;
+			row_fts_start_psort(psort_info);
+			fts_parallel_sort_event =
+				 psort_info[0].psort_common->sort_event;
+		} else {
+			merge_buf[i] = row_merge_buf_create(index[i]);
+		}
 	}
 
 	mtr_start(&mtr);
@@ -1189,9 +1321,12 @@ row_merge_read_clustered_index(
 		(old) clustered index do not violate the added NOT
 		NULL constraints. */
 
-		ut_a(n_cols == dict_table_get_n_cols(new_table));
+		if (!fts_sort_idx) {
+			ut_a(n_cols == dict_table_get_n_cols(new_table));
+		}
 
-		nonnull = mem_alloc(n_cols * sizeof *nonnull);
+		nonnull = static_cast<ulint*>(
+			mem_alloc(n_cols * sizeof *nonnull));
 
 		for (i = 0; i < n_cols; i++) {
 			if (dict_table_get_nth_col(old_table, i)->prtype
@@ -1230,9 +1365,9 @@ row_merge_read_clustered_index(
 
 		if (btr_pcur_is_after_last_on_page(&pcur)) {
 			if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
-				i = 0;
 				err = DB_INTERRUPTED;
-				goto err_exit;
+				trx->error_key_num = 0;
+				goto func_exit;
 			}
 
 			btr_pcur_store_position(&pcur, &mtr);
@@ -1274,8 +1409,8 @@ row_merge_read_clustered_index(
 
 					if (dfield_is_null(field)) {
 						err = DB_PRIMARY_KEY_IS_NULL;
-						i = 0;
-						goto err_exit;
+						trx->error_key_num = 0;
+						goto func_exit;
 					}
 
 					field_type->prtype |= DATA_NOT_NULL;
@@ -1283,6 +1418,13 @@ row_merge_read_clustered_index(
 			}
 		}
 
+		/* Get the next Doc ID */
+		if (add_doc_id) {
+			doc_id++;
+		} else {
+			doc_id = 0;
+		}
+
 		/* Build all entries for all the indexes to be created
 		in a single scan of the clustered index. */
 
@@ -1290,10 +1432,26 @@ row_merge_read_clustered_index(
 			row_merge_buf_t*	buf	= merge_buf[i];
 			merge_file_t*		file	= &files[i];
 			const dict_index_t*	index	= buf->index;
+			ulint			rows_added = 0;
 
 			if (UNIV_LIKELY
-			    (row && row_merge_buf_add(buf, row, ext))) {
-				file->n_rec++;
+			    (row && (rows_added = row_merge_buf_add(
+				buf, fts_index, psort_info,
+				row, ext, &doc_id)))) {
+
+				/* If we are creating FTS index,
+				a single row can generate more
+				records for tokenized word */
+				file->n_rec += rows_added;
+				if (doc_id > max_doc_id) {
+					max_doc_id = doc_id;
+				}
+
+				continue;
+			}
+
+			if ((!row || !doc_id)
+			    && index->type & DICT_FTS) {
 				continue;
 			}
 
@@ -1315,7 +1473,6 @@ row_merge_read_clustered_index(
 
 					if (dup.n_dup) {
 						err = DB_DUPLICATE_KEY;
-err_exit:
 						trx->error_key_num = i;
 						goto func_exit;
 					}
@@ -1329,10 +1486,11 @@ err_exit:
 			if (!row_merge_write(file->fd, file->offset++,
 					     block)) {
 				err = DB_OUT_OF_FILE_SPACE;
-				goto err_exit;
+				trx->error_key_num = i;
+				goto func_exit;
 			}
 
-			UNIV_MEM_INVALID(block[0], sizeof block[0]);
+			UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
 			merge_buf[i] = row_merge_buf_empty(buf);
 
 			if (UNIV_LIKELY(row != NULL)) {
@@ -1341,13 +1499,19 @@ err_exit:
 				and emptied. */
 
 				if (UNIV_UNLIKELY
-				    (!row_merge_buf_add(buf, row, ext))) {
+				    (!(rows_added = row_merge_buf_add(
+					buf, fts_index, psort_info, row,
+					ext, &doc_id)))) {
 					/* An empty buffer should have enough
-					room for at least one record. */
+					room for at least one record.
+					TODO: for FTS index building, we'll
+					need to prepared for coping with very
+					large text/blob data in a single row
+					that could fill up the merge file */
 					ut_error;
 				}
 
-				file->n_rec++;
+				file->n_rec += rows_added;
 			}
 		}
 
@@ -1359,6 +1523,30 @@ err_exit:
 	}
 
 func_exit:
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n");
+#endif
+	if (fts_pll_sort) {
+		for (i = 0; i < fts_sort_pll_degree; i++) {
+			psort_info[i].state = FTS_PARENT_COMPLETE;
+		}
+wait_again:
+		os_event_wait_time_low(fts_parallel_sort_event,
+				       1000000, sig_count);
+
+		for (i = 0; i < fts_sort_pll_degree; i++) {
+			if (psort_info[i].child_status != FTS_CHILD_COMPLETE) {
+				sig_count = os_event_reset(
+					fts_parallel_sort_event);
+				goto wait_again;
+			}
+		}
+	}
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+	DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Tokenization\n");
+#endif
+
 	btr_pcur_close(&pcur);
 	mtr_commit(&mtr);
 	mem_heap_free(row_heap);
@@ -1367,29 +1555,44 @@ func_exit:
 		mem_free(nonnull);
 	}
 
+
 	for (i = 0; i < n_index; i++) {
 		row_merge_buf_free(merge_buf[i]);
 	}
 
+	row_fts_free_pll_merge_buf(psort_info);
+
 	mem_free(merge_buf);
 
+	/* Update the next Doc ID we used. Table should be locked, so
+	no concurrent DML */
+	if (max_doc_id) {
+		fts_update_next_doc_id(new_table, old_table->name, max_doc_id);
+	}
+
 	trx->op_info = "";
 
 	return(err);
 }
 
 /** Write a record via buffer 2 and read the next record to buffer N.
+@param M	FTS merge info structure
+@param N	index into array of merge info structure
+@param INDEX	the FTS index */
+
+
+/** Write a record via buffer 2 and read the next record to buffer N.
 @param N	number of the buffer (0 or 1)
 @param AT_END	statement to execute at end of input */
 #define ROW_MERGE_WRITE_GET_NEXT(N, AT_END)				\
 	do {								\
-		b2 = row_merge_write_rec(&block[2], &buf[2], b2,	\
+		b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], &buf[2], b2,	\
 					 of->fd, &of->offset,		\
 					 mrec##N, offsets##N);		\
 		if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) {	\
 			goto corrupt;					\
 		}							\
-		b##N = row_merge_read_rec(&block[N], &buf[N],		\
+		b##N = row_merge_read_rec(&block[N * srv_sort_buf_size], &buf[N],		\
 					  b##N, index,			\
 					  file->fd, foffs##N,		\
 					  &mrec##N, offsets##N);	\
@@ -1450,19 +1653,19 @@ row_merge_blocks(
 	file in two halves, which can be merged on the following pass. */
 
 	if (!row_merge_read(file->fd, *foffs0, &block[0])
-	    || !row_merge_read(file->fd, *foffs1, &block[1])) {
+	    || !row_merge_read(file->fd, *foffs1, &block[srv_sort_buf_size])) {
 corrupt:
 		mem_heap_free(heap);
 		return(DB_CORRUPTION);
 	}
 
-	b0 = block[0];
-	b1 = block[1];
-	b2 = block[2];
+	b0 = &block[0];
+	b1 = &block[srv_sort_buf_size];
+	b2 = &block[2 * srv_sort_buf_size];
 
 	b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
 				foffs0, &mrec0, offsets0);
-	b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd,
+	b1 = row_merge_read_rec(&block[srv_sort_buf_size], &buf[srv_sort_buf_size], b1, index, file->fd,
 				foffs1, &mrec1, offsets1);
 	if (UNIV_UNLIKELY(!b0 && mrec0)
 	    || UNIV_UNLIKELY(!b1 && mrec1)) {
@@ -1513,7 +1716,7 @@ done0:
 done1:
 
 	mem_heap_free(heap);
-	b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset);
+	b2 = row_merge_write_eof(&block[2 * srv_sort_buf_size], b2, of->fd, &of->offset);
 	return(b2 ? DB_SUCCESS : DB_CORRUPTION);
 }
 
@@ -1561,8 +1764,9 @@ corrupt:
 		return(FALSE);
 	}
 
-	b0 = block[0];
-	b2 = block[2];
+	b0 = &block[0];
+
+	b2 = &block[2 * srv_sort_buf_size];
 
 	b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
 				foffs0, &mrec0, offsets0);
@@ -1584,7 +1788,7 @@ done0:
 	(*foffs0)++;
 
 	mem_heap_free(heap);
-	return(row_merge_write_eof(&block[2], b2, of->fd, &of->offset)
+	return(row_merge_write_eof(&block[2 * srv_sort_buf_size], b2, of->fd, &of->offset)
 	       != NULL);
 }
 
@@ -1619,8 +1823,7 @@ row_merge(
 	ulint		n_run	= 0;
 				/*!< num of runs generated from this merge */
 
-
-	UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]);
+	UNIV_MEM_ASSERT_W(&block[0], 3 * srv_sort_buf_size);
 
 	ut_ad(ihalf < file->offset);
 
@@ -1714,7 +1917,7 @@ row_merge(
 	*tmpfd = file->fd;
 	*file = of;
 
-	UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);
+	UNIV_MEM_INVALID(&block[0], 3 * srv_sort_buf_size);
 
 	return(DB_SUCCESS);
 }
@@ -1722,7 +1925,7 @@ row_merge(
 /*************************************************************//**
 Merge disk files.
 @return	DB_SUCCESS or error code */
-static
+UNIV_INTERN
 ulint
 row_merge_sort(
 /*===========*/
@@ -1849,6 +2052,8 @@ row_merge_insert_index_tuples(
 	ut_ad(index);
 	ut_ad(table);
 
+	ut_ad(!(index->type & DICT_FTS));
+
 	/* We use the insert query graph as the dummy graph
 	needed in the row module call */
 
@@ -1866,17 +2071,23 @@ row_merge_insert_index_tuples(
 	{
 		ulint i	= 1 + REC_OFFS_HEADER_SIZE
 			+ dict_index_get_n_fields(index);
-		offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
+
+		offsets = static_cast<ulint*>(
+			mem_heap_alloc(graph_heap, i * sizeof *offsets));
+
 		offsets[0] = i;
 		offsets[1] = dict_index_get_n_fields(index);
 	}
 
-	b = *block;
+	b = block;
 
 	if (!row_merge_read(fd, foffs, block)) {
 		error = DB_CORRUPTION;
 	} else {
-		mrec_buf_t*	buf = mem_heap_alloc(graph_heap, sizeof *buf);
+		mrec_buf_t*	buf;
+
+		buf = static_cast<mrec_buf_t*>(
+			mem_heap_alloc(graph_heap, sizeof *buf));
 
 		for (;;) {
 			const mrec_t*	mrec;
@@ -1920,7 +2131,10 @@ row_merge_insert_index_tuples(
 				}
 
 				thr->lock_state = QUE_THR_LOCK_ROW;
-				trx->error_state = error;
+
+				trx->error_state = static_cast<enum db_err>(
+					error);
+
 				que_thr_stop_for_mysql(thr);
 				thr->lock_state = QUE_THR_LOCK_NOLOCK;
 			} while (row_mysql_handle_errors(&error, trx,
@@ -1973,7 +2187,10 @@ row_merge_lock_table(
 	/* We use the select query graph as the dummy graph needed
 	in the lock module call */
 
-	thr = que_fork_get_first_thr(que_node_get_parent(thr));
+	thr = static_cast<que_thr_t*>(
+		que_fork_get_first_thr(
+			static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
 	que_thr_move_to_run_state_for_mysql(thr, trx);
 
 run_again:
@@ -1982,7 +2199,7 @@ run_again:
 
 	err = lock_table(0, table, mode, thr);
 
-	trx->error_state = err;
+	trx->error_state =static_cast<enum db_err>( err);
 
 	if (UNIV_LIKELY(err == DB_SUCCESS)) {
 		que_thr_stop_for_mysql_no_error(thr, trx);
@@ -2003,7 +2220,9 @@ run_again:
 			que_node_t*	parent;
 
 			parent = que_node_get_parent(thr);
-			run_thr = que_fork_start_command(parent);
+
+			run_thr = que_fork_start_command(
+				static_cast<que_fork_t*>(parent));
 
 			ut_a(run_thr == thr);
 
@@ -2033,7 +2252,7 @@ row_merge_drop_index(
 	dict_table_t*	table,	/*!< in: table */
 	trx_t*		trx)	/*!< in: transaction handle */
 {
-	ulint		err;
+	db_err		err;
 	pars_info_t*	info = pars_info_create();
 
 	/* We use the private SQL parser of Innobase to generate the
@@ -2060,15 +2279,35 @@ row_merge_drop_index(
 
 	pars_info_add_ull_literal(info, "indexid", index->id);
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 	trx->op_info = "dropping index";
 
 	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
 
-	err = que_eval_sql(info, sql, FALSE, trx);
+	err = static_cast<db_err>(que_eval_sql(info, sql, FALSE, trx));
 
+	DBUG_EXECUTE_IF(
+		"ib_drop_index_too_many_concurrent_trxs",
+		err = DB_TOO_MANY_CONCURRENT_TRXS;
+		trx->error_state = err;);
 
-	if (err != DB_SUCCESS) {
+	if (err == DB_SUCCESS) {
+
+		/* If it is FTS index, drop from table->fts and also drop
+		its auxiliary tables */
+		if (index->type & DICT_FTS) {
+			ut_a(table->fts);
+			fts_drop_index(table, index, trx);
+		}
+
+		/* Replace this index with another equivalent index for all
+		foreign key constraints on this table where this index is
+		used */
+
+		dict_table_replace_index_in_foreign_list(table, index, trx);
+		dict_index_remove_from_cache(table, index);
+
+	} else {
 		/* Even though we ensure that DDL transactions are WAIT
 		and DEADLOCK free, we could encounter other errors e.g.,
 		DB_TOO_MANY_TRANSACTIONS. */
@@ -2079,12 +2318,6 @@ row_merge_drop_index(
 			"with error code: %lu.\n", (ulint) err);
 	}
 
-	/* Replace this index with another equivalent index for all
-	foreign key constraints on this table where this index is used */
-
-	dict_table_replace_index_in_foreign_list(table, index, trx);
-	dict_index_remove_from_cache(table, index);
-
 	trx->op_info = "";
 }
 
@@ -2148,8 +2381,8 @@ row_merge_drop_temp_indexes(void)
 		}
 
 		rec = btr_pcur_get_rec(&pcur);
-		field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_NAME_FIELD,
-					      &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_INDEXES__NAME, &len);
 		if (len == UNIV_SQL_NULL || len == 0
 		    || (char) *field != TEMP_INDEX_PREFIX) {
 			continue;
@@ -2157,7 +2390,8 @@ row_merge_drop_temp_indexes(void)
 
 		/* This is a temporary index. */
 
-		field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len);
 		if (len != 8) {
 			/* Corrupted TABLE_ID */
 			continue;
@@ -2168,7 +2402,7 @@ row_merge_drop_temp_indexes(void)
 		btr_pcur_store_position(&pcur, &mtr);
 		btr_pcur_commit_specify_mtr(&pcur, &mtr);
 
-		table = dict_table_get_on_id_low(table_id);
+		table = dict_table_open_on_id(table_id, TRUE);
 
 		if (table) {
 			dict_index_t*	index;
@@ -2184,11 +2418,12 @@ row_merge_drop_temp_indexes(void)
 					trx_commit_for_mysql(trx);
 				}
 			}
+
+			dict_table_close(table, TRUE);
 		}
 
 		mtr_start(&mtr);
-		btr_pcur_restore_position(BTR_SEARCH_LEAF,
-					  &pcur, &mtr);
+		btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
 	}
 
 	btr_pcur_close(&pcur);
@@ -2198,7 +2433,7 @@ row_merge_drop_temp_indexes(void)
 }
 
 /*********************************************************************//**
-Creates temperary merge files, and if UNIV_PFS_IO defined, register
+Creates temporary merge files, and if UNIV_PFS_IO defined, register
 the file descriptor with Performance Schema.
 @return File descriptor */
 UNIV_INLINE
@@ -2224,15 +2459,19 @@ row_merge_file_create_low(void)
 #endif
 	return(fd);
 }
+
 /*********************************************************************//**
 Create a merge file. */
-static
+UNIV_INTERN
 void
 row_merge_file_create(
 /*==================*/
 	merge_file_t*	merge_file)	/*!< out: merge file structure */
 {
 	merge_file->fd = row_merge_file_create_low();
+	if (srv_disable_sort_file_cache) {
+		os_file_set_nocache(merge_file->fd, "row0merge.c", "sort");
+	}
 	merge_file->offset = 0;
 	merge_file->n_rec = 0;
 }
@@ -2260,7 +2499,7 @@ row_merge_file_destroy_low(
 }
 /*********************************************************************//**
 Destroy a merge file. */
-static
+UNIV_INTERN
 void
 row_merge_file_destroy(
 /*===================*/
@@ -2327,13 +2566,19 @@ row_merge_create_temporary_table(
 	ulint		n_cols = dict_table_get_n_user_cols(table);
 	ulint		error;
 	mem_heap_t*	heap = mem_heap_create(1000);
+	ulint		num_col;
 
 	ut_ad(table_name);
 	ut_ad(index_def);
 	ut_ad(table);
 	ut_ad(mutex_own(&dict_sys->mutex));
 
-	new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags);
+	num_col = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)
+			? n_cols + 1
+			: n_cols;
+
+	new_table = dict_mem_table_create(
+		table_name, 0, num_col, table->flags, table->flags2);
 
 	for (i = 0; i < n_cols; i++) {
 		const dict_col_t*	col;
@@ -2348,12 +2593,28 @@ row_merge_create_temporary_table(
 				       col->len);
 	}
 
+	/* Add the FTS doc_id hidden column */
+	if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		fts_add_doc_id_column(new_table);
+		new_table->fts->doc_col = n_cols;
+	}
+
 	error = row_create_table_for_mysql(new_table, trx);
 	mem_heap_free(heap);
 
 	if (error != DB_SUCCESS) {
-		trx->error_state = error;
+		trx->error_state = static_cast<enum db_err>(error);
 		new_table = NULL;
+	} else {
+		dict_table_t*	temp_table;
+
+		/* We need to bump up the table ref count and before we can
+		use it we need to open the table. */
+
+		temp_table = dict_table_open_on_name_no_stats(
+			new_table->name, TRUE, DICT_ERR_IGNORE_NONE);
+
+		ut_a(new_table == temp_table);
 	}
 
 	return(new_table);
@@ -2371,13 +2632,13 @@ row_merge_rename_indexes(
 	trx_t*		trx,		/*!< in/out: transaction */
 	dict_table_t*	table)		/*!< in/out: table with new indexes */
 {
-	ulint		err = DB_SUCCESS;
+	db_err		err = DB_SUCCESS;
 	pars_info_t*	info = pars_info_create();
 
 	/* We use the private SQL parser of Innobase to generate the
 	query graphs needed in renaming indexes. */
 
-	static const char sql[] =
+	static const char* sql =
 		"PROCEDURE RENAME_INDEXES_PROC () IS\n"
 		"BEGIN\n"
 		"UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
@@ -2393,7 +2654,12 @@ row_merge_rename_indexes(
 
 	pars_info_add_ull_literal(info, "tableid", table->id);
 
-	err = que_eval_sql(info, sql, FALSE, trx);
+	err = static_cast<db_err>(que_eval_sql(info, sql, FALSE, trx));
+
+	DBUG_EXECUTE_IF(
+		"ib_rename_indexes_too_many_concurrent_trxs",
+		err = DB_TOO_MANY_CONCURRENT_TRXS;
+		trx->error_state = static_cast<db_err>(err););
 
 	if (err == DB_SUCCESS) {
 		dict_index_t*	index = dict_table_get_first_index(table);
@@ -2407,6 +2673,7 @@ row_merge_rename_indexes(
 		/* Even though we ensure that DDL transactions are WAIT
 		and DEADLOCK free, we could encounter other errors e.g.,
 		DB_TOO_MANY_TRANSACTIONS. */
+
 		trx->error_state = DB_SUCCESS;
 
 		ut_print_timestamp(stderr);
@@ -2495,7 +2762,7 @@ row_merge_rename_tables(
 	if (err != DB_SUCCESS) {
 err_exit:
 		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 		trx->error_state = DB_SUCCESS;
 	}
 
@@ -2530,7 +2797,8 @@ row_merge_create_index_graph(
 	node = ind_create_graph_create(index, heap);
 	thr = pars_complete_graph_for_exec(node, trx, heap);
 
-	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	ut_a(thr == que_fork_start_command(
+			static_cast<que_fork_t*>(que_node_get_parent(thr))));
 
 	que_run_threads(thr);
 
@@ -2620,11 +2888,12 @@ row_merge_drop_table(
 	dict_table_t*	table)		/*!< in: table to drop */
 {
 	/* There must be no open transactions on the table. */
-	ut_a(table->n_mysql_handles_opened == 0);
+	ut_a(table->n_ref_count == 0);
 
 	return(row_drop_table_for_mysql(table->name, trx, FALSE));
 }
 
+
 /*********************************************************************//**
 Build indexes on a table by reading a clustered index,
 creating a temporary file containing index entries, merge sorting
@@ -2650,8 +2919,13 @@ row_merge_build_indexes(
 	row_merge_block_t*	block;
 	ulint			block_size;
 	ulint			i;
+	ulint			j;
 	ulint			error;
 	int			tmpfd;
+	dict_index_t*		fts_sort_idx = NULL;
+	fts_psort_t*		psort_info = NULL;
+	fts_psort_t*		merge_info = NULL;
+	ib_int64_t		sig_count = 0;
 
 	ut_ad(trx);
 	ut_ad(old_table);
@@ -2659,18 +2933,37 @@ row_merge_build_indexes(
 	ut_ad(indexes);
 	ut_ad(n_indexes);
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	/* Allocate memory for merge file data structure and initialize
 	fields */
 
-	merge_files = mem_alloc(n_indexes * sizeof *merge_files);
-	block_size = 3 * sizeof *block;
-	block = os_mem_alloc_large(&block_size);
+	merge_files = static_cast<merge_file_t*>(
+		mem_alloc(n_indexes * sizeof *merge_files));
+
+	block_size = 3 * srv_sort_buf_size;
+	block = static_cast<row_merge_block_t*>(
+		os_mem_alloc_large(&block_size));
 
 	for (i = 0; i < n_indexes; i++) {
 
 		row_merge_file_create(&merge_files[i]);
+
+		if (indexes[i]->type & DICT_FTS) {
+			ibool	opt_doc_id_size = FALSE;
+
+			/* To build FTS index, we would need to extract
+			doc's word, Doc ID, and word's position, so
+			we need to build a "fts sort index" indexing
+			on above three 'fields' */
+			fts_sort_idx = row_merge_create_fts_sort_index(
+					indexes[i], old_table,
+					&opt_doc_id_size);
+
+			row_fts_psort_info_init(trx, table, new_table,
+						fts_sort_idx, opt_doc_id_size,
+						&psort_info, &merge_info);
+		}
 	}
 
 	tmpfd = row_merge_file_create_low();
@@ -2684,7 +2977,7 @@ row_merge_build_indexes(
 
 	error = row_merge_read_clustered_index(
 		trx, table, old_table, new_table, indexes,
-		merge_files, n_indexes, block);
+		fts_sort_idx, psort_info, merge_files, n_indexes, block);
 
 	if (error != DB_SUCCESS) {
 
@@ -2695,32 +2988,98 @@ row_merge_build_indexes(
 	sorting and inserting. */
 
 	for (i = 0; i < n_indexes; i++) {
-		error = row_merge_sort(trx, indexes[i], &merge_files[i],
+		dict_index_t*	sort_idx;
+
+		sort_idx = (indexes[i]->type & DICT_FTS)
+				? fts_sort_idx
+				: indexes[i];
+
+		if (indexes[i]->type & DICT_FTS) {
+			os_event_t	fts_parallel_merge_event;
+
+			fts_parallel_merge_event
+				= merge_info[0].psort_common->sort_event;
+
+			if (FTS_PLL_MERGE) {
+				os_event_reset(fts_parallel_merge_event);
+				row_fts_start_parallel_merge(merge_info);
+wait_again:
+				os_event_wait_time_low(
+					fts_parallel_merge_event, 1000000,
+					sig_count);
+
+				for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
+					if (merge_info[j].child_status
+					    != FTS_CHILD_COMPLETE) {
+						sig_count = os_event_reset(
+						fts_parallel_merge_event);
+
+						goto wait_again;
+					}
+				}
+			} else {
+				error = row_fts_merge_insert(
+					sort_idx, new_table,
+					psort_info, 0);
+			}
+
+		} else {
+			error = row_merge_sort(trx, sort_idx, &merge_files[i],
 				       block, &tmpfd, table);
 
-		if (error == DB_SUCCESS) {
-			error = row_merge_insert_index_tuples(
-				trx, indexes[i], new_table,
-				dict_table_zip_size(old_table),
-				merge_files[i].fd, block);
+			if (error == DB_SUCCESS) {
+				error = row_merge_insert_index_tuples(
+					trx, sort_idx, new_table,
+					dict_table_zip_size(old_table),
+					merge_files[i].fd, block);
+			}
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+			DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n");
+#endif
 		}
 
 		/* Close the temporary file to free up space. */
 		row_merge_file_destroy(&merge_files[i]);
 
+		if (indexes[i]->type & DICT_FTS) {
+			row_fts_psort_info_destroy(psort_info, merge_info);
+		}
+
 		if (error != DB_SUCCESS) {
 			trx->error_key_num = i;
 			goto func_exit;
 		}
+
+		if (indexes[i]->type & DICT_FTS && fts_enable_diag_print) {
+			char*	name = (char*) indexes[i]->name;
+
+			if (*name == TEMP_INDEX_PREFIX)  {
+				name++;
+			}
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: Finished building "
+				"full-text index %s\n", name);
+		}
 	}
 
 func_exit:
+	DBUG_EXECUTE_IF(
+		"ib_build_indexes_too_many_concurrent_trxs",
+		error = DB_TOO_MANY_CONCURRENT_TRXS;
+		trx->error_state = static_cast<db_err>(error););
+
 	row_merge_file_destroy_low(tmpfd);
 
 	for (i = 0; i < n_indexes; i++) {
 		row_merge_file_destroy(&merge_files[i]);
 	}
 
+	if (fts_sort_idx) {
+		dict_mem_index_free(fts_sort_idx);
+	}
+
 	mem_free(merge_files);
 	os_mem_free_large(block, block_size);
 
diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.cc
index 20e8c13ea70..b4827782be6 100644
--- a/storage/innobase/row/row0mysql.c
+++ b/storage/innobase/row/row0mysql.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0mysql.c
+@file row/row0mysql.cc
 Interface between Innobase row operations and MySQL.
 Contains also create table and other data dictionary operations.
 
@@ -41,6 +41,7 @@ Created 9/17/2000 Heikki Tuuri
 #include "dict0crea.h"
 #include "dict0load.h"
 #include "dict0boot.h"
+#include "dict0stats.h"
 #include "trx0roll.h"
 #include "trx0purge.h"
 #include "trx0rec.h"
@@ -51,9 +52,9 @@ Created 9/17/2000 Heikki Tuuri
 #include "btr0sea.h"
 #include "fil0fil.h"
 #include "ibuf0ibuf.h"
-#include "m_string.h"
-#include "my_sys.h"
-
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "srv0mon.h"
 
 /** Provide optional 4.x backwards compatibility for 5.0 and above */
 UNIV_INTERN ibool	row_rollback_on_timeout	= FALSE;
@@ -68,12 +69,21 @@ struct row_mysql_drop_struct{
 							/*!< list chain node */
 };
 
+#ifdef UNIV_PFS_MUTEX
+/* Key to register drop list mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	row_drop_list_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
 /** @brief List of tables we should drop in background.
 
 ALTER TABLE in MySQL requires that the table handler can drop the
 table in background when there are no queries to it any
-more.  Protected by kernel_mutex. */
+more.  Protected by row_drop_list_mutex. */
 static UT_LIST_BASE_NODE_T(row_mysql_drop_t)	row_mysql_drop_list;
+
+/** Mutex protecting the background table drop list. */
+static mutex_t row_drop_list_mutex;
+
 /** Flag: has row_mysql_drop_list been initialized? */
 static ibool	row_mysql_drop_list_inited	= FALSE;
 
@@ -83,7 +93,9 @@ static const char S_innodb_monitor[] = "innodb_monitor";
 static const char S_innodb_lock_monitor[] = "innodb_lock_monitor";
 static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor";
 static const char S_innodb_table_monitor[] = "innodb_table_monitor";
+#ifdef UNIV_MEM_DEBUG
 static const char S_innodb_mem_validate[] = "innodb_mem_validate";
+#endif /* UNIV_MEM_DEBUG */
 /* @} */
 
 /** Evaluates to true if str1 equals str2_onstack, used for comparing
@@ -314,7 +326,7 @@ row_mysql_pad_col(
 /**************************************************************//**
 Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
 The counterpart of this function is row_sel_field_store_in_mysql_format() in
-row0sel.c.
+row0sel.cc.
 @return	up to which byte we used buf in the conversion */
 UNIV_INTERN
 byte*
@@ -325,7 +337,10 @@ row_mysql_store_col_in_innobase_format(
 					this function is called! */
 	byte*		buf,		/*!< in/out: buffer for a converted
 					integer value; this must be at least
-					col_len long then! */
+					col_len long then! NOTE that dfield
+					may also get a pointer to 'buf',
+					therefore do not discard this as long
+					as dfield is used! */
 	ibool		row_format_col,	/*!< TRUE if the mysql_data is from
 					a MySQL row, FALSE if from a MySQL
 					key value;
@@ -465,7 +480,7 @@ row_mysql_store_col_in_innobase_format(
 		be stored as "$%&a " (5 bytes).	 The string ".abc "
 		will be stored as "$%&abc" (6 bytes).
 
-		The space padding will be restored in row0sel.c, function
+		The space padding will be restored in row0sel.cc, function
 		row_sel_field_store_in_mysql_format(). */
 
 		ulint		n_chars;
@@ -491,7 +506,7 @@ row_mysql_store_col_in_innobase_format(
 /**************************************************************//**
 Convert a row in the MySQL format to a row in the Innobase format. Note that
 the function to convert a MySQL format key value to an InnoDB dtuple is
-row_sel_convert_mysql_key_to_innobase() in row0sel.c. */
+row_sel_convert_mysql_key_to_innobase() in row0sel.cc. */
 static
 void
 row_mysql_convert_row_to_innobase(
@@ -542,6 +557,15 @@ row_mysql_convert_row_to_innobase(
 next_column:
 		;
 	}
+
+	/* If there is a FTS doc id column and it is not user supplied (
+	generated by server) then assign it a new doc id. */
+	if (prebuilt->table->fts) {
+
+		ut_a(prebuilt->table->fts->doc_col != ULINT_UNDEFINED);
+
+		fts_create_doc_id(prebuilt->table, row, prebuilt->heap);
+	}
 }
 
 /****************************************************************//**
@@ -572,7 +596,7 @@ handle_new_error:
 	switch (err) {
 	case DB_LOCK_WAIT_TIMEOUT:
 		if (row_rollback_on_timeout) {
-			trx_general_rollback_for_mysql(trx, NULL);
+			trx_rollback_to_savepoint(trx, NULL);
 			break;
 		}
 		/* fall through */
@@ -585,17 +609,19 @@ handle_new_error:
 	case DB_CANNOT_ADD_CONSTRAINT:
 	case DB_TOO_MANY_CONCURRENT_TRXS:
 	case DB_OUT_OF_FILE_SPACE:
+	case DB_READ_ONLY:
+	case DB_FTS_INVALID_DOCID:
 	case DB_INTERRUPTED:
 		if (savept) {
 			/* Roll back the latest, possibly incomplete
 			insertion or update */
 
-			trx_general_rollback_for_mysql(trx, savept);
+			trx_rollback_to_savepoint(trx, savept);
 		}
 		/* MySQL will roll back the latest SQL statement */
 		break;
 	case DB_LOCK_WAIT:
-		srv_suspend_mysql_thread(thr);
+		lock_wait_suspend_thread(thr);
 
 		if (trx->error_state != DB_SUCCESS) {
 			que_thr_stop_for_mysql(thr);
@@ -612,7 +638,7 @@ handle_new_error:
 		/* Roll back the whole transaction; this resolution was added
 		to version 3.23.43 */
 
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 		break;
 
 	case DB_MUST_GET_MORE_FILE_SPACE:
@@ -723,7 +749,8 @@ row_create_prebuilt(
 	calls */
 	heap = mem_heap_create(PREBUILT_HEAP_INITIAL_SIZE);
 
-	prebuilt = mem_heap_zalloc(heap, sizeof(*prebuilt));
+	prebuilt = static_cast<row_prebuilt_t*>(
+		mem_heap_zalloc(heap, sizeof(*prebuilt)));
 
 	prebuilt->magic_n = ROW_PREBUILT_ALLOCATED;
 	prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED;
@@ -737,9 +764,7 @@ row_create_prebuilt(
 	btr_pcur_reset(&prebuilt->clust_pcur);
 
 	prebuilt->select_lock_type = LOCK_NONE;
-	prebuilt->stored_select_lock_type = 99999999;
-	UNIV_MEM_INVALID(&prebuilt->stored_select_lock_type,
-			 sizeof prebuilt->stored_select_lock_type);
+	prebuilt->stored_select_lock_type = LOCK_NONE_UNSET;
 
 	prebuilt->search_tuple = dtuple_create(heap, search_tuple_n_fields);
 
@@ -752,12 +777,15 @@ row_create_prebuilt(
 	prebuilt->autoinc_error = 0;
 	prebuilt->autoinc_offset = 0;
 
-	/* Default to 1, we will set the actual value later in 
+	/* Default to 1, we will set the actual value later in
 	ha_innobase::get_auto_increment(). */
 	prebuilt->autoinc_increment = 1;
 
 	prebuilt->autoinc_last_value = 0;
 
+	/* During UPDATE and DELETE we need the doc id. */
+	prebuilt->fts_doc_id = 0;
+
 	prebuilt->mysql_row_len = mysql_row_len;
 
 	return(prebuilt);
@@ -822,28 +850,40 @@ row_prebuilt_free(
 		mem_heap_free(prebuilt->old_vers_heap);
 	}
 
-	for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
-		if (prebuilt->fetch_cache[i] != NULL) {
+	if (prebuilt->fetch_cache[0] != NULL) {
+		byte*	base = prebuilt->fetch_cache[0] - 4;
+		byte*	ptr = base;
 
-			if ((ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4(
-				     (prebuilt->fetch_cache[i]) - 4))
-			    || (ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4(
-					(prebuilt->fetch_cache[i])
-					+ prebuilt->mysql_row_len))) {
-				fputs("InnoDB: Error: trying to free"
-				      " a corrupt fetch buffer.\n", stderr);
+		for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+			byte*	row;
+			ulint	magic1;
+			ulint	magic2;
+
+			magic1 = mach_read_from_4(ptr);
+			ptr += 4;
+
+			row = ptr;
+			ptr += prebuilt->mysql_row_len;
+
+			magic2 = mach_read_from_4(ptr);
+			ptr += 4;
 
-				mem_analyze_corruption(
-					prebuilt->fetch_cache[i]);
+			if (ROW_PREBUILT_FETCH_MAGIC_N != magic1
+			    || row != prebuilt->fetch_cache[i]
+			    || ROW_PREBUILT_FETCH_MAGIC_N != magic2) {
 
+				fputs("InnoDB: Error: trying to free"
+					" a corrupt fetch buffer.\n", stderr);
+
+				mem_analyze_corruption(base);
 				ut_error;
 			}
-
-			mem_free((prebuilt->fetch_cache[i]) - 4);
 		}
+
+		mem_free(base);
 	}
 
-	dict_table_decrement_handle_count(prebuilt->table, dict_locked);
+	dict_table_close(prebuilt->table, dict_locked);
 
 	mem_heap_free(prebuilt->heap);
 }
@@ -926,8 +966,10 @@ row_get_prebuilt_insert_row(
 		prebuilt->ins_node = node;
 
 		if (prebuilt->ins_upd_rec_buff == NULL) {
-			prebuilt->ins_upd_rec_buff = mem_heap_alloc(
-				prebuilt->heap, prebuilt->mysql_row_len);
+			prebuilt->ins_upd_rec_buff = static_cast<byte*>(
+				mem_heap_alloc(
+					prebuilt->heap,
+					prebuilt->mysql_row_len));
 		}
 
 		row = dtuple_create(prebuilt->heap,
@@ -937,10 +979,12 @@ row_get_prebuilt_insert_row(
 
 		ins_node_set_new_row(node, row);
 
-		prebuilt->ins_graph = que_node_get_parent(
-			pars_complete_graph_for_exec(node,
-						     prebuilt->trx,
-						     prebuilt->heap));
+		prebuilt->ins_graph = static_cast<que_fork_t*>(
+			que_node_get_parent(
+				pars_complete_graph_for_exec(
+					node,
+					prebuilt->trx, prebuilt->heap)));
+
 		prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
 	}
 
@@ -969,29 +1013,10 @@ row_update_statistics_if_needed(
 	a counter table which is very small and updated very often. */
 
 	if (counter > 2000000000
-	    || ((ib_int64_t)counter > 16 + table->stat_n_rows / 16)) {
+	    || ((ib_int64_t) counter > 16 + table->stat_n_rows / 16)) {
 
-		dict_update_statistics(table, FALSE /* update even if stats
-						    are initialized */);
-	}
-}
-
-/*********************************************************************//**
-Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
-function should be called at the the end of an SQL statement, by the
-connection thread that owns the transaction (trx->mysql_thd). */
-UNIV_INTERN
-void
-row_unlock_table_autoinc_for_mysql(
-/*===============================*/
-	trx_t*	trx)	/*!< in/out: transaction */
-{
-	if (lock_trx_holds_autoinc_locks(trx)) {
-		mutex_enter(&kernel_mutex);
-
-		lock_release_autoinc_locks(trx);
-
-		mutex_exit(&kernel_mutex);
+		ut_ad(!mutex_own(&dict_sys->mutex));
+		dict_stats_update(table, DICT_STATS_FETCH, FALSE);
 	}
 }
 
@@ -1020,7 +1045,7 @@ row_lock_table_autoinc_for_mysql(
 
 	/* If we already hold an AUTOINC lock on the table then do nothing.
         Note: We peek at the value of the current owner without acquiring
-	the kernel mutex. **/
+	the lock mutex. **/
 	if (trx == table->autoinc_trx) {
 
 		return(DB_SUCCESS);
@@ -1047,11 +1072,11 @@ run_again:
 	/* It may be that the current session has not yet started
 	its transaction, or it has been committed: */
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr);
 
-	trx->error_state = err;
+	trx->error_state = static_cast<enum db_err>(err);
 
 	if (err != DB_SUCCESS) {
 		que_thr_stop_for_mysql(thr);
@@ -1118,16 +1143,21 @@ run_again:
 	/* It may be that the current session has not yet started
 	its transaction, or it has been committed: */
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	if (table) {
-		err = lock_table(0, table, mode, thr);
+		err = lock_table(
+			0, table,
+			static_cast<enum lock_mode>(mode), thr);
 	} else {
-		err = lock_table(0, prebuilt->table,
-				 prebuilt->select_lock_type, thr);
+		err = lock_table(
+			0, prebuilt->table,
+			static_cast<enum lock_mode>(
+				prebuilt->select_lock_type),
+			thr);
 	}
 
-	trx->error_state = err;
+	trx->error_state = static_cast<enum db_err>(err);
 
 	if (err != DB_SUCCESS) {
 		que_thr_stop_for_mysql(thr);
@@ -1167,10 +1197,11 @@ row_insert_for_mysql(
 	ibool		was_lock_wait;
 	trx_t*		trx		= prebuilt->trx;
 	ins_node_t*	node		= prebuilt->ins_node;
+	dict_table_t*	table		= prebuilt->table;
 
 	ut_ad(trx);
 
-	if (prebuilt->table->ibd_file_missing) {
+	if (table->ibd_file_missing) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr, "  InnoDB: Error:\n"
 			"InnoDB: MySQL is trying to use a table handle"
@@ -1216,7 +1247,7 @@ row_insert_for_mysql(
 
 	row_mysql_delay_if_needed();
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	if (node == NULL) {
 		row_get_prebuilt_insert_row(prebuilt);
@@ -1247,13 +1278,16 @@ run_again:
 	err = trx->error_state;
 
 	if (err != DB_SUCCESS) {
+error_exit:
 		que_thr_stop_for_mysql(thr);
 
-		/* TODO: what is this? */ thr->lock_state= QUE_THR_LOCK_ROW;
+		/* FIXME: What's this ? */
+		thr->lock_state = QUE_THR_LOCK_ROW;
 
-		was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
-							&savept);
-		thr->lock_state= QUE_THR_LOCK_NOLOCK;
+		was_lock_wait = row_mysql_handle_errors(
+			&err, trx, thr, &savept);
+
+		thr->lock_state = QUE_THR_LOCK_NOLOCK;
 
 		if (was_lock_wait) {
 			goto run_again;
@@ -1264,18 +1298,71 @@ run_again:
 		return((int) err);
 	}
 
+	if (dict_table_has_fts_index(table)) {
+		doc_id_t        doc_id;
+
+		/* Extract the doc id from the hidden FTS column */
+		doc_id = fts_get_doc_id_from_row(table, node->row);
+
+		if (doc_id <= 0) {
+			fprintf(stderr,
+				"InnoDB: FTS Doc ID must be large than 0 \n");
+			err = DB_FTS_INVALID_DOCID;
+			trx->error_state = DB_FTS_INVALID_DOCID;
+			goto error_exit;
+		}
+
+		if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+			doc_id_t	next_doc_id
+				= table->fts->cache->next_doc_id;
+
+			if (doc_id < next_doc_id) {
+				fprintf(stderr,
+					"InnoDB: FTS Doc ID must be large than"
+					" "UINT64PF" for table",
+					next_doc_id - 1);
+				ut_print_name(stderr, trx, TRUE, table->name);
+				putc('\n', stderr);
+
+				err = DB_FTS_INVALID_DOCID;
+				trx->error_state = DB_FTS_INVALID_DOCID;
+				goto error_exit;
+			}
+
+			/* Difference between Doc IDs are restricted within
+			4 bytes integer. See fts_get_encoded_len() */
+
+			if (doc_id - next_doc_id >= FTS_DOC_ID_MAX_STEP) {
+				fprintf(stderr,
+					"InnoDB: Doc ID "UINT64PF" is too"
+					" big. Its difference with largest"
+					" used Doc ID "UINT64PF" cannot"
+					" exceed or equal to %d\n",
+					doc_id, next_doc_id - 1,
+					FTS_DOC_ID_MAX_STEP);
+				err = DB_FTS_INVALID_DOCID;
+				trx->error_state = DB_FTS_INVALID_DOCID;
+				goto error_exit;
+			}
+		}
+
+		/* Pass NULL for the columns affected, since an INSERT affects
+		all FTS indexes. */
+		fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL);
+	}
+
 	que_thr_stop_for_mysql_no_error(thr, trx);
 
-	prebuilt->table->stat_n_rows++;
+	table->stat_n_rows++;
 
 	srv_n_rows_inserted++;
 
 	if (prebuilt->table->stat_n_rows == 0) {
 		/* Avoid wrap-over */
-		prebuilt->table->stat_n_rows--;
+		table->stat_n_rows--;
 	}
 
-	row_update_statistics_if_needed(prebuilt->table);
+	row_update_statistics_if_needed(table);
 	trx->op_info = "";
 
 	return((int) err);
@@ -1298,10 +1385,11 @@ row_prebuild_sel_graph(
 
 		node = sel_node_create(prebuilt->heap);
 
-		prebuilt->sel_graph = que_node_get_parent(
-			pars_complete_graph_for_exec(node,
-						     prebuilt->trx,
-						     prebuilt->heap));
+		prebuilt->sel_graph = static_cast<que_fork_t*>(
+			que_node_get_parent(
+				pars_complete_graph_for_exec(
+					static_cast<sel_node_t*>(node),
+					prebuilt->trx, prebuilt->heap)));
 
 		prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
 	}
@@ -1369,16 +1457,108 @@ row_get_prebuilt_update_vector(
 
 		prebuilt->upd_node = node;
 
-		prebuilt->upd_graph = que_node_get_parent(
-			pars_complete_graph_for_exec(node,
-						     prebuilt->trx,
-						     prebuilt->heap));
+		prebuilt->upd_graph = static_cast<que_fork_t*>(
+			que_node_get_parent(
+				pars_complete_graph_for_exec(
+					static_cast<upd_node_t*>(node),
+					prebuilt->trx, prebuilt->heap)));
+
 		prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
 	}
 
 	return(prebuilt->upd_node->update);
 }
 
+/********************************************************************
+Handle an update of a column that has an FTS index. */
+static
+void
+row_fts_do_update(
+/*==============*/
+	trx_t*		trx,		/* in: transaction */
+	dict_table_t*	table,		/* in: Table with FTS index */
+	doc_id_t	old_doc_id,	/* in: old document id */
+	doc_id_t	new_doc_id)	/* in: new document id */
+{
+	if (trx->fts_next_doc_id) {
+		fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL);
+		fts_trx_add_op(trx, table, new_doc_id, FTS_INSERT, NULL);
+	}
+}
+
+/************************************************************************
+Handles FTS matters for an update or a delete.
+NOTE: should not be called if the table does not have an FTS index. .*/
+static
+ulint
+row_fts_update_or_delete(
+/*=====================*/
+	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct in MySQL
+					handle */
+{
+	trx_t*		trx = prebuilt->trx;
+	dict_table_t*	table = prebuilt->table;
+	upd_node_t*	node = prebuilt->upd_node;
+	doc_id_t	old_doc_id = prebuilt->fts_doc_id;
+
+	ut_a(dict_table_has_fts_index(prebuilt->table));
+
+	/* Deletes are simple; get them out of the way first. */
+	if (node->is_delete) {
+		/* A delete affects all FTS indexes, so we pass NULL */
+		fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL);
+	} else {
+		doc_id_t	new_doc_id;
+
+		new_doc_id = fts_read_doc_id((byte*) &trx->fts_next_doc_id);
+
+		if (new_doc_id == 0) {
+			fprintf(stderr, " InnoDB FTS: Doc ID cannot be 0 \n");
+			return(DB_FTS_INVALID_DOCID);
+		}
+
+		row_fts_do_update(trx, table, old_doc_id, new_doc_id);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Initialize the Doc ID system for FK table with FTS index */
+static
+void
+init_fts_doc_id_for_ref(
+/*====================*/
+	dict_table_t*	table,		/*!< in: table */
+	ulint		depth)		/*!< in: recusive call depth */
+{
+	dict_foreign_t* foreign;
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	depth++;
+
+	/* Limit on tables involved in cascading delete/update */
+	if (depth > FK_MAX_CASCADE_DEL) {
+		return;
+	}
+
+	/* Loop through this table's referenced list and also
+	recursively traverse each table's foreign table list */
+	while (foreign && foreign->foreign_table) {
+		if (foreign->foreign_table->fts) {
+			fts_init_doc_id(foreign->foreign_table);
+		}
+
+		if (UT_LIST_GET_LEN(foreign->foreign_table->referenced_list)
+		    > 0 && foreign->foreign_table != table) {
+			init_fts_doc_id_for_ref(foreign->foreign_table, depth);
+		}
+
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+}
+
 /*********************************************************************//**
 Does an update or delete of a row for MySQL.
 @return	error code or DB_SUCCESS */
@@ -1446,13 +1626,13 @@ row_update_for_mysql(
 		return(DB_ERROR);
 	}
 
-	DEBUG_SYNC_C("innodb_row_update_for_mysql_begin");
-
 	trx->op_info = "updating or deleting";
 
 	row_mysql_delay_if_needed();
 
-	trx_start_if_not_started(trx);
+	init_fts_doc_id_for_ref(table, 0);
+
+	trx_start_if_not_started_xa(trx);
 
 	node = prebuilt->upd_node;
 
@@ -1522,6 +1702,15 @@ run_again:
 
 	que_thr_stop_for_mysql_no_error(thr, trx);
 
+	if (dict_table_has_fts_index(table)
+	    && trx->fts_next_doc_id != UINT64_UNDEFINED) {
+		err = row_fts_update_or_delete(prebuilt);
+		if (err != DB_SUCCESS) {
+			trx->op_info = "";
+			return((int) err);
+		}
+	}
+
 	if (node->is_delete) {
 		if (prebuilt->table->stat_n_rows > 0) {
 			prebuilt->table->stat_n_rows--;
@@ -1618,7 +1807,7 @@ row_unlock_for_mysql(
 			index = btr_pcur_get_btr_cur(clust_pcur)->index;
 		}
 
-		if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
+		if (!dict_index_is_clust(index)) {
 			/* This is not a clustered index record.  We
 			do not know how to unlock the record. */
 			goto no_unlock;
@@ -1650,19 +1839,23 @@ row_unlock_for_mysql(
 			/* We did not update the record: unlock it */
 
 			rec = btr_pcur_get_rec(pcur);
-			index = btr_pcur_get_btr_cur(pcur)->index;
 
-			lock_rec_unlock(trx, btr_pcur_get_block(pcur),
-					rec, prebuilt->select_lock_type);
+			lock_rec_unlock(
+				trx,
+				btr_pcur_get_block(pcur),
+				rec,
+				static_cast<enum lock_mode>(
+					prebuilt->select_lock_type));
 
 			if (prebuilt->new_rec_locks >= 2) {
 				rec = btr_pcur_get_rec(clust_pcur);
-				index = btr_pcur_get_btr_cur(clust_pcur)->index;
 
-				lock_rec_unlock(trx,
-						btr_pcur_get_block(clust_pcur),
-						rec,
-						prebuilt->select_lock_type);
+				lock_rec_unlock(
+					trx,
+					btr_pcur_get_block(clust_pcur),
+					rec,
+					static_cast<enum lock_mode>(
+						prebuilt->select_lock_type));
 			}
 		}
 no_unlock:
@@ -1722,7 +1915,7 @@ run_again:
 
 		que_thr_stop_for_mysql(thr);
 
-		srv_suspend_mysql_thread(thr);
+		lock_wait_suspend_thread(thr);
 
 		/* Note that a lock wait may also end in a lock wait timeout,
 		or this transaction is picked as a victim in selective
@@ -1788,7 +1981,7 @@ row_mysql_freeze_data_dictionary_func(
 {
 	ut_a(trx->dict_operation_lock_mode == 0);
 
-	rw_lock_s_lock_inline(&dict_operation_lock, 0, file, line);
+	rw_lock_s_lock_func(&dict_operation_lock, 0, file, line);
 
 	trx->dict_operation_lock_mode = RW_S_LATCH;
 }
@@ -1825,7 +2018,7 @@ row_mysql_lock_data_dictionary_func(
 	/* Serialize data dictionary operations with dictionary mutex:
 	no deadlocks or lock waits can occur then in these operations */
 
-	rw_lock_x_lock_inline(&dict_operation_lock, 0, file, line);
+	rw_lock_x_lock_func(&dict_operation_lock, 0, file, line);
 	trx->dict_operation_lock_mode = RW_X_LATCH;
 
 	mutex_enter(&(dict_sys->mutex));
@@ -1904,15 +2097,13 @@ err_exit:
 		goto err_exit;
 	}
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	/* The table name is prefixed with the database name and a '/'.
 	Certain table names starting with 'innodb_' have their special
 	meaning regardless of the database name.  Thus, we need to
 	ignore the database name prefix in the comparisons. */
-	table_name = strchr(table->name, '/');
-	ut_a(table_name);
-	table_name++;
+	table_name = dict_remove_db_name(table->name);
 	table_name_len = strlen(table_name) + 1;
 
 	if (STR_EQ(table_name, table_name_len, S_innodb_monitor)) {
@@ -1925,23 +2116,24 @@ err_exit:
 		/* The lock timeout monitor thread also takes care
 		of InnoDB monitor prints */
 
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(srv_timeout_event);
 	} else if (STR_EQ(table_name, table_name_len,
 			  S_innodb_lock_monitor)) {
 
 		srv_print_innodb_monitor = TRUE;
 		srv_print_innodb_lock_monitor = TRUE;
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(srv_timeout_event);
 	} else if (STR_EQ(table_name, table_name_len,
 			  S_innodb_tablespace_monitor)) {
 
 		srv_print_innodb_tablespace_monitor = TRUE;
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(srv_timeout_event);
 	} else if (STR_EQ(table_name, table_name_len,
 			  S_innodb_table_monitor)) {
 
 		srv_print_innodb_table_monitor = TRUE;
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(srv_timeout_event);
+#ifdef UNIV_MEM_DEBUG
 	} else if (STR_EQ(table_name, table_name_len,
 			  S_innodb_mem_validate)) {
 		/* We define here a debugging feature intended for
@@ -1954,15 +2146,12 @@ err_exit:
 		      "quiet because allocation from a mem heap"
 		      " is not protected\n"
 		      "by any semaphore.\n", stderr);
-#ifdef UNIV_MEM_DEBUG
 		ut_a(mem_validate());
 		fputs("Memory validated\n", stderr);
-#else /* UNIV_MEM_DEBUG */
-		fputs("Memory NOT validated (recompile with UNIV_MEM_DEBUG)\n",
-		      stderr);
 #endif /* UNIV_MEM_DEBUG */
 	}
 
+
 	heap = mem_heap_create(512);
 
 	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
@@ -1971,7 +2160,9 @@ err_exit:
 
 	thr = pars_complete_graph_for_exec(node, trx, heap);
 
-	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	ut_a(thr == que_fork_start_command(
+			static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
 	que_run_threads(thr);
 
 	err = trx->error_state;
@@ -1981,7 +2172,7 @@ err_exit:
 		break;
 	case DB_OUT_OF_FILE_SPACE:
 		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 
 		ut_print_timestamp(stderr);
 		fputs("  InnoDB: Warning: cannot create table ",
@@ -1989,7 +2180,16 @@ err_exit:
 		ut_print_name(stderr, trx, TRUE, table->name);
 		fputs(" because tablespace full\n", stderr);
 
-		if (dict_table_get_low(table->name)) {
+		if (dict_table_open_on_name_no_stats(
+			table->name, FALSE, DICT_ERR_IGNORE_NONE)) {
+
+			/* Make things easy for the drop table code. */
+
+			if (table->can_be_evicted) {
+				dict_table_move_from_lru_to_non_lru(table);
+			}
+
+			dict_table_close(table, FALSE);
 
 			row_drop_table_for_mysql(table->name, trx, FALSE);
 			trx_commit_for_mysql(trx);
@@ -1999,8 +2199,7 @@ err_exit:
 	case DB_TOO_MANY_CONCURRENT_TRXS:
 		/* We already have .ibd file here. it should be deleted. */
 
-		if (table->space && !fil_delete_tablespace(table->space,
-							   FALSE)) {
+		if (table->space && !fil_delete_tablespace(table->space)) {
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
 				"  InnoDB: Error: not able to"
@@ -2017,7 +2216,7 @@ err_exit:
 		table already exists */
 
 		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 		dict_mem_table_free(table);
 		break;
 	}
@@ -2055,7 +2254,9 @@ row_create_index_for_mysql(
 	ulint		i;
 	ulint		len;
 	char*		table_name;
+	char*		index_name;
 	dict_table_t*	table;
+	ibool		is_fts = FALSE;
 
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
@@ -2068,10 +2269,14 @@ row_create_index_for_mysql(
 	table later, after the index object is freed (inside
 	que_run_threads()) and thus index->table_name is not available. */
 	table_name = mem_strdup(index->table_name);
+	index_name = mem_strdup(index->name);
 
-	table = dict_table_get_low(table_name);
+	is_fts = (index->type == DICT_FTS);
 
-	trx_start_if_not_started(trx);
+	table = dict_table_open_on_name_no_stats(table_name, TRUE,
+						 DICT_ERR_IGNORE_NONE);
+
+	trx_start_if_not_started_xa(trx);
 
 	for (i = 0; i < index->n_def; i++) {
 		/* Check that prefix_len and actual length
@@ -2097,27 +2302,40 @@ row_create_index_for_mysql(
 	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
 
 	/* Note that the space id where we store the index is inherited from
-	the table in dict_build_index_def_step() in dict0crea.c. */
+	the table in dict_build_index_def_step() in dict0crea.cc. */
 
 	node = ind_create_graph_create(index, heap);
 
 	thr = pars_complete_graph_for_exec(node, trx, heap);
 
-	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	ut_a(thr == que_fork_start_command(
+			static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
 	que_run_threads(thr);
 
 	err = trx->error_state;
 
 	que_graph_free((que_t*) que_node_get_parent(thr));
 
+	/* Create the index specific FTS auxiliary tables. */
+	if (err == DB_SUCCESS && is_fts) {
+		dict_index_t*	idx;
+
+		idx = dict_table_get_index_on_name(table, index_name);
+
+		ut_ad(idx);
+		err = fts_create_index_tables(trx, idx);
+	}
+
 error_handling:
+	dict_table_close(table, TRUE);
 
 	if (err != DB_SUCCESS) {
 		/* We have special error handling here */
 
 		trx->error_state = DB_SUCCESS;
 
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 
 		row_drop_table_for_mysql(table_name, trx, FALSE);
 
@@ -2129,6 +2347,7 @@ error_handling:
 	trx->op_info = "";
 
 	mem_free(table_name);
+	mem_free(index_name);
 
 	return((int) err);
 }
@@ -2170,7 +2389,7 @@ row_table_add_foreign_constraints(
 
 	trx->op_info = "adding foreign keys";
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
 
@@ -2186,7 +2405,7 @@ row_table_add_foreign_constraints(
 
 		trx->error_state = DB_SUCCESS;
 
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 
 		row_drop_table_for_mysql(name, trx, FALSE);
 
@@ -2204,7 +2423,7 @@ in ALTER TABLE to the fact that the table handler does not remove the
 table before all handles to it has been removed. Furhermore, the MySQL's
 call to drop table must be non-blocking. Therefore we do the drop table
 as a background operation, which is taken care of by the master thread
-in srv0srv.c.
+in srv0srv.cc.
 @return	error code or DB_SUCCESS */
 static
 int
@@ -2245,7 +2464,7 @@ row_drop_table_for_mysql_in_background(
 }
 
 /*********************************************************************//**
-The master thread in srv0srv.c calls this regularly to drop tables which
+The master thread in srv0srv.cc calls this regularly to drop tables which
 we must drop in background after queries to them have ended. Such lazy
 dropping of tables is needed in ALTER TABLE on Unix.
 @return	how many tables dropped + remaining tables in list */
@@ -2259,19 +2478,15 @@ row_drop_tables_for_mysql_in_background(void)
 	ulint			n_tables;
 	ulint			n_tables_dropped = 0;
 loop:
-	mutex_enter(&kernel_mutex);
+	mutex_enter(&row_drop_list_mutex);
 
-	if (!row_mysql_drop_list_inited) {
-
-		UT_LIST_INIT(row_mysql_drop_list);
-		row_mysql_drop_list_inited = TRUE;
-	}
+	ut_a(row_mysql_drop_list_inited);
 
 	drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
 
 	n_tables = UT_LIST_GET_LEN(row_mysql_drop_list);
 
-	mutex_exit(&kernel_mutex);
+	mutex_exit(&row_drop_list_mutex);
 
 	if (drop == NULL) {
 		/* All tables dropped */
@@ -2279,9 +2494,8 @@ loop:
 		return(n_tables + n_tables_dropped);
 	}
 
-	mutex_enter(&(dict_sys->mutex));
-	table = dict_table_get_low(drop->table_name);
-	mutex_exit(&(dict_sys->mutex));
+	table = dict_table_open_on_name_no_stats(drop->table_name, FALSE,
+						 DICT_ERR_IGNORE_NONE);
 
 	if (table == NULL) {
 		/* If for some reason the table has already been dropped
@@ -2290,6 +2504,10 @@ loop:
 		goto already_dropped;
 	}
 
+	ut_a(!table->can_be_evicted);
+
+	dict_table_close(table, FALSE);
+
 	if (DB_SUCCESS != row_drop_table_for_mysql_in_background(
 		    drop->table_name)) {
 		/* If the DROP fails for some table, we return, and let the
@@ -2301,10 +2519,12 @@ loop:
 	n_tables_dropped++;
 
 already_dropped:
-	mutex_enter(&kernel_mutex);
+	mutex_enter(&row_drop_list_mutex);
 
 	UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list, drop);
 
+	MONITOR_DEC(MONITOR_BACKGROUND_DROP_TABLE);
+
 	ut_print_timestamp(stderr);
 	fputs("  InnoDB: Dropped table ", stderr);
 	ut_print_name(stderr, NULL, TRUE, drop->table_name);
@@ -2314,29 +2534,31 @@ already_dropped:
 
 	mem_free(drop);
 
-	mutex_exit(&kernel_mutex);
+	mutex_exit(&row_drop_list_mutex);
 
 	goto loop;
 }
 
 /*********************************************************************//**
-Get the background drop list length. NOTE: the caller must own the kernel
-mutex!
+Get the background drop list length. NOTE: the caller must own the
+drop list mutex!
 @return	how many tables in list */
 UNIV_INTERN
 ulint
 row_get_background_drop_list_len_low(void)
 /*======================================*/
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ulint	len;
 
-	if (!row_mysql_drop_list_inited) {
+	mutex_enter(&row_drop_list_mutex);
 
-		UT_LIST_INIT(row_mysql_drop_list);
-		row_mysql_drop_list_inited = TRUE;
-	}
+	ut_a(row_mysql_drop_list_inited);
+
+	len = UT_LIST_GET_LEN(row_mysql_drop_list);
 
-	return(UT_LIST_GET_LEN(row_mysql_drop_list));
+	mutex_exit(&row_drop_list_mutex);
+
+	return(len);
 }
 
 /*********************************************************************//**
@@ -2354,40 +2576,38 @@ row_add_table_to_background_drop_list(
 {
 	row_mysql_drop_t*	drop;
 
-	mutex_enter(&kernel_mutex);
-
-	if (!row_mysql_drop_list_inited) {
+	mutex_enter(&row_drop_list_mutex);
 
-		UT_LIST_INIT(row_mysql_drop_list);
-		row_mysql_drop_list_inited = TRUE;
-	}
+	ut_a(row_mysql_drop_list_inited);
 
 	/* Look if the table already is in the drop list */
-	drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+	for (drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+	     drop != NULL;
+	     drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop)) {
 
-	while (drop != NULL) {
 		if (strcmp(drop->table_name, name) == 0) {
 			/* Already in the list */
 
-			mutex_exit(&kernel_mutex);
+			mutex_exit(&row_drop_list_mutex);
 
 			return(FALSE);
 		}
-
-		drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop);
 	}
 
-	drop = mem_alloc(sizeof(row_mysql_drop_t));
+	drop = static_cast<row_mysql_drop_t*>(
+		mem_alloc(sizeof(row_mysql_drop_t)));
 
 	drop->table_name = mem_strdup(name);
 
 	UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list, drop);
 
+	MONITOR_INC(MONITOR_BACKGROUND_DROP_TABLE);
+
 	/*	fputs("InnoDB: Adding table ", stderr);
 	ut_print_name(stderr, trx, TRUE, drop->table_name);
 	fputs(" to background drop list\n", stderr); */
 
-	mutex_exit(&kernel_mutex);
+	mutex_exit(&row_drop_list_mutex);
 
 	return(TRUE);
 }
@@ -2427,7 +2647,7 @@ row_discard_tablespace_for_mysql(
 	3) Insert buffer: we remove all entries for the tablespace in
 	the insert buffer tree; as long as the tablespace mem object
 	does not exist, ongoing insert buffer page merges are
-	discarded in buf0rea.c. If we recreate the tablespace mem
+	discarded in buf0rea.cc. If we recreate the tablespace mem
 	object with IMPORT TABLESPACE later, then the tablespace will
 	have the same id, but the tablespace_version field in the mem
 	object is different, and ongoing old insert buffer page merges
@@ -2441,14 +2661,15 @@ row_discard_tablespace_for_mysql(
 	discard. We also reserve the data dictionary latch. */
 
 	trx->op_info = "discarding tablespace";
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	/* Serialize data dictionary operations with dictionary mutex:
 	no deadlocks can occur then in these operations */
 
 	row_mysql_lock_data_dictionary(trx);
 
-	table = dict_table_get_low(name);
+	table = dict_table_open_on_name_no_stats(name, TRUE,
+						 DICT_ERR_IGNORE_NONE);
 
 	if (!table) {
 		err = DB_TABLE_NOT_FOUND;
@@ -2551,7 +2772,7 @@ row_discard_tablespace_for_mysql(
 
 	if (err != DB_SUCCESS) {
 		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 		trx->error_state = DB_SUCCESS;
 	} else {
 		dict_table_change_id_in_cache(table, new_id);
@@ -2560,7 +2781,7 @@ row_discard_tablespace_for_mysql(
 
 		if (!success) {
 			trx->error_state = DB_SUCCESS;
-			trx_general_rollback_for_mysql(trx, NULL);
+			trx_rollback_to_savepoint(trx, NULL);
 			trx->error_state = DB_SUCCESS;
 
 			err = DB_ERROR;
@@ -2573,6 +2794,11 @@ row_discard_tablespace_for_mysql(
 	}
 
 funct_exit:
+
+	if (table != NULL) {
+		dict_table_close(table, TRUE);
+	}
+
 	trx_commit_for_mysql(trx);
 
 	row_mysql_unlock_data_dictionary(trx);
@@ -2595,10 +2821,10 @@ row_import_tablespace_for_mysql(
 {
 	dict_table_t*	table;
 	ibool		success;
-	ib_uint64_t	current_lsn;
+	lsn_t		current_lsn;
 	ulint		err		= DB_SUCCESS;
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	trx->op_info = "importing tablespace";
 
@@ -2629,6 +2855,7 @@ row_import_tablespace_for_mysql(
 		err = DB_ERROR;
 
 		row_mysql_lock_data_dictionary(trx);
+		table = NULL;
 
 		goto funct_exit;
 	}
@@ -2638,7 +2865,8 @@ row_import_tablespace_for_mysql(
 
 	row_mysql_lock_data_dictionary(trx);
 
-	table = dict_table_get_low(name);
+	table = dict_table_open_on_name_no_stats(name, TRUE,
+						 DICT_ERR_IGNORE_NONE);
 
 	if (!table) {
 		ut_print_timestamp(stderr);
@@ -2688,7 +2916,7 @@ row_import_tablespace_for_mysql(
 
 	success = fil_open_single_table_tablespace(
 		TRUE, table->space,
-		table->flags == DICT_TF_COMPACT ? 0 : table->flags,
+		dict_tf_to_fsp_flags(table->flags),
 		table->name);
 	if (success) {
 		table->ibd_file_missing = FALSE;
@@ -2709,6 +2937,11 @@ row_import_tablespace_for_mysql(
 	}
 
 funct_exit:
+
+	if (table != NULL) {
+		dict_table_close(table, TRUE);
+	}
+
 	trx_commit_for_mysql(trx);
 
 	row_mysql_unlock_data_dictionary(trx);
@@ -2740,6 +2973,7 @@ row_truncate_table_for_mysql(
 	table_id_t	new_id;
 	ulint		recreate_space = 0;
 	pars_info_t*	info = NULL;
+	ibool		has_internal_doc_id;
 
 	/* How do we prevent crashes caused by ongoing operations on
 	the table? Old operations could try to access non-existent
@@ -2795,7 +3029,7 @@ row_truncate_table_for_mysql(
 
 	trx->op_info = "truncating table";
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	/* Serialize data dictionary operations with dictionary mutex:
 	no deadlocks can occur then in these operations */
@@ -2844,7 +3078,7 @@ row_truncate_table_for_mysql(
 
 	/* TODO: could we replace the counter n_foreign_key_checks_running
 	with lock checks on the table? Acquire here an exclusive lock on the
-	table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+	table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
 	they can cope with the table having been truncated here? Foreign key
 	checks take an IS or IX lock on the table. */
 
@@ -2862,6 +3096,7 @@ row_truncate_table_for_mysql(
 	}
 
 	/* Remove all locks except the table-level S and X locks. */
+
 	lock_remove_all_on_table(table, FALSE);
 
 	trx->table_id = table->id;
@@ -2885,7 +3120,8 @@ row_truncate_table_for_mysql(
 
 			if (space == ULINT_UNDEFINED
 			    || fil_create_new_single_table_tablespace(
-				    space, table->name, FALSE, flags,
+				    space, table->name, FALSE,
+				    flags, table->flags2,
 				    FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
 				dict_table_x_unlock_indexes(table);
 				ut_print_timestamp(stderr);
@@ -2933,7 +3169,7 @@ row_truncate_table_for_mysql(
 	tuple = dtuple_create(heap, 1);
 	dfield = dtuple_get_nth_field(tuple, 0);
 
-	buf = mem_heap_alloc(heap, 8);
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
 	mach_write_to_8(buf, table->id);
 
 	dfield_set_data(dfield, buf, 8);
@@ -2956,7 +3192,8 @@ row_truncate_table_for_mysql(
 
 		rec = btr_pcur_get_rec(&pcur);
 
-		field = rec_get_nth_field_old(rec, 0, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len);
 		ut_ad(len == 8);
 
 		if (memcmp(buf, field, len) != 0) {
@@ -2978,7 +3215,7 @@ row_truncate_table_for_mysql(
 
 		if (root_page_no != FIL_NULL) {
 			page_rec_write_field(
-				rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+				rec, DICT_FLD__SYS_INDEXES__PAGE_NO,
 				root_page_no, &mtr);
 			/* We will need to commit and restart the
 			mini-transaction in order to avoid deadlocks.
@@ -3006,6 +3243,49 @@ next_rec:
 
 	dict_hdr_get_new_id(&new_id, NULL, NULL);
 
+	/* Create new FTS auxiliary tables with the new_id, and
+	drop the old index later, only if everything runs successful. */
+	has_internal_doc_id = dict_table_has_fts_index(table)
+			      || DICT_TF2_FLAG_IS_SET(
+				table, DICT_TF2_FTS_HAS_DOC_ID);
+	if (has_internal_doc_id) {
+		dict_table_t	fts_table;
+		ulint		i;
+
+		fts_table.name = table->name;
+		fts_table.id = new_id;
+
+		err = fts_create_common_tables(trx, &fts_table, table->name,
+					       TRUE);
+
+		if (err == DB_SUCCESS) {
+			for (i = 0; i < ib_vector_size(table->fts->indexes);
+			     i++) {
+				dict_index_t*	fts_index;
+
+				fts_index = static_cast<dict_index_t*>(
+					ib_vector_getp(
+						table->fts->indexes, i));
+
+				fts_create_index_tables_low(
+					trx, fts_index, table->name, new_id);
+			}
+		}
+
+		if (err != DB_SUCCESS) {
+			trx->error_state = DB_SUCCESS;
+			trx_rollback_to_savepoint(trx, NULL);
+			trx->error_state = DB_SUCCESS;
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Unable to truncate FTS index for"
+			      " table", stderr);
+			ut_print_name(stderr, trx, TRUE, table->name);
+			fputs("\n", stderr);
+
+			goto funct_exit;
+		}
+	}
+
 	info = pars_info_create();
 
 	pars_info_add_int4_literal(info, "space", (lint) table->space);
@@ -3029,7 +3309,7 @@ next_rec:
 
 	if (err != DB_SUCCESS) {
 		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 		trx->error_state = DB_SUCCESS;
 		ut_print_timestamp(stderr);
 		fputs("  InnoDB: Unable to assign a new identifier to table ",
@@ -3038,17 +3318,41 @@ next_rec:
 		fputs("\n"
 		      "InnoDB: after truncating it.  Background processes"
 		      " may corrupt the table!\n", stderr);
+
+		/* Fail to update the table id, so drop the new
+		FTS auxiliary tables */
+		if (has_internal_doc_id) {
+			dict_table_t	fts_table;
+
+			fts_table.name = table->name;
+			fts_table.id = new_id;
+
+			fts_drop_tables(trx, &fts_table);
+		}
+
 		err = DB_ERROR;
 	} else {
+		/* Drop the old FTS index */
+		if (has_internal_doc_id) {
+			fts_drop_tables(trx, table);
+		}
+
 		dict_table_change_id_in_cache(table, new_id);
+
+		/* Reset the Doc ID in cache to 0 */
+		if (has_internal_doc_id && table->fts->cache) {
+			table->fts->fts_status |= TABLE_DICT_LOCKED;
+			fts_update_next_doc_id(table, NULL, 0);
+			fts_cache_clear(table->fts->cache, TRUE);
+			fts_cache_init(table->fts->cache);
+			table->fts->fts_status &= ~TABLE_DICT_LOCKED;
+		}
 	}
 
 	/* Reset auto-increment. */
 	dict_table_autoinc_lock(table);
 	dict_table_autoinc_initialize(table, 1);
 	dict_table_autoinc_unlock(table);
-	dict_update_statistics(table, FALSE /* update even if stats are
-					    initialized */);
 
 	trx_commit_for_mysql(trx);
 
@@ -3056,6 +3360,11 @@ funct_exit:
 
 	row_mysql_unlock_data_dictionary(trx);
 
+	/* We are supposed to recalc and save the stats only
+	on ANALYZE, but it also makes sense to do so on TRUNCATE */
+	dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT_SILENT,
+			  FALSE);
+
 	trx->op_info = "";
 
 	srv_wake_master_thread();
@@ -3087,7 +3396,8 @@ row_drop_table_for_mysql(
 	const char*	table_name;
 	ulint		namelen;
 	ibool		locked_dictionary	= FALSE;
-	pars_info_t*    info			= NULL;
+	ibool		fts_bg_thread_exited	= FALSE;
+	pars_info_t*	info			= NULL;
 
 	ut_a(name != NULL);
 
@@ -3101,17 +3411,19 @@ row_drop_table_for_mysql(
 		return(DB_ERROR);
 	}
 
-	trx->op_info = "dropping table";
-
-	trx_start_if_not_started(trx);
-
 	/* The table name is prefixed with the database name and a '/'.
 	Certain table names starting with 'innodb_' have their special
 	meaning regardless of the database name.  Thus, we need to
 	ignore the database name prefix in the comparisons. */
 	table_name = strchr(name, '/');
-	ut_a(table_name);
-	table_name++;
+
+	if (table_name) {
+		table_name++;
+	} else {
+		/* Ancillary FTS tables don't have '/' characters. */
+		table_name = name;
+	}
+
 	namelen = strlen(table_name) + 1;
 
 	if (namelen == sizeof S_innodb_monitor
@@ -3143,6 +3455,10 @@ row_drop_table_for_mysql(
 	/* Serialize data dictionary operations with dictionary mutex:
 	no deadlocks can occur then in these operations */
 
+	trx->op_info = "dropping table";
+
+	trx_start_if_not_started(trx);
+
 	if (trx->dict_operation_lock_mode != RW_X_LATCH) {
 		/* Prevent foreign key checks etc. while we are dropping the
 		table */
@@ -3152,13 +3468,16 @@ row_drop_table_for_mysql(
 		locked_dictionary = TRUE;
 	}
 
+retry:
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
-	table = dict_table_get_low_ignore_err(
-		name, DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT);
+	table = dict_table_open_on_name_no_stats(
+		name, TRUE,
+		static_cast<dict_err_ignore_t>(
+			DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT));
 
 	if (!table) {
 		err = DB_TABLE_NOT_FOUND;
@@ -3179,6 +3498,46 @@ row_drop_table_for_mysql(
 		goto funct_exit;
 	}
 
+	if (table->fts) {
+		fts_t*          fts = table->fts;
+
+		/* It is possible that background 'Add' thread fts_add_thread()
+		just gets called and the fts_optimize_thread()
+		is processing deleted records. There could be undetected
+		deadlock between threads synchronization and dict_sys_mutex
+		since fts_parse_sql() requires dict_sys->mutex. Ask the
+		background thread to exit before proceeds to drop table to
+		avoid undetected deadlocks */
+		row_mysql_unlock_data_dictionary(trx);
+
+		if (fts->add_wq && (!fts_bg_thread_exited)) {
+			/* Wait for any background threads accessing the table
+			to exit. */
+			mutex_enter(&fts->bg_threads_mutex);
+			fts->fts_status |= BG_THREAD_STOP;
+
+			dict_table_wait_for_bg_threads_to_exit(table, 250000);
+
+			mutex_exit(&fts->bg_threads_mutex);
+
+			row_mysql_lock_data_dictionary(trx);
+			fts_bg_thread_exited = TRUE;
+			goto retry;
+		} else {
+			fts_optimize_remove_table(table);
+			row_mysql_lock_data_dictionary(trx);
+		}
+	}
+
+	/* Move the table the the non-LRU list so that it isn't
+	considered for eviction. */
+
+	if (table->can_be_evicted) {
+		dict_table_move_from_lru_to_non_lru(table);
+	}
+
+	dict_table_close(table, TRUE);
+
 	/* Check if the table is referenced by foreign key constraints from
 	some other table (not the table itself) */
 
@@ -3218,25 +3577,34 @@ check_next_foreign:
 		goto check_next_foreign;
 	}
 
-	if (table->n_mysql_handles_opened > 0) {
-		ibool	added;
+	/* TODO: could we replace the counter n_foreign_key_checks_running
+	with lock checks on the table? Acquire here an exclusive lock on the
+	table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
+	they can cope with the table having been dropped here? Foreign key
+	checks take an IS or IX lock on the table. */
 
-		added = row_add_table_to_background_drop_list(table->name);
+	if (table->n_foreign_key_checks_running > 0) {
+
+		const char*	table_name = table->name;
+		ibool		added;
+
+		added = row_add_table_to_background_drop_list(table_name);
 
 		if (added) {
 			ut_print_timestamp(stderr);
-			fputs("  InnoDB: Warning: MySQL is"
-			      " trying to drop table ", stderr);
-			ut_print_name(stderr, trx, TRUE, table->name);
+			fputs("  InnoDB: You are trying to drop table ",
+			      stderr);
+			ut_print_name(stderr, trx, TRUE, table_name);
 			fputs("\n"
-			      "InnoDB: though there are still"
-			      " open handles to it.\n"
-			      "InnoDB: Adding the table to the"
-			      " background drop queue.\n",
+			      "InnoDB: though there is a"
+			      " foreign key check running on it.\n"
+			      "InnoDB: Adding the table to"
+			      " the background drop queue.\n",
 			      stderr);
 
 			/* We return DB_SUCCESS to MySQL though the drop will
 			happen lazily later */
+
 			err = DB_SUCCESS;
 		} else {
 			/* The table is already in the background drop list */
@@ -3246,34 +3614,42 @@ check_next_foreign:
 		goto funct_exit;
 	}
 
-	/* TODO: could we replace the counter n_foreign_key_checks_running
-	with lock checks on the table? Acquire here an exclusive lock on the
-	table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
-	they can cope with the table having been dropped here? Foreign key
-	checks take an IS or IX lock on the table. */
+	/* Remove all locks that are on the table or its records, if there
+	are no refernces to the table but it has record locks, we release
+	the record locks unconditionally. One use case is:
 
-	if (table->n_foreign_key_checks_running > 0) {
+		CREATE TABLE t2 (PRIMARY KEY (a)) SELECT * FROM t1;
 
-		const char*	table_name = table->name;
-		ibool		added;
+	If after the user transaction has done the SELECT and there is a
+	problem in completing the CREATE TABLE operation, MySQL will drop
+	the table. InnoDB will create a new background transaction to do the
+	actual drop, the trx instance that is passed to this function. To
+	preserve existing behaviour we remove the locks but ideally we
+	shouldn't have to. There should never be record locks on a table
+	that is going to be dropped. */
 
-		added = row_add_table_to_background_drop_list(table_name);
+	if (table->n_ref_count == 0) {
+		lock_remove_all_on_table(table, TRUE);
+		ut_a(table->n_rec_locks == 0);
+	} else if (table->n_ref_count > 0 || table->n_rec_locks > 0) {
+		ibool	added;
+
+		added = row_add_table_to_background_drop_list(table->name);
 
 		if (added) {
 			ut_print_timestamp(stderr);
-			fputs("  InnoDB: You are trying to drop table ",
-			      stderr);
-			ut_print_name(stderr, trx, TRUE, table_name);
+			fputs("  InnoDB: Warning: MySQL is"
+			      " trying to drop table ", stderr);
+			ut_print_name(stderr, trx, TRUE, table->name);
 			fputs("\n"
-			      "InnoDB: though there is a"
-			      " foreign key check running on it.\n"
-			      "InnoDB: Adding the table to"
-			      " the background drop queue.\n",
+			      "InnoDB: though there are still"
+			      " open handles to it.\n"
+			      "InnoDB: Adding the table to the"
+			      " background drop queue.\n",
 			      stderr);
 
 			/* We return DB_SUCCESS to MySQL though the drop will
 			happen lazily later */
-
 			err = DB_SUCCESS;
 		} else {
 			/* The table is already in the background drop list */
@@ -3283,8 +3659,10 @@ check_next_foreign:
 		goto funct_exit;
 	}
 
-	/* Remove all locks there are on the table or its records */
-	lock_remove_all_on_table(table, TRUE);
+	/* If we get this far then the table to be dropped must not have
+	any table or record locks on it. */
+
+	ut_a(!lock_table_has_locks(table));
 
 	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
 	trx->table_id = table->id;
@@ -3389,7 +3767,6 @@ check_next_foreign:
 
 	switch (err) {
 		ibool		is_temp;
-		const char*	name_or_path;
 		mem_heap_t*	heap;
 
 	case DB_SUCCESS:
@@ -3402,14 +3779,26 @@ check_next_foreign:
 		name = mem_heap_strdup(heap, name);
 		space_id = table->space;
 
-		if (table->dir_path_of_temp_table != NULL) {
-			name_or_path = mem_heap_strdup(
-				heap, table->dir_path_of_temp_table);
-			is_temp = TRUE;
-		} else {
-			name_or_path = name;
-			is_temp = (table->flags >> DICT_TF2_SHIFT)
-				& DICT_TF2_TEMPORARY;
+		is_temp = table->flags2 & DICT_TF2_TEMPORARY;
+		ut_a(table->dir_path_of_temp_table == NULL || is_temp);
+
+		if (dict_table_has_fts_index(table)
+		    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+			ut_ad(table->n_ref_count == 0);
+			err = fts_drop_tables(trx, table);
+
+			if (err != DB_SUCCESS) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr," InnoDB: Error: (%lu) not "
+					"able to remove ancillary FTS tables "
+					"for table ", err);
+				ut_print_name(stderr, trx, TRUE, name);
+				fputs("\n", stderr);
+
+				goto funct_exit;
+			}
+
+			fts_free(table);
 		}
 
 		dict_table_remove_from_cache(table);
@@ -3427,10 +3816,8 @@ check_next_foreign:
 		wrong: we do not want to delete valuable data of the user */
 
 		if (err == DB_SUCCESS && space_id > 0) {
-			if (!fil_space_for_table_exists_in_mem(space_id,
-							       name_or_path,
-							       is_temp, FALSE,
-							       !is_temp)) {
+			if (!fil_space_for_table_exists_in_mem(
+					space_id, name, FALSE, !is_temp)) {
 				err = DB_SUCCESS;
 
 				fprintf(stderr,
@@ -3439,7 +3826,7 @@ check_next_foreign:
 					"InnoDB: of table ");
 				ut_print_name(stderr, trx, TRUE, name);
 				fprintf(stderr, ".\n");
-			} else if (!fil_delete_tablespace(space_id, FALSE)) {
+			} else if (!fil_delete_tablespace(space_id)) {
 				fprintf(stderr,
 					"InnoDB: We removed now the InnoDB"
 					" internal data dictionary entry\n"
@@ -3461,12 +3848,35 @@ check_next_foreign:
 		mem_heap_free(heap);
 		break;
 
+	case DB_OUT_OF_FILE_SPACE:
+		err = DB_MUST_GET_MORE_FILE_SPACE;
+
+		row_mysql_handle_errors(&err, trx, NULL, NULL);
+
+		/* raise error */
+		ut_error;
+		break;
+
 	case DB_TOO_MANY_CONCURRENT_TRXS:
 		/* Cannot even find a free slot for the
 		the undo log. We can directly exit here
 		and return the DB_TOO_MANY_CONCURRENT_TRXS
 		error. */
 
+	default:
+		/* This is some error we do not expect. Print
+		the error number and rollback transaction */
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr, "InnoDB: unknown error code %lu"
+			" while dropping table:", (ulong) err);
+		ut_print_name(stderr, trx, TRUE, name);
+		fprintf(stderr, ".\n");
+
+		trx->error_state = DB_SUCCESS;
+		trx_rollback_to_savepoint(trx, NULL);
+		trx->error_state = DB_SUCCESS;
+
 		/* Mark all indexes available in the data dictionary
 		cache again. */
 
@@ -3477,18 +3887,6 @@ check_next_foreign:
 			index->to_be_dropped = FALSE;
 			rw_lock_x_unlock(dict_index_get_lock(index));
 		}
-		break;
-
-	case DB_OUT_OF_FILE_SPACE:
-		err = DB_MUST_GET_MORE_FILE_SPACE;
-
-		row_mysql_handle_errors(&err, trx, NULL, NULL);
-
-		/* Fall through to raise error */
-
-	default:
-		/* No other possible error returns */
-		ut_error;
 	}
 
 funct_exit:
@@ -3544,23 +3942,30 @@ row_mysql_drop_temp_tables(void)
 			break;
 		}
 
+		/* The high order bit of N_COLS is set unless
+		ROW_FORMAT=REDUNDANT. */
 		rec = btr_pcur_get_rec(&pcur);
-		field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
-		if (len != 4 || !(mach_read_from_4(field) & 0x80000000UL)) {
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+		if (len != 4
+		    || !(mach_read_from_4(field) & DICT_N_COLS_COMPACT)) {
 			continue;
 		}
 
-		/* Because this is not a ROW_FORMAT=REDUNDANT table,
-		the is_temp flag is valid.  Examine it. */
-
-		field = rec_get_nth_field_old(rec, 7/*MIX_LEN*/, &len);
+		/* Older versions of InnoDB, which only supported tables
+		in ROW_FORMAT=REDUNDANT could write garbage to
+		SYS_TABLES.MIX_LEN, where we now store the is_temp flag.
+		Above, we assumed is_temp=0 if ROW_FORMAT=REDUNDANT. */
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len);
 		if (len != 4
 		    || !(mach_read_from_4(field) & DICT_TF2_TEMPORARY)) {
 			continue;
 		}
 
 		/* This is a temporary table. */
-		field = rec_get_nth_field_old(rec, 0/*NAME*/, &len);
+		field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_TABLES__NAME, &len);
 		if (len == UNIV_SQL_NULL || len == 0) {
 			/* Corrupted SYS_TABLES.NAME */
 			continue;
@@ -3670,21 +4075,23 @@ row_drop_database_for_mysql(
 
 	trx->op_info = "dropping database";
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 loop:
 	row_mysql_lock_data_dictionary(trx);
 
 	while ((table_name = dict_get_first_table_name_in_db(name))) {
 		ut_a(memcmp(table_name, name, namelen) == 0);
 
-		table = dict_table_get_low(table_name);
+		table = dict_table_open_on_name_no_stats(table_name, TRUE,
+							 DICT_ERR_IGNORE_NONE);
 
 		ut_a(table);
+		ut_a(!table->can_be_evicted);
 
 		/* Wait until MySQL does not have any queries running on
 		the table */
 
-		if (table->n_mysql_handles_opened > 0) {
+		if (table->n_ref_count > 0) {
 			row_mysql_unlock_data_dictionary(trx);
 
 			ut_print_timestamp(stderr);
@@ -3826,14 +4233,14 @@ row_rename_table_for_mysql(
 	trx_t*		trx,		/*!< in: transaction handle */
 	ibool		commit)		/*!< in: if TRUE then commit trx */
 {
-	dict_table_t*	table;
+	dict_table_t*	table			= NULL;
+	ibool		dict_locked		= FALSE;
 	ulint		err			= DB_ERROR;
 	mem_heap_t*	heap			= NULL;
 	const char**	constraints_to_drop	= NULL;
 	ulint		n_constraints_to_drop	= 0;
 	ibool		old_is_tmp, new_is_tmp;
 	pars_info_t*	info			= NULL;
-	int		retry;
 
 	ut_a(old_name != NULL);
 	ut_a(new_name != NULL);
@@ -3861,12 +4268,15 @@ row_rename_table_for_mysql(
 	}
 
 	trx->op_info = "renaming table";
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
 	old_is_tmp = row_is_mysql_tmp_table_name(old_name);
 	new_is_tmp = row_is_mysql_tmp_table_name(new_name);
 
-	table = dict_table_get_low(old_name);
+	dict_locked = trx->dict_operation_lock_mode == RW_X_LATCH;
+
+	table = dict_table_open_on_name_no_stats(old_name, dict_locked,
+						 DICT_ERR_IGNORE_NONE);
 
 	if (!table) {
 		err = DB_TABLE_NOT_FOUND;
@@ -3911,36 +4321,17 @@ row_rename_table_for_mysql(
 			&constraints_to_drop);
 
 		if (err != DB_SUCCESS) {
-
 			goto funct_exit;
 		}
 	}
 
-	/* Is a foreign key check running on this table? */
-	for (retry = 0; retry < 100
-	     && table->n_foreign_key_checks_running > 0; ++retry) {
-		row_mysql_unlock_data_dictionary(trx);
-		os_thread_yield();
-		row_mysql_lock_data_dictionary(trx);
-	}
-
-	if (table->n_foreign_key_checks_running > 0) {
-		ut_print_timestamp(stderr);
-		fputs(" InnoDB: Error: in ALTER TABLE ", stderr);
-		ut_print_name(stderr, trx, TRUE, old_name);
-		fprintf(stderr, "\n"
-			"InnoDB: a FOREIGN KEY check is running.\n"
-			"InnoDB: Cannot rename table.\n");
-		err = DB_TABLE_IN_FK_CHECK;
-		goto funct_exit;
-	}
-
 	/* We use the private SQL parser of Innobase to generate the query
 	graphs needed in updating the dictionary data from system tables. */
 
 	info = pars_info_create();
 
 	pars_info_add_str_literal(info, "new_table_name", new_name);
+
 	pars_info_add_str_literal(info, "old_table_name", old_name);
 
 	err = que_eval_sql(info,
@@ -4085,7 +4476,7 @@ end:
 			      "InnoDB: succeed.\n", stderr);
 		}
 		trx->error_state = DB_SUCCESS;
-		trx_general_rollback_for_mysql(trx, NULL);
+		trx_rollback_to_savepoint(trx, NULL);
 		trx->error_state = DB_SUCCESS;
 	} else {
 		/* The following call will also rename the .ibd data file if
@@ -4094,7 +4485,7 @@ end:
 		if (!dict_table_rename_in_cache(table, new_name,
 						!new_is_tmp)) {
 			trx->error_state = DB_SUCCESS;
-			trx_general_rollback_for_mysql(trx, NULL);
+			trx_rollback_to_savepoint(trx, NULL);
 			trx->error_state = DB_SUCCESS;
 			err = DB_ERROR;
 			goto funct_exit;
@@ -4135,13 +4526,17 @@ end:
 			ut_a(dict_table_rename_in_cache(table,
 							old_name, FALSE));
 			trx->error_state = DB_SUCCESS;
-			trx_general_rollback_for_mysql(trx, NULL);
+			trx_rollback_to_savepoint(trx, NULL);
 			trx->error_state = DB_SUCCESS;
 		}
 	}
 
 funct_exit:
 
+	if (table != NULL) {
+		dict_table_close(table, dict_locked);
+	}
+
 	if (commit) {
 		trx_commit_for_mysql(trx);
 	}
@@ -4189,7 +4584,13 @@ row_check_index_for_mysql(
 
 	*n_rows = 0;
 
-	buf = mem_alloc(UNIV_PAGE_SIZE);
+	/* Full Text index are implemented by auxiliary tables,
+	not the B-tree */
+	if (index->type & DICT_FTS) {
+		return(TRUE);
+	}
+
+	buf = static_cast<byte*>(mem_alloc(UNIV_PAGE_SIZE));
 	heap = mem_heap_create(100);
 
 	cnt = 1000;
@@ -4290,7 +4691,9 @@ not_ok:
 				* sizeof *offsets;
 
 			tmp_heap = mem_heap_create(size);
-			offsets = mem_heap_dup(tmp_heap, offsets, size);
+
+			offsets = static_cast<ulint*>(
+				mem_heap_dup(tmp_heap, offsets, size));
 		}
 
 		mem_heap_empty(heap);
@@ -4322,19 +4725,45 @@ row_is_magic_monitor_table(
 	const char*	name; /* table_name without database/ */
 	ulint		len;
 
-	name = strchr(table_name, '/');
-	ut_a(name != NULL);
-	name++;
+	name = dict_remove_db_name(table_name);
 	len = strlen(name) + 1;
 
-	if (STR_EQ(name, len, S_innodb_monitor)
-	    || STR_EQ(name, len, S_innodb_lock_monitor)
-	    || STR_EQ(name, len, S_innodb_tablespace_monitor)
-	    || STR_EQ(name, len, S_innodb_table_monitor)
-	    || STR_EQ(name, len, S_innodb_mem_validate)) {
+	return(STR_EQ(name, len, S_innodb_monitor)
+	       || STR_EQ(name, len, S_innodb_lock_monitor)
+	       || STR_EQ(name, len, S_innodb_tablespace_monitor)
+	       || STR_EQ(name, len, S_innodb_table_monitor)
+#ifdef UNIV_MEM_DEBUG
+	       || STR_EQ(name, len, S_innodb_mem_validate)
+#endif /* UNIV_MEM_DEBUG */
+	       );
+}
 
-		return(TRUE);
-	}
+/*********************************************************************//**
+Initialize this module */
+UNIV_INTERN
+void
+row_mysql_init(void)
+/*================*/
+{
+	mutex_create(
+		row_drop_list_mutex_key,
+	       	&row_drop_list_mutex, SYNC_NO_ORDER_CHECK);
 
-	return(FALSE);
+	UT_LIST_INIT(row_mysql_drop_list);
+
+	row_mysql_drop_list_inited = TRUE;
+}
+
+/*********************************************************************//**
+Close this module */
+UNIV_INTERN
+void
+row_mysql_close(void)
+/*================*/
+{
+	ut_a(UT_LIST_GET_LEN(row_mysql_drop_list) == 0);
+
+	mutex_free(&row_drop_list_mutex);
+
+	row_mysql_drop_list_inited = FALSE;
 }
diff --git a/storage/innobase/row/row0purge.c b/storage/innobase/row/row0purge.cc
index efcfdc3bac5..ab28b396920 100644
--- a/storage/innobase/row/row0purge.c
+++ b/storage/innobase/row/row0purge.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0purge.c
+@file row/row0purge.cc
 Purge obsolete records
 
 Created 3/14/1997 Heikki Tuuri
@@ -43,6 +43,7 @@ Created 3/14/1997 Heikki Tuuri
 #include "row0vers.h"
 #include "row0mysql.h"
 #include "log0log.h"
+#include "srv0mon.h"
 
 /*************************************************************************
 IMPORTANT NOTE: Any operation that generates redo MUST check that there
@@ -61,18 +62,19 @@ UNIV_INTERN
 purge_node_t*
 row_purge_node_create(
 /*==================*/
-	que_thr_t*	parent,	/*!< in: parent node, i.e., a thr node */
-	mem_heap_t*	heap)	/*!< in: memory heap where created */
+	que_thr_t*	parent,		/*!< in: parent node  */
+	mem_heap_t*	heap)		/*!< in: memory heap where created */
 {
 	purge_node_t*	node;
 
 	ut_ad(parent && heap);
 
-	node = mem_heap_alloc(heap, sizeof(purge_node_t));
+	node = static_cast<purge_node_t*>(
+		mem_heap_zalloc(heap, sizeof(*node)));
 
 	node->common.type = QUE_NODE_PURGE;
 	node->common.parent = parent;
-
+	node->done = TRUE;
 	node->heap = mem_heap_create(256);
 
 	return(node);
@@ -90,23 +92,22 @@ row_purge_reposition_pcur(
 	purge_node_t*	node,	/*!< in: row purge node */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	ibool	found;
-
 	if (node->found_clust) {
-		found = btr_pcur_restore_position(mode, &(node->pcur), mtr);
+		ibool	found;
 
-		return(found);
-	}
+		found = btr_pcur_restore_position(mode, &node->pcur, mtr);
 
-	found = row_search_on_row_ref(&(node->pcur), mode, node->table,
-				      node->ref, mtr);
-	node->found_clust = found;
+		return(found);
+	} else {
+		node->found_clust = row_search_on_row_ref(
+			&node->pcur, mode, node->table, node->ref, mtr);
 
-	if (found) {
-		btr_pcur_store_position(&(node->pcur), mtr);
+		if (node->found_clust) {
+			btr_pcur_store_position(&node->pcur, mtr);
+		}
 	}
 
-	return(found);
+	return(node->found_clust);
 }
 
 /***********************************************************//**
@@ -133,7 +134,7 @@ row_purge_remove_clust_if_poss_low(
 
 	index = dict_table_get_first_index(node->table);
 
-	pcur = &(node->pcur);
+	pcur = &node->pcur;
 	btr_cur = btr_pcur_get_btr_cur(pcur);
 
 	log_free_check();
@@ -370,7 +371,7 @@ row_purge_remove_sec_if_poss_leaf(
 	pcur.btr_cur.purge_node = node;
 	/* Set the query thread, so that ibuf_insert_low() will be
 	able to invoke thd_get_trx(). */
-	pcur.btr_cur.thr = que_node_get_parent(node);
+	pcur.btr_cur.thr = static_cast<que_thr_t*>(que_node_get_parent(node));
 
 	search_result = row_search_index_entry(
 		index, entry, BTR_MODIFY_LEAF | BTR_DELETE, &pcur, &mtr);
@@ -478,10 +479,12 @@ row_purge_del_mark(
 
 		index = node->index;
 
-		/* Build the index entry */
-		entry = row_build_index_entry(node->row, NULL, index, heap);
-		ut_a(entry);
-		row_purge_remove_sec_if_poss(node, index, entry);
+		if (node->index->type != DICT_FTS) {
+			/* Build the index entry */
+			entry = row_build_index_entry(node->row, NULL, index, heap);
+			ut_a(entry);
+			row_purge_remove_sec_if_poss(node, index, entry);
+		}
 
 		node->index = dict_table_get_next_index(node->index);
 	}
@@ -499,9 +502,10 @@ void
 row_purge_upd_exist_or_extern_func(
 /*===============================*/
 #ifdef UNIV_DEBUG
-	const que_thr_t*thr,	/*!< in: query thread */
+	const que_thr_t*thr,		/*!< in: query thread */
 #endif /* UNIV_DEBUG */
-	purge_node_t*	node)	/*!< in: row purge node */
+	purge_node_t*	node,		/*!< in: row purge node */
+	trx_undo_rec_t*	undo_rec)	/*!< in: record to purge */
 {
 	mem_heap_t*	heap;
 	dtuple_t*	entry;
@@ -554,12 +558,13 @@ skip_secondaries:
 			= upd_get_nth_field(node->update, i);
 
 		if (dfield_is_ext(&ufield->new_val)) {
+			trx_rseg_t*	rseg;
 			buf_block_t*	block;
 			ulint		internal_offset;
 			byte*		data_field;
 
 			/* We use the fact that new_val points to
-			node->undo_rec and get thus the offset of
+			undo_rec and get thus the offset of
 			dfield data inside the undo record. Then we
 			can calculate from node->roll_ptr the file
 			address of the new_val data */
@@ -567,13 +572,18 @@ skip_secondaries:
 			internal_offset
 				= ((const byte*)
 				   dfield_get_data(&ufield->new_val))
-				- node->undo_rec;
+				- undo_rec;
 
 			ut_a(internal_offset < UNIV_PAGE_SIZE);
 
 			trx_undo_decode_roll_ptr(node->roll_ptr,
 						 &is_insert, &rseg_id,
 						 &page_no, &offset);
+
+			rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
+			ut_a(rseg != NULL);
+			ut_a(rseg->id == rseg_id);
+
 			mtr_start(&mtr);
 
 			/* We have to acquire an X-latch to the clustered
@@ -594,10 +604,9 @@ skip_secondaries:
 
 			btr_root_get(index, &mtr);
 
-			/* We assume in purge of externally stored fields
-			that the space id of the undo log record is 0! */
+			block = buf_page_get(
+				rseg->space, 0, page_no, RW_X_LATCH, &mtr);
 
-			block = buf_page_get(0, 0, page_no, RW_X_LATCH, &mtr);
 			buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
 
 			data_field = buf_block_get_frame(block)
@@ -616,26 +625,25 @@ skip_secondaries:
 }
 
 #ifdef UNIV_DEBUG
-# define row_purge_upd_exist_or_extern(thr,node)	\
-	row_purge_upd_exist_or_extern_func(thr,node)
+# define row_purge_upd_exist_or_extern(thr,node,undo_rec)	\
+	row_purge_upd_exist_or_extern_func(thr,node,undo_rec)
 #else /* UNIV_DEBUG */
-# define row_purge_upd_exist_or_extern(thr,node)	\
-	row_purge_upd_exist_or_extern_func(node)
+# define row_purge_upd_exist_or_extern(thr,node,undo_rec)	\
+	row_purge_upd_exist_or_extern_func(node,undo_rec)
 #endif /* UNIV_DEBUG */
 
 /***********************************************************//**
 Parses the row reference and other info in a modify undo log record.
-@return TRUE if purge operation required: NOTE that then the CALLER
-must unfreeze data dictionary! */
+@return TRUE if purge operation required */
 static
 ibool
 row_purge_parse_undo_rec(
 /*=====================*/
-	purge_node_t*	node,	/*!< in: row undo node */
-	ibool*		updated_extern,
-				/*!< out: TRUE if an externally stored field
-				was updated */
-	que_thr_t*	thr)	/*!< in: query thread */
+	purge_node_t*		node,		/*!< in: row undo node */
+	trx_undo_rec_t*		undo_rec,	/*!< in: record to purge */
+	ibool*			updated_extern, /*!< out: TRUE if an externally
+						stored field was updated */
+	que_thr_t*		thr)		/*!< in: query thread */
 {
 	dict_index_t*	clust_index;
 	byte*		ptr;
@@ -649,14 +657,13 @@ row_purge_parse_undo_rec(
 
 	ut_ad(node && thr);
 
-	trx = thr_get_trx(thr);
-
 	ptr = trx_undo_rec_get_pars(
-		node->undo_rec, &type, &node->cmpl_info,
+		undo_rec, &type, &node->cmpl_info,
 		updated_extern, &undo_no, &table_id);
+
 	node->rec_type = type;
 
-	if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) {
+	if (type == TRX_UNDO_UPD_DEL_REC && !*updated_extern) {
 
 		return(FALSE);
 	}
@@ -677,24 +684,22 @@ row_purge_parse_undo_rec(
 	/* Prevent DROP TABLE etc. from running when we are doing the purge
 	for this row */
 
-	row_mysql_freeze_data_dictionary(trx);
+	rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__);
 
-	mutex_enter(&(dict_sys->mutex));
-
-	node->table = dict_table_get_on_id_low(table_id);
-
-	mutex_exit(&(dict_sys->mutex));
+	node->table = dict_table_open_on_id(table_id, FALSE);
 
 	if (node->table == NULL) {
-		/* The table has been dropped: no need to do purge */
 err_exit:
-		row_mysql_unfreeze_data_dictionary(trx);
+		/* The table has been dropped: no need to do purge */
+		rw_lock_s_unlock_gen(&dict_operation_lock, 0);
 		return(FALSE);
 	}
 
 	if (node->table->ibd_file_missing) {
 		/* We skip purge of missing .ibd files */
 
+		dict_table_close(node->table, FALSE);
+
 		node->table = NULL;
 
 		goto err_exit;
@@ -703,6 +708,9 @@ err_exit:
 	clust_index = dict_table_get_first_index(node->table);
 
 	if (clust_index == NULL) {
+
+		dict_table_close(node->table, FALSE);
+
 		/* The table was corrupt in the data dictionary */
 
 		goto err_exit;
@@ -711,6 +719,8 @@ err_exit:
 	ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
 				       node->heap);
 
+	trx = thr_get_trx(thr);
+
 	ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
 					     roll_ptr, info_bits, trx,
 					     node->heap, &(node->update));
@@ -728,6 +738,61 @@ err_exit:
 }
 
 /***********************************************************//**
+Purges the parsed record. */
+static
+void
+row_purge_record_func(
+/*==================*/
+	purge_node_t*	node,		/*!< in: row purge node */
+	trx_undo_rec_t*	undo_rec,	/*!< in: record to purge */
+#ifdef UNIV_DEBUG
+	const que_thr_t*thr,		/*!< in: query thread */
+#endif /* UNIV_DEBUG */
+	ibool		updated_extern)	/*!< in: TRUE if external columns
+					were updated */
+{
+	dict_index_t*	clust_index;
+
+	clust_index = dict_table_get_first_index(node->table);
+
+	node->index = dict_table_get_next_index(clust_index);
+
+	switch (node->rec_type) {
+	case TRX_UNDO_DEL_MARK_REC:
+		row_purge_del_mark(node);
+		MONITOR_INC(MONITOR_N_DEL_ROW_PURGE);
+		break;
+	default:
+		if (!updated_extern) {
+			break;
+		}
+		/* fall through */
+	case TRX_UNDO_UPD_EXIST_REC:
+		row_purge_upd_exist_or_extern(thr, node, undo_rec);
+		MONITOR_INC(MONITOR_N_UPD_EXIST_EXTERN);
+		break;
+	}
+
+	if (node->found_clust) {
+		btr_pcur_close(&node->pcur);
+	}
+
+	if (node->table != NULL) {
+		dict_table_close(node->table, FALSE);
+		node->table = NULL;
+	}
+
+}
+
+#ifdef UNIV_DEBUG
+# define row_purge_record(node,undo_rec,thr,updated_extern)	\
+	row_purge_record_func(node,undo_rec,thr,updated_extern)
+#else /* UNIV_DEBUG */
+# define row_purge_record(node,undo_rec,thr,updated_extern)	\
+	row_purge_record_func(node,undo_rec,updated_extern)
+#endif /* UNIV_DEBUG */
+
+/***********************************************************//**
 Fetches an undo log record and does the purge for the recorded operation.
 If none left, or the current purge completed, returns the control to the
 parent node, which is always a query thread node. */
@@ -735,53 +800,51 @@ static __attribute__((nonnull))
 void
 row_purge(
 /*======*/
-	purge_node_t*	node,	/*!< in: row purge node */
-	que_thr_t*	thr)	/*!< in: query thread */
+	purge_node_t*	node,		/*!< in: row purge node */
+	trx_undo_rec_t*	undo_rec,	/*!< in: record to purge */
+	que_thr_t*	thr)		/*!< in: query thread */
 {
-	ibool		updated_extern;
-
 	ut_ad(node);
 	ut_ad(thr);
 
-	node->undo_rec = trx_purge_fetch_next_rec(&node->roll_ptr,
-						  &node->reservation,
-						  node->heap);
-	if (!node->undo_rec) {
-		/* Purge completed for this query thread */
+	if (undo_rec != &trx_purge_dummy_rec) {
+		ibool	updated_extern;
 
-		thr->run_node = que_node_get_parent(node);
+		if (row_purge_parse_undo_rec(
+			node, undo_rec, &updated_extern, thr)) {
 
-		return;
+			row_purge_record(node, undo_rec, thr, updated_extern);
+
+			rw_lock_s_unlock_gen(&dict_operation_lock, 0);
+		}
 	}
+}
+
+/***********************************************************//**
+Reset the purge query thread. */
+UNIV_INLINE
+void
+row_purge_end(
+/*==========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	purge_node_t*	node;
 
-	if (node->undo_rec != &trx_purge_dummy_rec
-	    && row_purge_parse_undo_rec(node, &updated_extern, thr)) {
-		node->found_clust = FALSE;
+	ut_ad(thr);
 
-		node->index = dict_table_get_next_index(
-			dict_table_get_first_index(node->table));
+	node = static_cast<purge_node_t*>(thr->run_node);
 
-		if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
-			row_purge_del_mark(node);
+	ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
 
-		} else if (updated_extern
-			   || node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+	thr->run_node = que_node_get_parent(node);
 
-			row_purge_upd_exist_or_extern(thr, node);
-		}
+	node->undo_recs = NULL;
 
-		if (node->found_clust) {
-			btr_pcur_close(&(node->pcur));
-		}
+	node->done = TRUE;
 
-		row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
-	}
+	ut_a(thr->run_node != NULL);
 
-	/* Do some cleanup */
-	trx_purge_rec_release(node->reservation);
 	mem_heap_empty(node->heap);
-
-	thr->run_node = node;
 }
 
 /***********************************************************//**
@@ -798,11 +861,39 @@ row_purge_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<purge_node_t*>(thr->run_node);
+
+	node->table = NULL;
+	node->row = NULL;
+	node->ref = NULL;
+	node->index = NULL;
+	node->update = NULL;
+	node->found_clust = FALSE;
+	node->rec_type = ULINT_UNDEFINED;
+	node->cmpl_info = ULINT_UNDEFINED;
+
+	ut_a(!node->done);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
 
-	row_purge(node, thr);
+	if (!(node->undo_recs == NULL || ib_vector_is_empty(node->undo_recs))) {
+		trx_purge_rec_t*purge_rec;
+
+		purge_rec = static_cast<trx_purge_rec_t*>(
+			ib_vector_pop(node->undo_recs));
+
+		node->roll_ptr = purge_rec->roll_ptr;
+
+		row_purge(node, purge_rec->undo_rec, thr);
+
+		if (ib_vector_is_empty(node->undo_recs)) {
+			row_purge_end(thr);
+		} else {
+			thr->run_node = node;
+		}
+	} else {
+		row_purge_end(thr);
+	}
 
 	return(thr);
 }
diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.cc
index c15e2bbf739..8c703b1e06c 100644
--- a/storage/innobase/row/row0row.c
+++ b/storage/innobase/row/row0row.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0row.c
+@file row/row0row.cc
 General row routines
 
 Created 4/20/1996 Heikki Tuuri
@@ -75,7 +75,7 @@ row_build_index_entry(
 	entry_len = dict_index_get_n_fields(index);
 	entry = dtuple_create(heap, entry_len);
 
-	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+	if (dict_index_is_univ(index)) {
 		dtuple_set_n_fields_cmp(entry, entry_len);
 		/* There may only be externally stored columns
 		in a clustered index B-tree of a user table. */
@@ -124,7 +124,7 @@ row_build_index_entry(
 		stored off-page. */
 		ut_ad(col->ord_part);
 
-		if (UNIV_LIKELY_NULL(ext)) {
+		if (ext) {
 			/* See if the column is stored externally. */
 			const byte*	buf = row_ext_lookup(ext, col_no,
 							     &len);
@@ -166,7 +166,7 @@ row_build_index_entry(
 			len = dtype_get_at_most_n_mbchars(
 				col->prtype, col->mbminmaxlen,
 				ind_field->prefix_len, len,
-				dfield_get_data(dfield));
+				static_cast<char*>(dfield_get_data(dfield)));
 			dfield_set_len(dfield, len);
 		}
 	}
@@ -233,6 +233,7 @@ row_build(
 
 	ut_ad(index && rec && heap);
 	ut_ad(dict_index_is_clust(index));
+	ut_ad(!mutex_own(&trx_sys->mutex));
 
 	if (!offsets) {
 		offsets = rec_get_offsets(rec, index, offsets_,
@@ -241,7 +242,7 @@ row_build(
 		ut_ad(rec_offs_validate(rec, index, offsets));
 	}
 
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 	if (rec_offs_any_null_extern(rec, offsets)) {
 		/* This condition can occur during crash recovery
 		before trx_rollback_active() has completed execution,
@@ -252,11 +253,13 @@ row_build(
 		ut_a(trx_undo_roll_ptr_is_insert(
 			     row_get_rec_roll_ptr(rec, index, offsets)));
 	}
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 	if (type != ROW_COPY_POINTERS) {
 		/* Take a copy of rec to heap */
-		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+		buf = static_cast<byte*>(
+			mem_heap_alloc(heap, rec_offs_size(offsets)));
+
 		rec = rec_copy(buf, rec, offsets);
 		/* Avoid a debug assertion in rec_offs_validate(). */
 		rec_offs_make_valid(rec, index, (ulint*) offsets);
@@ -275,7 +278,8 @@ row_build(
 	n_fields = rec_offs_n_fields(offsets);
 	n_ext_cols = rec_offs_n_extern(offsets);
 	if (n_ext_cols) {
-		ext_cols = mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols);
+		ext_cols = static_cast<ulint*>(
+			mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols));
 	}
 
 	for (i = j = 0; i < n_fields; i++) {
@@ -322,7 +326,7 @@ row_build(
 		768-byte prefix of each externally stored
 		column. No cache is needed. */
 		ut_ad(dict_table_get_format(index->table)
-		      < DICT_TF_FORMAT_ZIP);
+		      < UNIV_FORMAT_B);
 	} else if (j) {
 		*ext = row_ext_create(j, ext_cols, index->table->flags, row,
 				      heap);
@@ -361,7 +365,7 @@ row_rec_to_index_entry_low(
 	ulint		rec_len;
 
 	ut_ad(rec && heap && index);
-	/* Because this function may be invoked by row0merge.c
+	/* Because this function may be invoked by row0merge.cc
 	on a record whose header is in different format, the check
 	rec_offs_validate(rec, index, offsets) must be avoided here. */
 	ut_ad(n_ext);
@@ -433,14 +437,16 @@ row_rec_to_index_entry(
 
 	if (type == ROW_COPY_DATA) {
 		/* Take a copy of rec to heap */
-		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+		buf = static_cast<byte*>(
+			mem_heap_alloc(heap, rec_offs_size(offsets)));
+
 		rec = rec_copy(buf, rec, offsets);
 		/* Avoid a debug assertion in rec_offs_validate(). */
 		rec_offs_make_valid(rec, index, offsets);
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 	} else {
 		ut_a(!rec_offs_any_null_extern(rec, offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 	}
 
 	entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap);
@@ -501,7 +507,8 @@ row_build_row_ref(
 	if (type == ROW_COPY_DATA) {
 		/* Take a copy of rec to heap */
 
-		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+		buf = static_cast<byte*>(
+			mem_heap_alloc(heap, rec_offs_size(offsets)));
 
 		rec = rec_copy(buf, rec, offsets);
 		/* Avoid a debug assertion in rec_offs_validate(). */
@@ -813,8 +820,6 @@ row_search_index_entry(
 	return(ROW_FOUND);
 }
 
-#include <my_sys.h>
-
 /*******************************************************************//**
 Formats the raw data in "data" (in InnoDB on-disk format) that is of
 type DATA_INT using "prtype" and writes the result to "buf".
@@ -842,24 +847,17 @@ row_raw_format_int(
 {
 	ulint	ret;
 
-	if (data_len <= sizeof(ullint)) {
+	if (data_len <= sizeof(ib_uint64_t)) {
 
-		ullint		value;
+		ib_uint64_t	value;
 		ibool		unsigned_type = prtype & DATA_UNSIGNED;
 
-		value = mach_read_int_type((const byte*) data,
-					   data_len, unsigned_type);
-
-		if (unsigned_type) {
-
-			ret = ut_snprintf(buf, buf_size, "%llu",
-					  value) + 1;
-		} else {
-
-			ret = ut_snprintf(buf, buf_size, "%lld",
-					  (long long) value) + 1;
-		}
+		value = mach_read_int_type(
+			(const byte*) data, data_len, unsigned_type);
 
+		ret = ut_snprintf(
+			buf, buf_size,
+			unsigned_type ? UINT64PF : INT64PF, value) + 1;
 	} else {
 
 		*format_in_hex = TRUE;
@@ -1019,6 +1017,8 @@ test_row_raw_format_int()
 	ulint	ret;
 	char	buf[128];
 	ibool	format_in_hex;
+	speedo_t speedo;
+	ulint	i;
 
 #define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\
 		      ret_expected, buf_expected, format_in_hex_expected)\
@@ -1201,9 +1201,6 @@ test_row_raw_format_int()
 
 	/* speed test */
 
-	speedo_t	speedo;
-	ulint		i;
-
 	speedo_reset(&speedo);
 
 	for (i = 0; i < 1000000; i++) {
diff --git a/storage/innobase/row/row0sel.c b/storage/innobase/row/row0sel.cc
index 7bec0a26225..599b8df68a4 100644
--- a/storage/innobase/row/row0sel.c
+++ b/storage/innobase/row/row0sel.cc
@@ -18,13 +18,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /***************************************************//**
-@file row/row0sel.c
+@file row/row0sel.cc
 Select
 
 Created 12/19/1997 Heikki Tuuri
@@ -57,6 +57,9 @@ Created 12/19/1997 Heikki Tuuri
 #include "read0read.h"
 #include "buf0lru.h"
 #include "ha_prototypes.h"
+#include "srv0mon.h"
+
+#include "my_compare.h" /* enum icp_result */
 
 /* Maximum number of rows to prefetch; MySQL interface has another parameter */
 #define SEL_MAX_N_PREFETCH	16
@@ -105,12 +108,12 @@ row_sel_sec_rec_is_for_blob(
 {
 	ulint	len;
 	byte	buf[REC_VERSION_56_MAX_INDEX_COL_LEN];
-	ulint	zip_size = dict_table_flags_to_zip_size(table->flags);
+	ulint	zip_size = dict_tf_get_zip_size(table->flags);
 
 	/* This function should never be invoked on an Antelope format
 	table, because they should always contain enough prefix in the
 	clustered index record. */
-	ut_ad(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
+	ut_ad(dict_table_get_format(table) >= UNIV_FORMAT_B);
 	ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
 	ut_ad(prefix_len >= sec_len);
 	ut_ad(prefix_len > 0);
@@ -268,7 +271,9 @@ sel_node_create(
 {
 	sel_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(sel_node_t));
+	node = static_cast<sel_node_t*>(
+		mem_heap_alloc(heap, sizeof(sel_node_t)));
+
 	node->common.type = QUE_NODE_SELECT;
 	node->state = SEL_NODE_OPEN;
 
@@ -330,7 +335,8 @@ UNIV_INLINE
 void
 sel_assign_into_var_values(
 /*=======================*/
-	sym_node_t*	var,	/*!< in: first variable in a list of variables */
+	sym_node_t*	var,	/*!< in: first variable in a list of
+				variables */
 	sel_node_t*	node)	/*!< in: select node */
 {
 	que_node_t*	exp;
@@ -340,15 +346,15 @@ sel_assign_into_var_values(
 		return;
 	}
 
-	exp = node->select_list;
+	for (exp = node->select_list;
+	     var != 0;
+	     var = static_cast<sym_node_t*>(que_node_get_next(var))) {
 
-	while (var) {
 		ut_ad(exp);
 
 		eval_node_copy_val(var->alias, exp);
 
 		exp = que_node_get_next(exp);
-		var = que_node_get_next(var);
 	}
 }
 
@@ -365,12 +371,12 @@ sel_reset_aggregate_vals(
 
 	ut_ad(node->is_aggregate);
 
-	func_node = node->select_list;
+	for (func_node = static_cast<func_node_t*>(node->select_list);
+	     func_node != 0;
+	     func_node = static_cast<func_node_t*>(
+		     	que_node_get_next(func_node))) {
 
-	while (func_node) {
 		eval_node_set_int_val(func_node, 0);
-
-		func_node = que_node_get_next(func_node);
 	}
 
 	node->aggregate_already_fetched = FALSE;
@@ -496,8 +502,9 @@ sel_col_prefetch_buf_alloc(
 
 	ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
 
-	column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
-					 * sizeof(sel_buf_t));
+	column->prefetch_buf = static_cast<sel_buf_t*>(
+		mem_alloc(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t)));
+
 	for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
 		sel_buf = column->prefetch_buf + i;
 
@@ -536,8 +543,8 @@ Pops the column values for a prefetched, cached row from the column prefetch
 buffers and places them to the val fields in the column nodes. */
 static
 void
-sel_pop_prefetched_row(
-/*===================*/
+sel_dequeue_prefetched_row(
+/*=======================*/
 	plan_t*	plan)	/*!< in: plan node for a table */
 {
 	sym_node_t*	column;
@@ -578,7 +585,7 @@ sel_pop_prefetched_row(
 		column values to be able to free it later: therefore
 		we swap the values for sel_buf and val */
 
-		sel_buf->data = dfield_get_data(val);
+		sel_buf->data = static_cast<byte*>(dfield_get_data(val));
 		sel_buf->len = dfield_get_len(val);
 		sel_buf->val_buf_size = que_node_get_val_buf_size(column);
 
@@ -598,8 +605,8 @@ Pushes the column values for a prefetched, cached row to the column prefetch
 buffers from the val fields in the column nodes. */
 UNIV_INLINE
 void
-sel_push_prefetched_row(
-/*====================*/
+sel_enqueue_prefetched_row(
+/*=======================*/
 	plan_t*	plan)	/*!< in: plan node for a table */
 {
 	sym_node_t*	column;
@@ -626,14 +633,14 @@ sel_push_prefetched_row(
 
 	ut_ad(pos < SEL_MAX_N_PREFETCH);
 
-	column = UT_LIST_GET_FIRST(plan->columns);
+	for (column = UT_LIST_GET_FIRST(plan->columns);
+	     column != 0;
+	     column = UT_LIST_GET_NEXT(col_var_list, column)) {
 
-	while (column) {
 		if (!column->copy_val) {
 			/* There is no sense to push pointers to database
 			page fields when we do not keep latch on the page! */
-
-			goto next_col;
+			continue;
 		}
 
 		if (!column->prefetch_buf) {
@@ -646,7 +653,7 @@ sel_push_prefetched_row(
 
 		val = que_node_get_val(column);
 
-		data = dfield_get_data(val);
+		data = static_cast<byte*>(dfield_get_data(val));
 		len = dfield_get_len(val);
 		val_buf_size = que_node_get_val_buf_size(column);
 
@@ -660,8 +667,6 @@ sel_push_prefetched_row(
 		sel_buf->data = data;
 		sel_buf->len = len;
 		sel_buf->val_buf_size = val_buf_size;
-next_col:
-		column = UT_LIST_GET_NEXT(col_var_list, column);
 	}
 }
 
@@ -752,13 +757,14 @@ row_sel_test_end_conds(
 	/* All conditions in end_conds are comparisons of a column to an
 	expression */
 
-	cond = UT_LIST_GET_FIRST(plan->end_conds);
+	for (cond = UT_LIST_GET_FIRST(plan->end_conds);
+	     cond != 0;
+	     cond = UT_LIST_GET_NEXT(cond_list, cond)) {
 
-	while (cond) {
 		/* Evaluate the left side of the comparison, i.e., get the
 		column value if there is an indirection */
 
-		eval_sym(cond->args);
+		eval_sym(static_cast<sym_node_t*>(cond->args));
 
 		/* Do the comparison */
 
@@ -766,8 +772,6 @@ row_sel_test_end_conds(
 
 			return(FALSE);
 		}
-
-		cond = UT_LIST_GET_NEXT(cond_list, cond);
 	}
 
 	return(TRUE);
@@ -858,7 +862,7 @@ row_sel_get_clust_rec(
 		ut_a(node->read_view);
 
 		/* In a rare case it is possible that no clust rec is found
-		for a delete-marked secondary index record: if in row0umod.c
+		for a delete-marked secondary index record: if in row0umod.cc
 		in row_undo_mod_remove_clust_low() we have already removed
 		the clust rec, while purge is still cleaning and removing
 		secondary index records associated with earlier versions of
@@ -894,7 +898,9 @@ row_sel_get_clust_rec(
 		err = lock_clust_rec_read_check_and_lock(
 			0, btr_pcur_get_block(&plan->clust_pcur),
 			clust_rec, index, offsets,
-			node->row_lock_mode, lock_type, thr);
+			static_cast<enum lock_mode>(node->row_lock_mode),
+			lock_type,
+			thr);
 
 		switch (err) {
 		case DB_SUCCESS:
@@ -993,7 +999,7 @@ sel_set_rec_lock(
 
 	trx = thr_get_trx(thr);
 
-	if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
+	if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000) {
 		if (buf_LRU_buf_pool_running_out()) {
 
 			return(DB_LOCK_TABLE_FULL);
@@ -1002,10 +1008,12 @@ sel_set_rec_lock(
 
 	if (dict_index_is_clust(index)) {
 		err = lock_clust_rec_read_check_and_lock(
-			0, block, rec, index, offsets, mode, type, thr);
+			0, block, rec, index, offsets,
+			static_cast<enum lock_mode>(mode), type, thr);
 	} else {
 		err = lock_sec_rec_read_check_and_lock(
-			0, block, rec, index, offsets, mode, type, thr);
+			0, block, rec, index, offsets,
+			static_cast<enum lock_mode>(mode), type, thr);
 	}
 
 	return(err);
@@ -1206,6 +1214,9 @@ row_sel_try_search_shortcut(
 	sel_node_t*	node,	/*!< in: select node for a consistent read */
 	plan_t*		plan,	/*!< in: plan for a unique search in clustered
 				index */
+	ibool		search_latch_locked,
+				/*!< in: whether the search holds
+				btr_search_latch */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
 	dict_index_t*	index;
@@ -1222,10 +1233,12 @@ row_sel_try_search_shortcut(
 	ut_ad(plan->unique_search);
 	ut_ad(!plan->must_get_clust);
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+	if (search_latch_locked) {
+		ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+	}
 #endif /* UNIV_SYNC_DEBUG */
 
-	row_sel_open_pcur(plan, TRUE, mtr);
+	row_sel_open_pcur(plan, search_latch_locked, mtr);
 
 	rec = btr_pcur_get_rec(&(plan->pcur));
 
@@ -1370,7 +1383,7 @@ table_loop:
 	index = plan->index;
 
 	if (plan->n_rows_prefetched > 0) {
-		sel_pop_prefetched_row(plan);
+		sel_dequeue_prefetched_row(plan);
 
 		goto next_table_no_mtr;
 	}
@@ -1409,7 +1422,9 @@ table_loop:
 			rw_lock_s_lock(&btr_search_latch);
 		}
 
-		found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
+		found_flag = row_sel_try_search_shortcut(node, plan,
+							 search_latch_locked,
+							 &mtr);
 
 		if (found_flag == SEL_FOUND) {
 
@@ -1812,13 +1827,13 @@ skip_lock:
 		goto next_table;
 	}
 
-	sel_push_prefetched_row(plan);
+	sel_enqueue_prefetched_row(plan);
 
 	if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
 
 		/* The prefetch buffer is now full */
 
-		sel_pop_prefetched_row(plan);
+		sel_dequeue_prefetched_row(plan);
 
 		goto next_table;
 	}
@@ -1917,7 +1932,7 @@ table_exhausted:
 	if (plan->n_rows_prefetched > 0) {
 		/* The table became exhausted during a prefetch */
 
-		sel_pop_prefetched_row(plan);
+		sel_dequeue_prefetched_row(plan);
 
 		goto next_table_no_mtr;
 	}
@@ -2023,14 +2038,11 @@ row_sel_step(
 /*=========*/
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	ulint		i_lock_mode;
-	sym_node_t*	table_node;
 	sel_node_t*	node;
-	ulint		err;
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<sel_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
 
@@ -2048,7 +2060,7 @@ row_sel_step(
 		/* It may be that the current session has not yet started
 		its transaction, or it has been committed: */
 
-		trx_start_if_not_started(thr_get_trx(thr));
+		trx_start_if_not_started_xa(thr_get_trx(thr));
 
 		plan_reset_cursor(sel_node_get_nth_plan(node, 0));
 
@@ -2057,24 +2069,34 @@ row_sel_step(
 			node->read_view = trx_assign_read_view(
 				thr_get_trx(thr));
 		} else {
+			sym_node_t*	table_node;
+			enum lock_mode	i_lock_mode;
+
 			if (node->set_x_locks) {
 				i_lock_mode = LOCK_IX;
 			} else {
 				i_lock_mode = LOCK_IS;
 			}
 
-			table_node = node->table_list;
+			for (table_node = node->table_list;
+			     table_node != 0;
+			     table_node = static_cast<sym_node_t*>(
+					que_node_get_next(table_node))) {
+
+				enum db_err	err;
+
+				err = static_cast<enum db_err>(lock_table(
+					0, table_node->table, i_lock_mode,
+					thr));
 
-			while (table_node) {
-				err = lock_table(0, table_node->table,
-						 i_lock_mode, thr);
 				if (err != DB_SUCCESS) {
-					thr_get_trx(thr)->error_state = err;
+					trx_t*	trx;
+
+					trx = thr_get_trx(thr);
+					trx->error_state = err;
 
 					return(NULL);
 				}
-
-				table_node = que_node_get_next(table_node);
 			}
 		}
 
@@ -2098,7 +2120,7 @@ row_sel_step(
 		}
 	}
 
-	err = row_sel(node, thr);
+	enum db_err err = static_cast<enum db_err>(row_sel(node, thr));
 
 	/* NOTE! if queries are parallelized, the following assignment may
 	have problems; the assignment should be made only if thr is the
@@ -2129,7 +2151,7 @@ fetch_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<fetch_node_t*>(thr->run_node);
 	sel_node = node->cursor_def;
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
@@ -2142,12 +2164,12 @@ fetch_step(
 				sel_assign_into_var_values(node->into_list,
 							   sel_node);
 			} else {
-				void* ret = (*node->func->func)(
+				ibool ret = (*node->func->func)(
 					sel_node, node->func->arg);
 
 				if (!ret) {
 					sel_node->state
-						= SEL_NODE_NO_MORE_ROWS;
+						 = SEL_NODE_NO_MORE_ROWS;
 				}
 			}
 		}
@@ -2188,21 +2210,22 @@ row_fetch_print(
 	void*	row,		/*!< in:  sel_node_t* */
 	void*	user_arg)	/*!< in:  not used */
 {
-	sel_node_t*	node = row;
 	que_node_t*	exp;
 	ulint		i = 0;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
 
 	UT_NOT_USED(user_arg);
 
 	fprintf(stderr, "row_fetch_print: row %p\n", row);
 
-	exp = node->select_list;
+	for (exp = node->select_list;
+	     exp != 0;
+	     exp = que_node_get_next(exp), i++) {
 
-	while (exp) {
 		dfield_t*	dfield = que_node_get_val(exp);
 		const dtype_t*	type = dfield_get_type(dfield);
 
-		fprintf(stderr, " column %lu:\n", (ulong)i);
+		fprintf(stderr, " column %lu:\n", (ulong) i);
 
 		dtype_print(type);
 		putc('\n', stderr);
@@ -2214,9 +2237,6 @@ row_fetch_print(
 		} else {
 			fputs(" <NULL>;\n", stderr);
 		}
-
-		exp = que_node_get_next(exp);
-		i++;
 	}
 
 	return((void*)42);
@@ -2237,7 +2257,7 @@ row_printf_step(
 
 	ut_ad(thr);
 
-	node = thr->run_node;
+	node = static_cast<row_printf_node_t*>(thr->run_node);
 
 	sel_node = node->sel_node;
 
@@ -2285,6 +2305,42 @@ row_printf_step(
 	return(thr);
 }
 
+/********************************************************************
+Creates a key in Innobase dtuple format.*/
+
+void
+row_create_key(
+/*===========*/
+	dtuple_t*	tuple,		/* in: tuple where to build;
+					NOTE: we assume that the type info
+					in the tuple is already according
+					to index! */
+	dict_index_t*	index,		/* in: index of the key value */
+	doc_id_t*	doc_id)		/* in: doc id to search. */
+{
+	dtype_t		type;
+	dict_field_t*	field;
+	doc_id_t	temp_doc_id;
+	dfield_t*	dfield = dtuple_get_nth_field(tuple, 0);
+
+	ut_a(dict_index_get_n_unique(index) == 1);
+
+	/* Permit us to access any field in the tuple (ULINT_MAX): */
+	dtuple_set_n_fields(tuple, ULINT_MAX);
+
+	field = dict_index_get_nth_field(index, 0);
+	dict_col_copy_type(field->col, &type);
+	ut_a(dtype_get_mtype(&type) == DATA_INT);
+
+	/* Convert to storage byte order */
+	mach_write_to_8((byte*) &temp_doc_id, *doc_id);
+	*doc_id = temp_doc_id;
+
+	ut_a(sizeof(*doc_id) == field->fixed_len);
+	dfield_set_data(dfield, doc_id, field->fixed_len);
+
+	dtuple_set_n_fields(tuple, 1);
+}
 /****************************************************************//**
 Converts a key value stored in MySQL format to an Innobase dtuple. The last
 field of the key value may be just a prefix of a fixed length field: hence
@@ -2533,13 +2589,23 @@ row_sel_store_row_id_to_prebuilt(
 	ut_memcpy(prebuilt->row_id, data, len);
 }
 
+#ifdef UNIV_DEBUG
+/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+	row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len)
+#else /* UNIV_DEBUG */
+/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+	row_sel_field_store_in_mysql_format_func(dest,templ,src,len)
+#endif /* UNIV_DEBUG */
+
 /**************************************************************//**
 Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
-function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
-static
+function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. */
+static __attribute__((nonnull))
 void
-row_sel_field_store_in_mysql_format(
-/*================================*/
+row_sel_field_store_in_mysql_format_func(
+/*=====================================*/
 	byte*		dest,	/*!< in/out: buffer where to store; NOTE
 				that BLOBs are not in themselves
 				stored here: the caller must allocate
@@ -2551,10 +2617,22 @@ row_sel_field_store_in_mysql_format(
 				Its following fields are referenced:
 				type, is_unsigned, mysql_col_len,
 				mbminlen, mbmaxlen */
+#ifdef UNIV_DEBUG
+	const dict_index_t* index,
+				/*!< in: InnoDB index */
+	ulint		field_no,
+				/*!< in: templ->rec_field_no or
+				templ->clust_rec_field_no or
+				templ->icp_rec_field_no */
+#endif /* UNIV_DEBUG */
 	const byte*	data,	/*!< in: data to store */
 	ulint		len)	/*!< in: length of the data */
 {
-	byte*	ptr;
+	byte*			ptr;
+#ifdef UNIV_DEBUG
+	const dict_field_t*	field
+		= dict_index_get_nth_field(index, field_no);
+#endif /* UNIV_DEBUG */
 
 	ut_ad(len != UNIV_SQL_NULL);
 	UNIV_MEM_ASSERT_RW(data, len);
@@ -2651,17 +2729,30 @@ row_sel_field_store_in_mysql_format(
 		ut_ad(templ->mysql_col_len >= len);
 		ut_ad(templ->mbmaxlen >= templ->mbminlen);
 
+		/* If field_no equals to templ->icp_rec_field_no,
+		we are examining a row pointed by "icp_rec_field_no".
+		There is possibility that icp_rec_field_no refers to
+		a field in a secondary index while templ->rec_field_no
+		points to field in a primary index. The length
+		should still be equal, unless the field pointed
+		by icp_rec_field_no has a prefix */
 		ut_ad(templ->mbmaxlen > templ->mbminlen
-		      || templ->mysql_col_len == len);
+		      || templ->mysql_col_len == len
+		      || (field_no == templ->icp_rec_field_no
+			  && field->prefix_len > 0));
+
 		/* The following assertion would fail for old tables
 		containing UTF-8 ENUM columns due to Bug #9526. */
 		ut_ad(!templ->mbmaxlen
 		      || !(templ->mysql_col_len % templ->mbmaxlen));
-		ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
+		ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len
+		      || (field_no == templ->icp_rec_field_no
+			  && field->prefix_len > 0));
+		ut_ad(!(field->prefix_len % templ->mbmaxlen));
 
 		if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
 			/* Pad with spaces. This undoes the stripping
-			done in row0mysql.c, function
+			done in row0mysql.cc, function
 			row_mysql_store_col_in_innobase_format(). */
 
 			memset(dest + len, 0x20, templ->mysql_col_len - len);
@@ -2682,148 +2773,117 @@ row_sel_field_store_in_mysql_format(
 	case DATA_DECIMAL:
 		/* Above are the valid column types for MySQL data. */
 #endif /* UNIV_DEBUG */
-		ut_ad(templ->mysql_col_len == len);
+		ut_ad(field->prefix_len
+		      ? field->prefix_len == len
+		      : templ->mysql_col_len == len);
 		memcpy(dest, data, len);
 	}
 }
 
+#ifdef UNIV_DEBUG
+/** Convert a field from Innobase format to MySQL format. */
+# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
+	row_sel_store_mysql_field_func(m,p,r,i,o,f,t)
+#else /* UNIV_DEBUG */
+/** Convert a field from Innobase format to MySQL format. */
+# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
+	row_sel_store_mysql_field_func(m,p,r,o,f,t)
+#endif /* UNIV_DEBUG */
 /**************************************************************//**
-Convert a row in the Innobase format to a row in the MySQL format.
-Note that the template in prebuilt may advise us to copy only a few
-columns to mysql_rec, other columns are left blank. All columns may not
-be needed in the query.
-@return TRUE on success, FALSE if not all columns could be retrieved */
+Convert a field in the Innobase format to a field in the MySQL format. */
 static __attribute__((warn_unused_result))
 ibool
-row_sel_store_mysql_rec(
-/*====================*/
-	byte*		mysql_rec,	/*!< out: row in the MySQL format */
-	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
-	const rec_t*	rec,		/*!< in: Innobase record in the index
-					which was described in prebuilt's
-					template, or in the clustered index;
-					must be protected by a page latch */
-	ibool		rec_clust,	/*!< in: TRUE if rec is in the
-					clustered index instead of
-					prebuilt->index */
-	const ulint*	offsets)	/*!< in: array returned by
-					rec_get_offsets(rec) */
+row_sel_store_mysql_field_func(
+/*===========================*/
+	byte*			mysql_rec,	/*!< out: record in the
+						MySQL format */
+	row_prebuilt_t*		prebuilt,	/*!< in/out: prebuilt struct */
+	const rec_t*		rec,		/*!< in: InnoDB record;
+						must be protected by
+						a page latch */
+#ifdef UNIV_DEBUG
+	const dict_index_t*	index,		/*!< in: index of rec */
+#endif
+	const ulint*		offsets,	/*!< in: array returned by
+						rec_get_offsets() */
+	ulint			field_no,	/*!< in: templ->rec_field_no or
+						templ->clust_rec_field_no or
+						templ->icp_rec_field_no */
+	const mysql_row_templ_t*templ)		/*!< in: row template */
 {
-	mem_heap_t*	extern_field_heap	= NULL;
-	mem_heap_t*	heap;
-	ulint		i;
+	const byte*	data;
+	ulint		len;
 
-	ut_ad(prebuilt->mysql_template);
 	ut_ad(prebuilt->default_rec);
-	ut_ad(rec_offs_validate(rec, NULL, offsets));
-	ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
-
-	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
-		mem_heap_free(prebuilt->blob_heap);
-		prebuilt->blob_heap = NULL;
-	}
-
-	for (i = 0; i < prebuilt->n_template; i++) {
-
-		const mysql_row_templ_t*templ = prebuilt->mysql_template + i;
-		const byte*		data;
-		ulint			len;
-		ulint			field_no;
+	ut_ad(templ);
+	ut_ad(templ >= prebuilt->mysql_template);
+	ut_ad(templ < &prebuilt->mysql_template[prebuilt->n_template]);
+	ut_ad(field_no == templ->clust_rec_field_no
+	      || field_no == templ->rec_field_no
+	      || field_no == templ->icp_rec_field_no);
+	ut_ad(rec_offs_validate(rec, index, offsets));
 
-		field_no = rec_clust
-			? templ->clust_rec_field_no : templ->rec_field_no;
+	if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) {
 
-		if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) {
+		mem_heap_t*	heap;
+		/* Copy an externally stored field to a temporary heap */
 
-			/* Copy an externally stored field to the temporary
-			heap */
+		ut_a(!prebuilt->trx->has_search_latch);
+		ut_ad(field_no == templ->clust_rec_field_no);
 
-			ut_a(!prebuilt->trx->has_search_latch);
+		if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
+			if (prebuilt->blob_heap == NULL) {
+				prebuilt->blob_heap = mem_heap_create(
+					UNIV_PAGE_SIZE);
+			}
 
-			if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
-				if (prebuilt->blob_heap == NULL) {
-					prebuilt->blob_heap = mem_heap_create(
-						UNIV_PAGE_SIZE);
-				}
+			heap = prebuilt->blob_heap;
+		} else {
+			heap = mem_heap_create(UNIV_PAGE_SIZE);
+		}
 
-				heap = prebuilt->blob_heap;
-			} else {
-				extern_field_heap
-					= mem_heap_create(UNIV_PAGE_SIZE);
+		/* NOTE: if we are retrieving a big BLOB, we may
+		already run out of memory in the next call, which
+		causes an assert */
 
-				heap = extern_field_heap;
-			}
+		data = btr_rec_copy_externally_stored_field(
+			rec, offsets,
+			dict_table_zip_size(prebuilt->table),
+			field_no, &len, heap);
 
-			/* NOTE: if we are retrieving a big BLOB, we may
-			already run out of memory in the next call, which
-			causes an assert */
-
-			data = btr_rec_copy_externally_stored_field(
-				rec, offsets,
-				dict_table_zip_size(prebuilt->table),
-				field_no, &len, heap);
-
-			if (UNIV_UNLIKELY(!data)) {
-				/* The externally stored field
-				was not written yet. This
-				record should only be seen by
-				recv_recovery_rollback_active()
-				or any TRX_ISO_READ_UNCOMMITTED
-				transactions. */
-
-				if (extern_field_heap) {
-					mem_heap_free(extern_field_heap);
-				}
+		if (UNIV_UNLIKELY(!data)) {
+			/* The externally stored field was not written
+			yet. This record should only be seen by
+			recv_recovery_rollback_active() or any
+			TRX_ISO_READ_UNCOMMITTED transactions. */
 
-				return(FALSE);
+			if (heap != prebuilt->blob_heap) {
+				mem_heap_free(heap);
 			}
 
-			ut_a(len != UNIV_SQL_NULL);
-		} else {
-			/* Field is stored in the row. */
-
-			data = rec_get_nth_field(rec, offsets, field_no, &len);
-
-			if (UNIV_UNLIKELY(templ->type == DATA_BLOB)
-			    && len != UNIV_SQL_NULL) {
+			ut_a(prebuilt->trx->isolation_level
+			     == TRX_ISO_READ_UNCOMMITTED);
+			return(FALSE);
+		}
 
-				/* It is a BLOB field locally stored in the
-				InnoDB record: we MUST copy its contents to
-				prebuilt->blob_heap here because later code
-				assumes all BLOB values have been copied to a
-				safe place. */
+		ut_a(len != UNIV_SQL_NULL);
 
-				if (prebuilt->blob_heap == NULL) {
-					prebuilt->blob_heap = mem_heap_create(
-						UNIV_PAGE_SIZE);
-				}
+		row_sel_field_store_in_mysql_format(
+			mysql_rec + templ->mysql_col_offset,
+			templ, index, field_no, data, len);
 
-				data = memcpy(mem_heap_alloc(
-						prebuilt->blob_heap, len),
-						data, len);
-			}
+		if (heap != prebuilt->blob_heap) {
+			mem_heap_free(heap);
 		}
+	} else {
+		/* Field is stored in the row. */
 
-		if (len != UNIV_SQL_NULL) {
-			row_sel_field_store_in_mysql_format(
-				mysql_rec + templ->mysql_col_offset,
-				templ, data, len);
-
-			/* Cleanup */
-			if (extern_field_heap) {
-				mem_heap_free(extern_field_heap);
-				extern_field_heap = NULL;
-			}
+		data = rec_get_nth_field(rec, offsets, field_no, &len);
 
-			if (templ->mysql_null_bit_mask) {
-				/* It is a nullable column with a non-NULL
-				value */
-				mysql_rec[templ->mysql_null_byte_offset]
-					&= ~(byte) templ->mysql_null_bit_mask;
-			}
-		} else {
+		if (len == UNIV_SQL_NULL) {
 			/* MySQL assumes that the field for an SQL
 			NULL value is set to the default value. */
+			ut_ad(templ->mysql_null_bit_mask);
 
 			UNIV_MEM_ASSERT_RW(prebuilt->default_rec
 					   + templ->mysql_col_offset,
@@ -2834,9 +2894,111 @@ row_sel_store_mysql_rec(
 			       (const byte*) prebuilt->default_rec
 			       + templ->mysql_col_offset,
 			       templ->mysql_col_len);
+			return(TRUE);
+		}
+
+		if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
+
+			/* It is a BLOB field locally stored in the
+			InnoDB record: we MUST copy its contents to
+			prebuilt->blob_heap here because
+			row_sel_field_store_in_mysql_format() stores a
+			pointer to the data, and the data passed to us
+			will be invalid as soon as the
+			mini-transaction is committed and the page
+			latch on the clustered index page is
+			released. */
+
+			if (prebuilt->blob_heap == NULL) {
+				prebuilt->blob_heap = mem_heap_create(
+					UNIV_PAGE_SIZE);
+			}
+
+			data = static_cast<byte*>(
+				mem_heap_dup(prebuilt->blob_heap, data, len));
+		}
+
+		row_sel_field_store_in_mysql_format(
+			mysql_rec + templ->mysql_col_offset,
+			templ, index, field_no, data, len);
+	}
+
+	ut_ad(len != UNIV_SQL_NULL);
+
+	if (templ->mysql_null_bit_mask) {
+		/* It is a nullable column with a non-NULL
+		value */
+		mysql_rec[templ->mysql_null_byte_offset]
+			&= ~(byte) templ->mysql_null_bit_mask;
+	}
+
+	return(TRUE);
+}
+
+/**************************************************************//**
+Convert a row in the Innobase format to a row in the MySQL format.
+Note that the template in prebuilt may advise us to copy only a few
+columns to mysql_rec, other columns are left blank. All columns may not
+be needed in the query.
+@return TRUE on success, FALSE if not all columns could be retrieved */
+static __attribute__((warn_unused_result))
+ibool
+row_sel_store_mysql_rec(
+/*====================*/
+	byte*		mysql_rec,	/*!< out: row in the MySQL format */
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
+	const rec_t*	rec,		/*!< in: Innobase record in the index
+					which was described in prebuilt's
+					template, or in the clustered index;
+					must be protected by a page latch */
+	ibool		rec_clust,	/*!< in: TRUE if rec is in the
+					clustered index instead of
+					prebuilt->index */
+	const dict_index_t* index,	/*!< in: index of rec */
+	const ulint*	offsets)	/*!< in: array returned by
+					rec_get_offsets(rec) */
+{
+	ulint	i;
+
+	ut_ad(rec_clust || index == prebuilt->index);
+	ut_ad(!rec_clust || dict_index_is_clust(index));
+
+	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+		mem_heap_free(prebuilt->blob_heap);
+		prebuilt->blob_heap = NULL;
+	}
+
+	for (i = 0; i < prebuilt->n_template; i++) {
+		const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
+		const ulint		field_no
+			= rec_clust
+			? templ->clust_rec_field_no
+			: templ->rec_field_no;
+		/* We should never deliver column prefixes to MySQL,
+		except for evaluating innobase_index_cond(). */
+		ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len
+		      == 0);
+
+		if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
+					       rec, index, offsets,
+					       field_no, templ)) {
+			return(FALSE);
 		}
 	}
 
+	/* FIXME: We only need to read the doc_id if an FTS indexed
+	column is being updated.
+	NOTE, the record must be cluster index record. Secondary index
+	might not have the Doc ID */
+	if (dict_table_has_fts_index(prebuilt->table)
+	    && dict_index_is_clust(index)) {
+
+		prebuilt->fts_doc_id = fts_get_doc_id_from_rec(
+			prebuilt->table,
+			rec,
+			prebuilt->heap);
+	}
+
 	return(TRUE);
 }
 
@@ -2935,7 +3097,7 @@ row_sel_get_clust_rec_for_mysql(
 	    < dict_index_get_n_unique(clust_index)) {
 
 		/* In a rare case it is possible that no clust rec is found
-		for a delete-marked secondary index record: if in row0umod.c
+		for a delete-marked secondary index record: if in row0umod.cc
 		in row_undo_mod_remove_clust_low() we have already removed
 		the clust rec, while purge is still cleaning and removing
 		secondary index records associated with earlier versions of
@@ -2959,7 +3121,6 @@ row_sel_get_clust_rec_for_mysql(
 			rec_print(stderr, clust_rec, clust_index);
 			putc('\n', stderr);
 			trx_print(stderr, trx, 600);
-
 			fputs("\n"
 			      "InnoDB: Submit a detailed bug report"
 			      " to http://bugs.mysql.com\n", stderr);
@@ -2983,7 +3144,10 @@ row_sel_get_clust_rec_for_mysql(
 		err = lock_clust_rec_read_check_and_lock(
 			0, btr_pcur_get_block(&prebuilt->clust_pcur),
 			clust_rec, clust_index, *offsets,
-			prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr);
+			static_cast<enum lock_mode>(prebuilt->select_lock_type),
+			LOCK_REC_NOT_GAP,
+			thr);
+
 		switch (err) {
 		case DB_SUCCESS:
 		case DB_SUCCESS_LOCKED_REC:
@@ -3005,13 +3169,17 @@ row_sel_get_clust_rec_for_mysql(
 			    clust_rec, clust_index, *offsets,
 			    trx->read_view)) {
 
+			ulint	db_err;
+
 			/* The following call returns 'offsets' associated with
 			'old_vers' */
-			err = row_sel_build_prev_vers_for_mysql(
+			db_err = row_sel_build_prev_vers_for_mysql(
 				trx->read_view, clust_index, prebuilt,
 				clust_rec, offsets, offset_heap, &old_vers,
 				mtr);
 
+			err = static_cast<enum db_err>(db_err);
+
 			if (err != DB_SUCCESS || old_vers == NULL) {
 
 				goto err_exit;
@@ -3171,8 +3339,8 @@ row_sel_copy_cached_field_for_mysql(
 Pops a cached row for MySQL from the fetch cache. */
 UNIV_INLINE
 void
-row_sel_pop_cached_row_for_mysql(
-/*=============================*/
+row_sel_dequeue_cached_row_for_mysql(
+/*=================================*/
 	byte*		buf,		/*!< in/out: buffer where to copy the
 					row */
 	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct */
@@ -3201,7 +3369,7 @@ row_sel_pop_cached_row_for_mysql(
 				buf[templ->mysql_null_byte_offset]
 					^= (buf[templ->mysql_null_byte_offset]
 					    ^ cached_rec[templ->mysql_null_byte_offset])
-					& (byte)templ->mysql_null_bit_mask;
+					& (byte) templ->mysql_null_bit_mask;
 			}
 		}
 	} else if (prebuilt->mysql_prefix_len > 63) {
@@ -3231,69 +3399,90 @@ row_sel_pop_cached_row_for_mysql(
 }
 
 /********************************************************************//**
-Pushes a row for MySQL to the fetch cache.
-@return TRUE on success, FALSE if the record contains incomplete BLOBs */
-UNIV_INLINE __attribute__((warn_unused_result))
-ibool
-row_sel_push_cache_row_for_mysql(
-/*=============================*/
-	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
-	const rec_t*	rec,		/*!< in: record to push, in the index
-					which was described in prebuilt's
-					template, or in the clustered index;
-					must be protected by a page latch */
-	ibool		rec_clust,	/*!< in: TRUE if rec is in the
-					clustered index instead of
-					prebuilt->index */
-	const ulint*	offsets)	/*!< in: rec_get_offsets(rec) */
+Initialise the prefetch cache. */
+UNIV_INLINE
+void
+row_sel_prefetch_cache_init(
+/*========================*/
+	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct */
 {
-	byte*	buf;
 	ulint	i;
+	ulint	sz;
+	byte*	ptr;
 
-	ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
-	ut_ad(rec_offs_validate(rec, NULL, offsets));
-	ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
-	ut_a(!prebuilt->templ_contains_blob);
+	/* Reserve space for the magic number. */
+	sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8);
+	ptr = static_cast<byte*>(mem_alloc(sz));
 
-	if (prebuilt->fetch_cache[0] == NULL) {
-		/* Allocate memory for the fetch cache */
+	for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) {
 
-		for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+		/* A user has reported memory corruption in these
+		buffers in Linux. Put magic numbers there to help
+		to track a possible bug. */
 
-			/* A user has reported memory corruption in these
-			buffers in Linux. Put magic numbers there to help
-			to track a possible bug. */
+		mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
+		ptr += 4;
 
-			buf = mem_alloc(prebuilt->mysql_row_len + 8);
+		prebuilt->fetch_cache[i] = ptr;
+		ptr += prebuilt->mysql_row_len;
 
-			prebuilt->fetch_cache[i] = buf + 4;
+		mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
+		ptr += 4;
+	}
+}
 
-			mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
-			mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
-					ROW_PREBUILT_FETCH_MAGIC_N);
-		}
+/********************************************************************//**
+Get the last fetch cache buffer from the queue.
+@return pointer to buffer. */
+UNIV_INLINE
+byte*
+row_sel_fetch_last_buf(
+/*===================*/
+	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct */
+{
+	ut_ad(!prebuilt->templ_contains_blob);
+	ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+
+	if (prebuilt->fetch_cache[0] == NULL) {
+		/* Allocate memory for the fetch cache */
+		ut_ad(prebuilt->n_fetch_cached == 0);
+
+		row_sel_prefetch_cache_init(prebuilt);
 	}
 
 	ut_ad(prebuilt->fetch_cache_first == 0);
 	UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
 			 prebuilt->mysql_row_len);
 
-	if (UNIV_UNLIKELY(!row_sel_store_mysql_rec(
-				  prebuilt->fetch_cache[
-					  prebuilt->n_fetch_cached],
-				  prebuilt, rec, rec_clust, offsets))) {
-		return(FALSE);
+	return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]);
+}
+
+/********************************************************************//**
+Pushes a row for MySQL to the fetch cache. */
+UNIV_INLINE
+void
+row_sel_enqueue_cache_row_for_mysql(
+/*================================*/
+	byte*		mysql_rec,	/*!< in/out: MySQL record */
+	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct */
+{
+	/* For non ICP code path the row should already exist in the
+	next fetch cache slot. */
+
+	if (prebuilt->idx_cond != NULL) {
+		byte*	dest = row_sel_fetch_last_buf(prebuilt);
+
+		ut_memcpy(dest, mysql_rec, prebuilt->mysql_row_len);
 	}
 
-	prebuilt->n_fetch_cached++;
-	return(TRUE);
+	++prebuilt->n_fetch_cached;
 }
 
 /*********************************************************************//**
 Tries to do a shortcut to fetch a clustered index record with a unique key,
 using the hash index if possible (not always). We assume that the search
 mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
-btr search latch has been locked in S-mode.
+btr search latch has been locked in S-mode if AHI is enabled.
 @return	SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
 static
 ulint
@@ -3317,7 +3506,9 @@ row_sel_try_search_shortcut_for_mysql(
 #ifndef UNIV_SEARCH_DEBUG
 	btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
 				   BTR_SEARCH_LEAF, pcur,
-				   RW_S_LATCH,
+				   (trx->has_search_latch)
+				    ? RW_S_LATCH
+				    : 0,
 				   mtr);
 #else /* UNIV_SEARCH_DEBUG */
 	btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
@@ -3363,6 +3554,89 @@ row_sel_try_search_shortcut_for_mysql(
 	return(SEL_FOUND);
 }
 
+/*********************************************************************//**
+Check a pushed-down index condition.
+@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
+static
+enum icp_result
+row_search_idx_cond_check(
+/*======================*/
+	byte*			mysql_rec,	/*!< out: record
+						in MySQL format (invalid unless
+						prebuilt->idx_cond!=NULL and
+						we return ICP_MATCH) */
+	row_prebuilt_t*		prebuilt,	/*!< in/out: prebuilt struct
+						for the table handle */
+	const rec_t*		rec,		/*!< in: InnoDB record */
+	const ulint*		offsets)	/*!< in: rec_get_offsets() */
+{
+	enum icp_result result;
+	ulint		i;
+
+	ut_ad(rec_offs_validate(rec, prebuilt->index, offsets));
+
+	if (!prebuilt->idx_cond) {
+		return(ICP_MATCH);
+	}
+
+	MONITOR_INC(MONITOR_ICP_ATTEMPTS);
+
+	/* Convert to MySQL format those fields that are needed for
+	evaluating the index condition. */
+
+	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+		mem_heap_empty(prebuilt->blob_heap);
+	}
+
+	for (i = 0; i < prebuilt->idx_cond_n_cols; i++) {
+		const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
+
+		if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
+					       rec, prebuilt->index, offsets,
+					       templ->icp_rec_field_no,
+					       templ)) {
+			return(ICP_NO_MATCH);
+		}
+	}
+
+	/* We assume that the index conditions on
+	case-insensitive columns are case-insensitive. The
+	case of such columns may be wrong in a secondary
+	index, if the case of the column has been updated in
+	the past, or a record has been deleted and a record
+	inserted in a different case. */
+	result = innobase_index_cond(prebuilt->idx_cond);
+	switch (result) {
+	case ICP_MATCH:
+		/* Convert the remaining fields to MySQL format.
+		If this is a secondary index record, we must defer
+		this until we have fetched the clustered index record. */
+		if (!prebuilt->need_to_access_clustered
+		    || dict_index_is_clust(prebuilt->index)) {
+			if (!row_sel_store_mysql_rec(
+				    mysql_rec, prebuilt, rec, FALSE,
+				    prebuilt->index, offsets)) {
+				ut_ad(dict_index_is_clust(prebuilt->index));
+				return(ICP_NO_MATCH);
+			}
+		}
+		MONITOR_INC(MONITOR_ICP_MATCH);
+		return(result);
+	case ICP_NO_MATCH:
+		MONITOR_INC(MONITOR_ICP_NO_MATCH);
+		return(result);
+	case ICP_OUT_OF_RANGE:
+		MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE);
+		return(result);
+        case ICP_ERROR:
+        case ICP_ABORTED_BY_USER:
+                return(result);
+	}
+
+	ut_error;
+	return(result);
+}
+
 /********************************************************************//**
 Searches for rows in the database. This is used in the interface to
 MySQL. This function opens a cursor, and also implements fetch next
@@ -3405,7 +3679,6 @@ row_search_for_mysql(
 	const rec_t*	clust_rec;
 	ulint		err				= DB_SUCCESS;
 	ibool		unique_search			= FALSE;
-	ibool		unique_search_from_clust_index	= FALSE;
 	ibool		mtr_has_extra_clust_latch	= FALSE;
 	ibool		moves_up			= FALSE;
 	ibool		set_also_gap_locks		= TRUE;
@@ -3466,7 +3739,7 @@ row_search_for_mysql(
 		return(DB_CORRUPTION);
 	}
 
-	if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+	if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
 		fprintf(stderr,
 			"InnoDB: Error: trying to free a corrupt\n"
 			"InnoDB: table handle. Magic n %lu, table name ",
@@ -3566,7 +3839,7 @@ row_search_for_mysql(
 			prebuilt->fetch_cache_first = 0;
 
 		} else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
-			row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+			row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
 
 			prebuilt->n_rows_fetched++;
 
@@ -3653,8 +3926,6 @@ row_search_for_mysql(
 
 		mode = PAGE_CUR_GE;
 
-		unique_search_from_clust_index = TRUE;
-
 		if (trx->mysql_n_tables_locked == 0
 		    && prebuilt->select_lock_type == LOCK_NONE
 		    && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
@@ -3693,9 +3964,23 @@ row_search_for_mysql(
 				mtr_commit(&mtr). */
 				ut_ad(!rec_get_deleted_flag(rec, comp));
 
-				if (!row_sel_store_mysql_rec(buf, prebuilt,
-							     rec, FALSE,
-							     offsets)) {
+				if (prebuilt->idx_cond) {
+					switch (row_search_idx_cond_check(
+							buf, prebuilt,
+							rec, offsets)) {
+					case ICP_NO_MATCH:
+					case ICP_OUT_OF_RANGE:
+                                        case ICP_ERROR:
+                                        case ICP_ABORTED_BY_USER:
+						goto shortcut_mismatch;
+					case ICP_MATCH:
+						goto shortcut_match;
+					}
+				}
+
+				if (!row_sel_store_mysql_rec(
+					    buf, prebuilt,
+					    rec, FALSE, index, offsets)) {
 					/* Only fresh inserts may contain
 					incomplete externally stored
 					columns. Pretend that such
@@ -3706,13 +3991,12 @@ row_search_for_mysql(
 					rolling back a recovered
 					transaction. Rollback happens
 					at a lower level, not here. */
-					ut_a(trx->isolation_level
-					     == TRX_ISO_READ_UNCOMMITTED);
 
 					/* Proceed as in case SEL_RETRY. */
 					break;
 				}
 
+			shortcut_match:
 				mtr_commit(&mtr);
 
 				/* ut_print_name(stderr, index->name);
@@ -3724,6 +4008,7 @@ row_search_for_mysql(
 				goto release_search_latch_if_needed;
 
 			case SEL_EXHAUSTED:
+			shortcut_mismatch:
 				mtr_commit(&mtr);
 
 				/* ut_print_name(stderr, index->name);
@@ -3764,9 +4049,15 @@ release_search_latch_if_needed:
 		trx->has_search_latch = FALSE;
 	}
 
-	ut_ad(prebuilt->sql_stat_start || trx->conc_state == TRX_ACTIVE);
-	ut_ad(trx->conc_state == TRX_NOT_STARTED
-	      || trx->conc_state == TRX_ACTIVE);
+	/* The state of a running trx can only be changed by the
+	thread that is currently serving the transaction. Because we
+	are that thread, we can read trx->state without holding any
+	mutex. */
+	ut_ad(prebuilt->sql_stat_start || trx->state == TRX_STATE_ACTIVE);
+
+	ut_ad(trx->state == TRX_STATE_NOT_STARTED
+	      || trx->state == TRX_STATE_ACTIVE);
+
 	ut_ad(prebuilt->sql_stat_start
 	      || prebuilt->select_lock_type != LOCK_NONE
 	      || trx->read_view);
@@ -3806,8 +4097,9 @@ release_search_latch_if_needed:
 	if (!prebuilt->sql_stat_start) {
 		/* No need to set an intention lock or assign a read view */
 
-		if (trx->read_view == NULL
-		    && prebuilt->select_lock_type == LOCK_NONE) {
+		if (UNIV_UNLIKELY
+		    (trx->read_view == NULL
+		     && prebuilt->select_lock_type == LOCK_NONE)) {
 
 			fputs("InnoDB: Error: MySQL is trying to"
 			      " perform a consistent read\n"
@@ -3923,10 +4215,12 @@ rec_loop:
 #ifdef UNIV_SEARCH_DEBUG
 	/*
 	fputs("Using ", stderr);
-	dict_index_name_print(stderr, index);
+	dict_index_name_print(stderr, trx, index);
 	fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
 	page_get_page_no(page_align(rec)));
-	rec_print(rec);
+	rec_print(stderr, rec, index);
+	printf("delete-mark: %lu\n",
+	       rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
 	*/
 #endif /* UNIV_SEARCH_DEBUG */
 
@@ -4108,8 +4402,10 @@ wrong_offs:
 			btr_pcur_store_position(pcur, &mtr);
 
 			err = DB_RECORD_NOT_FOUND;
-			/* ut_print_name(stderr, index->name);
-			fputs(" record not found 3\n", stderr); */
+#if 0
+			ut_print_name(stderr, trx, FALSE, index->name);
+			fputs(" record not found 3\n", stderr);
+#endif
 
 			goto normal_return;
 		}
@@ -4147,8 +4443,10 @@ wrong_offs:
 			btr_pcur_store_position(pcur, &mtr);
 
 			err = DB_RECORD_NOT_FOUND;
-			/* ut_print_name(stderr, index->name);
-			fputs(" record not found 4\n", stderr); */
+#if 0
+			ut_print_name(stderr, trx, FALSE, index->name);
+			fputs(" record not found 4\n", stderr);
+#endif
 
 			goto normal_return;
 		}
@@ -4173,8 +4471,7 @@ wrong_offs:
 		if (!set_also_gap_locks
 		    || srv_locks_unsafe_for_binlog
 		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED
-		    || (unique_search
-			&& !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
+		    || (unique_search && !rec_get_deleted_flag(rec, comp))) {
 
 			goto no_gap_lock;
 		} else {
@@ -4243,30 +4540,30 @@ no_gap_lock:
 				goto lock_wait_or_error;
 			}
 
-			mutex_enter(&kernel_mutex);
-			if (trx->was_chosen_as_deadlock_victim) {
-				mutex_exit(&kernel_mutex);
-				err = DB_DEADLOCK;
+			/* Check whether it was a deadlock or not, if not
+			a deadlock and the transaction had to wait then
+			release the lock it is waiting on. */
 
-				goto lock_wait_or_error;
-			}
-			if (UNIV_LIKELY(trx->wait_lock != NULL)) {
-				lock_cancel_waiting_and_release(
-					trx->wait_lock);
-			} else {
-				mutex_exit(&kernel_mutex);
+			err = lock_trx_handle_wait(trx);
 
+			switch (err) {
+			case DB_SUCCESS:
 				/* The lock was granted while we were
 				searching for the last committed version.
 				Do a normal locking read. */
 
-				offsets = rec_get_offsets(rec, index, offsets,
-							  ULINT_UNDEFINED,
-							  &heap);
+				offsets = rec_get_offsets(
+					rec, index, offsets, ULINT_UNDEFINED,
+					&heap);
+				goto locks_ok;
+			case DB_DEADLOCK:
+				goto lock_wait_or_error;
+			case DB_LOCK_WAIT:
 				err = DB_SUCCESS;
 				break;
+			default:
+				ut_error;
 			}
-			mutex_exit(&kernel_mutex);
 
 			if (old_vers == NULL) {
 				/* The row was not yet committed */
@@ -4334,17 +4631,36 @@ no_gap_lock:
 
 			if (!lock_sec_rec_cons_read_sees(
 				    rec, trx->read_view)) {
-				goto requires_clust_rec;
+				/* We should look at the clustered index.
+				However, as this is a non-locking read,
+				we can skip the clustered index lookup if
+				the condition does not match the secondary
+				index entry. */
+				switch (row_search_idx_cond_check(
+						buf, prebuilt, rec, offsets)) {
+				case ICP_NO_MATCH:
+					goto next_rec;
+				case ICP_OUT_OF_RANGE:
+                                case ICP_ERROR:
+                                case ICP_ABORTED_BY_USER:
+					err = DB_RECORD_NOT_FOUND;
+					goto idx_cond_failed;
+				case ICP_MATCH:
+					goto requires_clust_rec;
+				}
+
+				ut_error;
 			}
 		}
 	}
 
+locks_ok:
 	/* NOTE that at this point rec can be an old version of a clustered
 	index record built for a consistent read. We cannot assume after this
 	point that rec is on a buffer pool page. Functions like
 	page_rec_is_comp() cannot be used! */
 
-	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
+	if (rec_get_deleted_flag(rec, comp)) {
 
 		/* The record is delete-marked: we can skip it */
 
@@ -4370,9 +4686,7 @@ no_gap_lock:
 		applicable to unique secondary indexes. Current behaviour is
 		to widen the scope of a lock on an already delete marked record
 		if the same record is deleted twice by the same transaction */
-		if (index == clust_index && unique_search
-		    && !prebuilt->used_in_HANDLER) {
-
+		if (index == clust_index && unique_search) {
 			err = DB_RECORD_NOT_FOUND;
 
 			goto normal_return;
@@ -4381,12 +4695,29 @@ no_gap_lock:
 		goto next_rec;
 	}
 
+	/* Check if the record matches the index condition. */
+	switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) {
+	case ICP_NO_MATCH:
+		if (did_semi_consistent_read) {
+			row_unlock_for_mysql(prebuilt, TRUE);
+		}
+		goto next_rec;
+	case ICP_OUT_OF_RANGE:
+        case ICP_ERROR:
+        case ICP_ABORTED_BY_USER:
+		err = DB_RECORD_NOT_FOUND;
+		goto idx_cond_failed;
+	case ICP_MATCH:
+		break;
+	}
+
 	/* Get the clustered index record if needed, if we did not do the
 	search using the clustered index. */
 
 	if (index != clust_index && prebuilt->need_to_access_clustered) {
 
 requires_clust_rec:
+		ut_ad(index != clust_index);
 		/* We use a 'goto' to the preceding label if a consistent
 		read of a secondary index record requires us to look up old
 		versions of the associated clustered index record. */
@@ -4429,7 +4760,7 @@ requires_clust_rec:
 			goto lock_wait_or_error;
 		}
 
-		if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
+		if (rec_get_deleted_flag(clust_rec, comp)) {
 
 			/* The record is delete marked: we can skip it */
 
@@ -4449,6 +4780,26 @@ requires_clust_rec:
 
 		result_rec = clust_rec;
 		ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
+
+		if (prebuilt->idx_cond) {
+			/* Convert the record to MySQL format. We were
+			unable to do this in row_search_idx_cond_check(),
+			because the condition is on the secondary index
+			and the requested column is in the clustered index.
+			We convert all fields, including those that
+			may have been used in ICP, because the
+			secondary index may contain a column prefix
+			rather than the full column. Also, as noted
+			in Bug #56680, the column in the secondary
+			index may be in the wrong case, and the
+			authoritative case is in result_rec, the
+			appropriate version of the clustered index record. */
+			if (!row_sel_store_mysql_rec(
+				    buf, prebuilt, result_rec,
+				    TRUE, clust_index, offsets)) {
+				goto next_rec;
+			}
+		}
 	} else {
 		result_rec = rec;
 	}
@@ -4472,7 +4823,8 @@ requires_clust_rec:
 	    && !prebuilt->clust_index_was_generated
 	    && !prebuilt->used_in_HANDLER
 	    && prebuilt->template_type
-	    != ROW_MYSQL_DUMMY_TEMPLATE) {
+	    != ROW_MYSQL_DUMMY_TEMPLATE
+	    && !prebuilt->result) {
 
 		/* Inside an update, for example, we do not cache rows,
 		since we may use the cursor position to do the actual
@@ -4483,9 +4835,19 @@ requires_clust_rec:
 		not cache rows because there the cursor is a scrollable
 		cursor. */
 
-		if (!row_sel_push_cache_row_for_mysql(prebuilt, result_rec,
-						      result_rec != rec,
-						      offsets)) {
+		ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+
+		/* We only convert from InnoDB row format to MySQL row
+		format when ICP is disabled. */
+
+		if (!prebuilt->idx_cond
+		    && !row_sel_store_mysql_rec(
+			    row_sel_fetch_last_buf(prebuilt),
+			    prebuilt, result_rec,
+			    result_rec != rec,
+			    result_rec != rec ? clust_index : index,
+			    offsets)) {
+
 			/* Only fresh inserts may contain incomplete
 			externally stored columns. Pretend that such
 			records do not exist. Such records may only be
@@ -4493,14 +4855,14 @@ requires_clust_rec:
 			level or when rolling back a recovered
 			transaction. Rollback happens at a lower
 			level, not here. */
-			ut_a(trx->isolation_level == TRX_ISO_READ_UNCOMMITTED);
-		} else if (prebuilt->n_fetch_cached
-			   == MYSQL_FETCH_CACHE_SIZE) {
-
-			goto got_row;
+			goto next_rec;
 		}
 
-		goto next_rec;
+		row_sel_enqueue_cache_row_for_mysql(buf, prebuilt);
+
+		if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) {
+			goto next_rec;
+		}
 	} else {
 		if (UNIV_UNLIKELY
 		    (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
@@ -4521,12 +4883,13 @@ requires_clust_rec:
 			       rec_offs_size(offsets));
 			mach_write_to_4(buf,
 					rec_offs_extra_size(offsets) + 4);
-		} else {
-			/* Returning a row to MySQL */
-
-			if (!row_sel_store_mysql_rec(buf, prebuilt, result_rec,
-						     result_rec != rec,
-						     offsets)) {
+		} else if (!prebuilt->idx_cond) {
+			/* The record was not yet converted to MySQL format. */
+			if (!row_sel_store_mysql_rec(
+				    buf, prebuilt, result_rec,
+				    result_rec != rec,
+				    result_rec != rec ? clust_index : index,
+				    offsets)) {
 				/* Only fresh inserts may contain
 				incomplete externally stored
 				columns. Pretend that such records do
@@ -4535,26 +4898,20 @@ requires_clust_rec:
 				isolation level or when rolling back a
 				recovered transaction. Rollback
 				happens at a lower level, not here. */
-				ut_a(trx->isolation_level
-				     == TRX_ISO_READ_UNCOMMITTED);
 				goto next_rec;
 			}
 		}
 
 		if (prebuilt->clust_index_was_generated) {
-			if (result_rec != rec) {
-				offsets = rec_get_offsets(
-					rec, index, offsets, ULINT_UNDEFINED,
-					&heap);
-			}
-			row_sel_store_row_id_to_prebuilt(prebuilt, rec,
-							 index, offsets);
+			row_sel_store_row_id_to_prebuilt(
+				prebuilt, result_rec,
+				result_rec == rec ? index : clust_index,
+				offsets);
 		}
 	}
 
 	/* From this point on, 'offsets' are invalid. */
 
-got_row:
 	/* We have an optimization to save CPU time: if this is a consistent
 	read on a unique condition on the clustered index, then we do not
 	store the pcur position, because any fetch next or prev will anyway
@@ -4562,7 +4919,12 @@ got_row:
 	HANDLER command where the user can move the cursor with PREV or NEXT
 	even after a unique search. */
 
-	if (!unique_search_from_clust_index
+	err = DB_SUCCESS;
+
+idx_cond_failed:
+	if (!unique_search
+	    || !dict_index_is_clust(index)
+	    || direction != 0
 	    || prebuilt->select_lock_type != LOCK_NONE
 	    || prebuilt->used_in_HANDLER) {
 
@@ -4571,8 +4933,6 @@ got_row:
 		btr_pcur_store_position(pcur, &mtr);
 	}
 
-	err = DB_SUCCESS;
-
 	goto normal_return;
 
 next_rec:
@@ -4587,6 +4947,18 @@ next_rec:
 	/*-------------------------------------------------------------*/
 	/* PHASE 5: Move the cursor to the next index record */
 
+	/* NOTE: For moves_up==FALSE, the mini-transaction will be
+	committed and restarted every time when switching b-tree
+	pages. For moves_up==TRUE in index condition pushdown, we can
+	scan an entire secondary index tree within a single
+	mini-transaction. As long as the prebuilt->idx_cond does not
+	match, we do not need to consult the clustered index or
+	return records to MySQL, and thus we can avoid repositioning
+	the cursor. What prevents us from buffer-fixing all leaf pages
+	within the mini-transaction is the btr_leaf_page_release()
+	call in btr_pcur_move_to_next_page(). Only the leaf page where
+	the cursor is positioned will remain buffer-fixed. */
+
 	if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
 		/* We must commit mtr if we are moving to the next
 		non-clustered index record, because we could break the
@@ -4651,7 +5023,7 @@ lock_table_wait:
 	mtr_commit(&mtr);
 	mtr_has_extra_clust_latch = FALSE;
 
-	trx->error_state = err;
+	trx->error_state = static_cast<enum db_err>(err);
 
 	/* The following is a patch for MySQL */
 
@@ -4685,7 +5057,7 @@ lock_table_wait:
 			on the same user record, we cannot use
 			row_unlock_for_mysql() to unlock any records, and
 			we must thus reset the new rec lock info. Since
-			in lock0lock.c we have blocked the inheriting of gap
+			in lock0lock.cc we have blocked the inheriting of gap
 			X-locks, we actually do not have any new record locks
 			set in this case.
 
@@ -4721,7 +5093,7 @@ normal_return:
 	mtr_commit(&mtr);
 
 	if (prebuilt->n_fetch_cached > 0) {
-		row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+		row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
 
 		err = DB_SUCCESS;
 	}
@@ -4776,25 +5148,23 @@ row_search_check_if_query_cache_permitted(
 	dict_table_t*	table;
 	ibool		ret	= FALSE;
 
-	table = dict_table_get(norm_name, FALSE);
+	table = dict_table_open_on_name(norm_name, FALSE);
 
 	if (table == NULL) {
 
 		return(FALSE);
 	}
 
-	mutex_enter(&kernel_mutex);
-
 	/* Start the transaction if it is not started yet */
 
-	trx_start_if_not_started_low(trx);
+	trx_start_if_not_started(trx);
 
 	/* If there are locks on the table or some trx has invalidated the
 	cache up to our trx id, then ret = FALSE.
 	We do not check what type locks there are on the table, though only
 	IX type locks actually would require ret = FALSE. */
 
-	if (UT_LIST_GET_LEN(table->locks) == 0
+	if (lock_table_get_n_locks(table) == 0
 	    && trx->id >= table->query_cache_inv_trx_id) {
 
 		ret = TRUE;
@@ -4807,11 +5177,12 @@ row_search_check_if_query_cache_permitted(
 
 			trx->read_view = read_view_open_now(
 				trx->id, trx->global_read_view_heap);
+
 			trx->global_read_view = trx->read_view;
 		}
 	}
 
-	mutex_exit(&kernel_mutex);
+	dict_table_close(table, FALSE);
 
 	return(ret);
 }
diff --git a/storage/innobase/row/row0uins.c b/storage/innobase/row/row0uins.cc
index 4fa97c9355d..78fd4ad5199 100644
--- a/storage/innobase/row/row0uins.c
+++ b/storage/innobase/row/row0uins.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0uins.c
+@file row/row0uins.cc
 Fresh insert undo
 
 Created 2/25/1997 Heikki Tuuri
@@ -97,15 +97,12 @@ row_undo_ins_remove_clust_rec(
 
 	btr_cur = btr_pcur_get_btr_cur(&(node->pcur));
 
-	success = btr_cur_optimistic_delete(btr_cur, &mtr);
-
-	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
-
-	if (success) {
-		trx_undo_rec_release(node->trx, node->undo_no);
-
-		return(DB_SUCCESS);
+	if (btr_cur_optimistic_delete(btr_cur, &mtr)) {
+		err = DB_SUCCESS;
+		goto func_exit;
 	}
+
+	btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
 retry:
 	/* If did not succeed, try pessimistic descent to tree */
 	mtr_start(&mtr);
@@ -135,8 +132,8 @@ retry:
 		goto retry;
 	}
 
-	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
-
+func_exit:
+	btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
 	trx_undo_rec_release(node->trx, node->undo_no);
 
 	return(err);
@@ -255,7 +252,8 @@ static
 void
 row_undo_ins_parse_undo_rec(
 /*========================*/
-	undo_node_t*	node)	/*!< in/out: row undo node */
+	undo_node_t*	node,		/*!< in/out: row undo node */
+	ibool		dict_locked)	/*!< in: TRUE if own dict_sys->mutex */
 {
 	dict_index_t*	clust_index;
 	byte*		ptr;
@@ -273,18 +271,27 @@ row_undo_ins_parse_undo_rec(
 	node->rec_type = type;
 
 	node->update = NULL;
-	node->table = dict_table_get_on_id(table_id, node->trx);
+	node->table = dict_table_open_on_id(table_id, dict_locked);
 
 	/* Skip the UNDO if we can't find the table or the .ibd file. */
 	if (UNIV_UNLIKELY(node->table == NULL)) {
 	} else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) {
+		dict_table_close(node->table, dict_locked);
 		node->table = NULL;
 	} else {
 		clust_index = dict_table_get_first_index(node->table);
 
 		if (clust_index != NULL) {
-			ptr = trx_undo_rec_get_row_ref(
+			trx_undo_rec_get_row_ref(
 				ptr, clust_index, &node->ref, node->heap);
+
+			if (!row_undo_search_clust_to_pcur(node)) {
+
+				dict_table_close(node->table, dict_locked);
+
+				node->table = NULL;
+			}
+
 		} else {
 			ut_print_timestamp(stderr);
 			fprintf(stderr, "  InnoDB: table ");
@@ -293,11 +300,67 @@ row_undo_ins_parse_undo_rec(
 			fprintf(stderr, " has no indexes, "
 				"ignoring the table\n");
 
+			dict_table_close(node->table, dict_locked);
+
 			node->table = NULL;
 		}
 	}
 }
 
+/***************************************************************//**
+Removes secondary index records.
+@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_sec_rec(
+/*========================*/
+	undo_node_t*	node)	/*!< in/out: row undo node */
+{
+	ulint		err	= DB_SUCCESS;
+	mem_heap_t*	heap;
+
+	heap = mem_heap_create(1024);
+
+	while (node->index != NULL) {
+		dtuple_t*	entry;
+
+		if (node->index->type & DICT_FTS) {
+			dict_table_next_uncorrupted_index(node->index);
+			continue;
+		}
+
+		entry = row_build_index_entry(node->row, node->ext,
+					      node->index, heap);
+		if (UNIV_UNLIKELY(!entry)) {
+			/* The database must have crashed after
+			inserting a clustered index record but before
+			writing all the externally stored columns of
+			that record.  Because secondary index entries
+			are inserted after the clustered index record,
+			we may assume that the secondary index record
+			does not exist.  However, this situation may
+			only occur during the rollback of incomplete
+			transactions. */
+			ut_a(trx_is_recv(node->trx));
+		} else {
+			log_free_check();
+
+			err = row_undo_ins_remove_sec(node->index, entry);
+
+			if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+				goto func_exit;
+			}
+		}
+
+		mem_heap_empty(heap);
+		dict_table_next_uncorrupted_index(node->index);
+	}
+
+func_exit:
+	mem_heap_free(heap);
+	return(err);
+}
+
 /***********************************************************//**
 Undoes a fresh insert of a row to a table. A fresh insert means that
 the same clustered index unique key did not have any record, even delete
@@ -311,12 +374,17 @@ row_undo_ins(
 /*=========*/
 	undo_node_t*	node)	/*!< in: row undo node */
 {
+	ulint		err;
+	ibool		dict_locked;
+
 	ut_ad(node);
 	ut_ad(node->state == UNDO_NODE_INSERT);
 
-	row_undo_ins_parse_undo_rec(node);
+	dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH;
+
+	row_undo_ins_parse_undo_rec(node, dict_locked);
 
-	if (!node->table || !row_undo_search_clust_to_pcur(node)) {
+	if (node->table == NULL) {
 		trx_undo_rec_release(node->trx, node->undo_no);
 
 		return(DB_SUCCESS);
@@ -330,36 +398,20 @@ row_undo_ins(
 
 	dict_table_skip_corrupt_index(node->index);
 
-	while (node->index != NULL) {
-		dtuple_t*	entry;
-		ulint		err;
+	err = row_undo_ins_remove_sec_rec(node);
 
-		entry = row_build_index_entry(node->row, node->ext,
-					      node->index, node->heap);
-		if (UNIV_UNLIKELY(!entry)) {
-			/* The database must have crashed after
-			inserting a clustered index record but before
-			writing all the externally stored columns of
-			that record.  Because secondary index entries
-			are inserted after the clustered index record,
-			we may assume that the secondary index record
-			does not exist.  However, this situation may
-			only occur during the rollback of incomplete
-			transactions. */
-			ut_a(trx_is_recv(node->trx));
-		} else {
-			log_free_check();
-			err = row_undo_ins_remove_sec(node->index, entry);
+	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+		goto func_exit;
+	}
 
-			if (err != DB_SUCCESS) {
+	log_free_check();
 
-				return(err);
-			}
-		}
+	err = row_undo_ins_remove_clust_rec(node);
 
-		dict_table_next_uncorrupted_index(node->index);
-	}
+func_exit:
+	dict_table_close(node->table, dict_locked);
 
-	log_free_check();
-	return(row_undo_ins_remove_clust_rec(node));
+	node->table = NULL;
+
+	return(err);
 }
diff --git a/storage/innobase/row/row0umod.c b/storage/innobase/row/row0umod.cc
index 9597c476125..c9b42265553 100644
--- a/storage/innobase/row/row0umod.c
+++ b/storage/innobase/row/row0umod.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0umod.c
+@file row/row0umod.cc
 Undo modify of a row
 
 Created 2/27/1997 Heikki Tuuri
@@ -420,7 +420,7 @@ Delete marks or removes a secondary index entry if found.
 NOTE that if we updated the fields of a delete-marked secondary index record
 so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot
 return to the original values because we do not know them. But this should
-not cause problems because in row0sel.c, in queries we always retrieve the
+not cause problems because in row0sel.cc, in queries we always retrieve the
 clustered index record or an earlier version of it, if the secondary index
 record through which we do the search is delete-marked.
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
@@ -566,26 +566,24 @@ row_undo_mod_upd_del_sec(
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	mem_heap_t*	heap;
-	dtuple_t*	entry;
-	dict_index_t*	index;
 	ulint		err	= DB_SUCCESS;
 
 	ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+
 	heap = mem_heap_create(1024);
 
 	while (node->index != NULL) {
+		dict_index_t*	index	= node->index;
+		dtuple_t*	entry;
 
-		/* Skip all corrupted secondary index */
-		dict_table_skip_corrupt_index(node->index);
-
-		if (!node->index) {
-			break;
+		if (index->type & DICT_FTS) {
+			dict_table_next_uncorrupted_index(node->index);
+			continue;
 		}
 
-		index = node->index;
+		entry = row_build_index_entry(
+			node->row, node->ext, index, heap);
 
-		entry = row_build_index_entry(node->row, node->ext,
-					      index, heap);
 		if (UNIV_UNLIKELY(!entry)) {
 			/* The database must have crashed after
 			inserting a clustered index record but before
@@ -601,15 +599,14 @@ row_undo_mod_upd_del_sec(
 			err = row_undo_mod_del_mark_or_remove_sec(
 				node, thr, index, entry);
 
-			if (err != DB_SUCCESS) {
+			if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
 
 				break;
 			}
 		}
 
 		mem_heap_empty(heap);
-
-		node->index = dict_table_get_next_index(node->index);
+		dict_table_next_uncorrupted_index(node->index);
 	}
 
 	mem_heap_free(heap);
@@ -628,25 +625,24 @@ row_undo_mod_del_mark_sec(
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	mem_heap_t*	heap;
-	dtuple_t*	entry;
-	dict_index_t*	index;
-	ulint		err;
+	ulint		err	= DB_SUCCESS;
 
 	heap = mem_heap_create(1024);
 
 	while (node->index != NULL) {
-		/* Skip all corrupted secondary index */
-		dict_table_skip_corrupt_index(node->index);
+		dict_index_t*	index	= node->index;
+		dtuple_t*	entry;
 
-		if (!node->index) {
-			break;
+		if (index->type == DICT_FTS) {
+			dict_table_next_uncorrupted_index(node->index);
+			continue;
 		}
 
-		index = node->index;
+		entry = row_build_index_entry(
+			node->row, node->ext, index, heap);
 
-		entry = row_build_index_entry(node->row, node->ext,
-					      index, heap);
 		ut_a(entry);
+
 		err = row_undo_mod_del_unmark_sec_and_undo_update(
 			BTR_MODIFY_LEAF, thr, index, entry);
 		if (err == DB_FAIL) {
@@ -654,19 +650,18 @@ row_undo_mod_del_mark_sec(
 				BTR_MODIFY_TREE, thr, index, entry);
 		}
 
-		if (err != DB_SUCCESS) {
+		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
 
-			mem_heap_free(heap);
-
-			return(err);
+			break;
 		}
 
-		node->index = dict_table_get_next_index(node->index);
+		mem_heap_empty(heap);
+		dict_table_next_uncorrupted_index(node->index);
 	}
 
 	mem_heap_free(heap);
 
-	return(DB_SUCCESS);
+	return(err);
 }
 
 /***********************************************************//**
@@ -680,116 +675,106 @@ row_undo_mod_upd_exist_sec(
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	mem_heap_t*	heap;
-	dtuple_t*	entry;
-	dict_index_t*	index;
-	ulint		err;
+	ulint		err	= DB_SUCCESS;
 
-	if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+	if (node->index == NULL
+	    || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
 		/* No change in secondary indexes */
 
-		return(DB_SUCCESS);
+		return(err);
 	}
 
 	heap = mem_heap_create(1024);
 
 	while (node->index != NULL) {
-		/* Skip all corrupted secondary index */
-		dict_table_skip_corrupt_index(node->index);
-
-		if (!node->index) {
-			break;
+		dict_index_t*	index	= node->index;
+		dtuple_t*	entry;
+
+		if (index->type == DICT_FTS
+		    || !row_upd_changes_ord_field_binary(
+			index, node->update, thr, node->row, node->ext)) {
+			dict_table_next_uncorrupted_index(node->index);
+			continue;
 		}
 
-		index = node->index;
-
-		if (row_upd_changes_ord_field_binary(node->index, node->update,
-						     thr,
-						     node->row, node->ext)) {
-
-			/* Build the newest version of the index entry */
-			entry = row_build_index_entry(node->row, node->ext,
-						      index, heap);
-			if (UNIV_UNLIKELY(!entry)) {
-				/* The server must have crashed in
-				row_upd_clust_rec_by_insert() before
-				the updated externally stored columns (BLOBs)
-				of the new clustered index entry were
-				written. */
-
-				/* The table must be in DYNAMIC or COMPRESSED
-				format.  REDUNDANT and COMPACT formats
-				store a local 768-byte prefix of each
-				externally stored column. */
-				ut_a(dict_table_get_format(index->table)
-				     >= DICT_TF_FORMAT_ZIP);
-
-				/* This is only legitimate when
-				rolling back an incomplete transaction
-				after crash recovery. */
-				ut_a(thr_get_trx(thr)->is_recovered);
-
-				/* The server must have crashed before
-				completing the insert of the new
-				clustered index entry and before
-				inserting to the secondary indexes.
-				Because node->row was not yet written
-				to this index, we can ignore it.  But
-				we must restore node->undo_row. */
-			} else {
-				/* NOTE that if we updated the fields of a
-				delete-marked secondary index record so that
-				alphabetically they stayed the same, e.g.,
-				'abc' -> 'aBc', we cannot return to the
-				original values because we do not know them.
-				But this should not cause problems because
-				in row0sel.c, in queries we always retrieve
-				the clustered index record or an earlier
-				version of it, if the secondary index record
-				through which we do the search is
-				delete-marked. */
-
-				err = row_undo_mod_del_mark_or_remove_sec(
-					node, thr, index, entry);
-				if (err != DB_SUCCESS) {
-					mem_heap_free(heap);
-
-					return(err);
-				}
-
-				mem_heap_empty(heap);
+		/* Build the newest version of the index entry */
+		entry = row_build_index_entry(node->row, node->ext,
+					      index, heap);
+		if (UNIV_UNLIKELY(!entry)) {
+			/* The server must have crashed in
+			row_upd_clust_rec_by_insert() before
+			the updated externally stored columns (BLOBs)
+			of the new clustered index entry were written. */
+
+			/* The table must be in DYNAMIC or COMPRESSED
+			format.  REDUNDANT and COMPACT formats
+			store a local 768-byte prefix of each
+			externally stored column. */
+			ut_a(dict_table_get_format(index->table)
+			     >= UNIV_FORMAT_B);
+
+			/* This is only legitimate when
+			rolling back an incomplete transaction
+			after crash recovery. */
+			ut_a(thr_get_trx(thr)->is_recovered);
+
+			/* The server must have crashed before
+			completing the insert of the new
+			clustered index entry and before
+			inserting to the secondary indexes.
+			Because node->row was not yet written
+			to this index, we can ignore it.  But
+			we must restore node->undo_row. */
+		} else {
+			/* NOTE that if we updated the fields of a
+			delete-marked secondary index record so that
+			alphabetically they stayed the same, e.g.,
+			'abc' -> 'aBc', we cannot return to the
+			original values because we do not know them.
+			But this should not cause problems because
+			in row0sel.cc, in queries we always retrieve
+			the clustered index record or an earlier
+			version of it, if the secondary index record
+			through which we do the search is
+			delete-marked. */
+
+			err = row_undo_mod_del_mark_or_remove_sec(
+				node, thr, index, entry);
+			if (err != DB_SUCCESS) {
+				break;
 			}
+		}
 
-			/* We may have to update the delete mark in the
-			secondary index record of the previous version of
-			the row. We also need to update the fields of
-			the secondary index record if we updated its fields
-			but alphabetically they stayed the same, e.g.,
-			'abc' -> 'aBc'. */
-			entry = row_build_index_entry(node->undo_row,
-						      node->undo_ext,
-						      index, heap);
-			ut_a(entry);
+		mem_heap_empty(heap);
+		/* We may have to update the delete mark in the
+		secondary index record of the previous version of
+		the row. We also need to update the fields of
+		the secondary index record if we updated its fields
+		but alphabetically they stayed the same, e.g.,
+		'abc' -> 'aBc'. */
+		entry = row_build_index_entry(node->undo_row,
+					      node->undo_ext,
+					      index, heap);
+		ut_a(entry);
 
+		err = row_undo_mod_del_unmark_sec_and_undo_update(
+			BTR_MODIFY_LEAF, thr, index, entry);
+		if (err == DB_FAIL) {
 			err = row_undo_mod_del_unmark_sec_and_undo_update(
-				BTR_MODIFY_LEAF, thr, index, entry);
-			if (err == DB_FAIL) {
-				err = row_undo_mod_del_unmark_sec_and_undo_update(
-					BTR_MODIFY_TREE, thr, index, entry);
-			}
-
-			if (err != DB_SUCCESS) {
-				mem_heap_free(heap);
+				BTR_MODIFY_TREE, thr, index, entry);
+		}
 
-				return(err);
-			}
+		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+			break;
 		}
 
-		node->index = dict_table_get_next_index(node->index);
+		mem_heap_empty(heap);
+		dict_table_next_uncorrupted_index(node->index);
 	}
 
 	mem_heap_free(heap);
 
-	return(DB_SUCCESS);
+	return(err);
 }
 
 /***********************************************************//**
@@ -798,8 +783,9 @@ static
 void
 row_undo_mod_parse_undo_rec(
 /*========================*/
-	undo_node_t*	node,	/*!< in: row undo node */
-	que_thr_t*	thr)	/*!< in: query thread */
+	undo_node_t*	node,		/*!< in: row undo node */
+	que_thr_t*	thr,		/*!< in: query thread */
+	ibool		dict_locked)	/*!< in: TRUE if own dict_sys->mutex */
 {
 	dict_index_t*	clust_index;
 	byte*		ptr;
@@ -819,7 +805,7 @@ row_undo_mod_parse_undo_rec(
 				    &dummy_extern, &undo_no, &table_id);
 	node->rec_type = type;
 
-	node->table = dict_table_get_on_id(table_id, trx);
+	node->table = dict_table_open_on_id(table_id, dict_locked);
 
 	/* TODO: other fixes associated with DROP TABLE + rollback in the
 	same table by another user */
@@ -830,6 +816,8 @@ row_undo_mod_parse_undo_rec(
 	}
 
 	if (node->table->ibd_file_missing) {
+		dict_table_close(node->table, dict_locked);
+
 		/* We skip undo operations to missing .ibd files */
 		node->table = NULL;
 
@@ -850,6 +838,13 @@ row_undo_mod_parse_undo_rec(
 	node->new_roll_ptr = roll_ptr;
 	node->new_trx_id = trx_id;
 	node->cmpl_info = cmpl_info;
+
+	if (!row_undo_search_clust_to_pcur(node)) {
+
+		dict_table_close(node->table, dict_locked);
+
+		node->table = NULL;
+	}
 }
 
 /***********************************************************//**
@@ -862,14 +857,17 @@ row_undo_mod(
 	undo_node_t*	node,	/*!< in: row undo node */
 	que_thr_t*	thr)	/*!< in: query thread */
 {
-	ulint	err;
+	ulint		err;
+	ibool		dict_locked;
 
 	ut_ad(node && thr);
 	ut_ad(node->state == UNDO_NODE_MODIFY);
 
-	row_undo_mod_parse_undo_rec(node, thr);
+	dict_locked = thr_get_trx(thr)->dict_operation_lock_mode == RW_X_LATCH;
+
+	row_undo_mod_parse_undo_rec(node, thr, dict_locked);
 
-	if (!node->table || !row_undo_search_clust_to_pcur(node)) {
+	if (node->table == NULL) {
 		/* It is already undone, or will be undone by another query
 		thread, or table was dropped */
 
@@ -885,24 +883,29 @@ row_undo_mod(
 	/* Skip all corrupted secondary index */
 	dict_table_skip_corrupt_index(node->index);
 
-	if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
-
+	switch (node->rec_type) {
+	case TRX_UNDO_UPD_EXIST_REC:
 		err = row_undo_mod_upd_exist_sec(node, thr);
-
-	} else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
-
+		break;
+	case TRX_UNDO_DEL_MARK_REC:
 		err = row_undo_mod_del_mark_sec(node, thr);
-	} else {
-		ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+		break;
+	case TRX_UNDO_UPD_DEL_REC:
 		err = row_undo_mod_upd_del_sec(node, thr);
+		break;
+	default:
+		ut_error;
+		err = DB_ERROR;
 	}
 
-	if (err != DB_SUCCESS) {
+	if (err == DB_SUCCESS) {
 
-		return(err);
+		err = row_undo_mod_clust(node, thr);
 	}
 
-	err = row_undo_mod_clust(node, thr);
+	dict_table_close(node->table, dict_locked);
+
+	node->table = NULL;
 
 	return(err);
 }
diff --git a/storage/innobase/row/row0undo.c b/storage/innobase/row/row0undo.cc
index 09970b7fe21..6519e10334d 100644
--- a/storage/innobase/row/row0undo.c
+++ b/storage/innobase/row/row0undo.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0undo.c
+@file row/row0undo.cc
 Row undo
 
 Created 1/8/1997 Heikki Tuuri
@@ -135,7 +135,8 @@ row_undo_node_create(
 
 	ut_ad(trx && parent && heap);
 
-	undo = mem_heap_alloc(heap, sizeof(undo_node_t));
+	undo = static_cast<undo_node_t*>(
+		mem_heap_alloc(heap, sizeof(undo_node_t)));
 
 	undo->common.type = QUE_NODE_UNDO;
 	undo->common.parent = parent;
@@ -200,7 +201,7 @@ row_undo_search_clust_to_pcur(
 	} else {
 		row_ext_t**	ext;
 
-		if (dict_table_get_format(node->table) >= DICT_TF_FORMAT_ZIP) {
+		if (dict_table_get_format(node->table) >= UNIV_FORMAT_B) {
 			/* In DYNAMIC or COMPRESSED format, there is
 			no prefix of externally stored columns in the
 			clustered index record. Build a cache of
@@ -356,17 +357,17 @@ row_undo_step(
 
 	ut_ad(thr);
 
-	srv_activity_count++;
+	srv_inc_activity_count();
 
 	trx = thr_get_trx(thr);
 
-	node = thr->run_node;
+	node = static_cast<undo_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
 
 	err = row_undo(node, thr);
 
-	trx->error_state = err;
+	trx->error_state = static_cast<enum db_err>(err);
 
 	if (err != DB_SUCCESS) {
 		/* SQL error detected */
diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.cc
index 4f5096a162b..28faa59add8 100644
--- a/storage/innobase/row/row0upd.c
+++ b/storage/innobase/row/row0upd.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0upd.c
+@file row/row0upd.cc
 Update of a row
 
 Created 12/27/1996 Heikki Tuuri
@@ -239,18 +239,18 @@ row_upd_check_references_constraints(
 				entry, index, node->update,
 				foreign->n_fields))) {
 
+			dict_table_t*	ref_table = NULL;
+
 			if (foreign->foreign_table == NULL) {
-				dict_table_get(foreign->foreign_table_name_lookup,
-					       FALSE);
+
+				ref_table = dict_table_open_on_name(
+					foreign->foreign_table_name_lookup, FALSE);
 			}
 
 			if (foreign->foreign_table) {
-				mutex_enter(&(dict_sys->mutex));
-
-				(foreign->foreign_table
-				 ->n_foreign_key_checks_running)++;
-
-				mutex_exit(&(dict_sys->mutex));
+				os_inc_counter(dict_sys->mutex,
+					       foreign->foreign_table
+					       ->n_foreign_key_checks_running);
 			}
 
 			/* NOTE that if the thread ends up waiting for a lock
@@ -262,15 +262,13 @@ row_upd_check_references_constraints(
 				FALSE, foreign, table, entry, thr);
 
 			if (foreign->foreign_table) {
-				mutex_enter(&(dict_sys->mutex));
-
-				ut_a(foreign->foreign_table
-				     ->n_foreign_key_checks_running > 0);
-
-				(foreign->foreign_table
-				 ->n_foreign_key_checks_running)--;
+				os_dec_counter(dict_sys->mutex,
+					       foreign->foreign_table
+					       ->n_foreign_key_checks_running);
+			}
 
-				mutex_exit(&(dict_sys->mutex));
+			if (ref_table != NULL) {
+				dict_table_close(ref_table, FALSE);
 			}
 
 			if (err != DB_SUCCESS) {
@@ -305,7 +303,9 @@ upd_node_create(
 {
 	upd_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(upd_node_t));
+	node = static_cast<upd_node_t*>(
+		mem_heap_alloc(heap, sizeof(upd_node_t)));
+
 	node->common.type = QUE_NODE_UPDATE;
 
 	node->state = UPD_NODE_UPDATE_CLUSTERED;
@@ -349,7 +349,7 @@ row_upd_rec_sys_fields_in_recovery(
 {
 	ut_ad(rec_offs_validate(rec, NULL, offsets));
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_write_trx_id_and_roll_ptr(
 			page_zip, rec, offsets, pos, trx_id, roll_ptr);
 	} else {
@@ -390,7 +390,7 @@ row_upd_index_entry_sys_field(
 	pos = dict_index_get_sys_col_pos(index, type);
 
 	dfield = dtuple_get_nth_field(entry, pos);
-	field = dfield_get_data(dfield);
+	field = static_cast<byte*>(dfield_get_data(dfield));
 
 	if (type == DATA_TRX_ID) {
 		trx_write_trx_id(field, val);
@@ -545,7 +545,7 @@ row_upd_rec_in_place(
 #endif /* UNIV_BLOB_DEBUG */
 	}
 
-	if (UNIV_LIKELY_NULL(page_zip)) {
+	if (page_zip) {
 		page_zip_write_rec(page_zip, rec, index, offsets, 0);
 	}
 }
@@ -674,9 +674,11 @@ row_upd_index_write_log(
 			} else {
 				mlog_close(mtr, log_ptr);
 
-				mlog_catenate_string(mtr,
-						     dfield_get_data(new_val),
-						     len);
+				mlog_catenate_string(
+					mtr,
+					static_cast<byte*>(
+						dfield_get_data(new_val)),
+					len);
 
 				log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
 				buf_end = log_ptr + MLOG_BUF_MARGIN;
@@ -815,7 +817,7 @@ row_upd_build_sec_rec_difference_binary(
 		and also in the case where we have a column prefix index
 		and the last characters in the index field are spaces; the
 		latter case probably caused the assertion failures reported at
-		row0upd.c line 713 in versions 4.0.14 - 4.0.16. */
+		row0upd.cc line 713 in versions 4.0.14 - 4.0.16. */
 
 		/* NOTE: we compare the fields as binary strings!
 		(No collation) */
@@ -893,8 +895,8 @@ row_upd_build_difference_binary(
 			goto skip_compare;
 		}
 
-		if (UNIV_UNLIKELY(!dfield_is_ext(dfield)
-				  != !rec_offs_nth_extern(offsets, i))
+		if (!dfield_is_ext(dfield)
+		    != !rec_offs_nth_extern(offsets, i)
 		    || !dfield_data_is_binary_equal(dfield, len, data)) {
 
 			upd_field = upd_get_nth_field(update, n_diff);
@@ -934,11 +936,11 @@ row_upd_ext_fetch(
 					out: fetched length of the prefix */
 	mem_heap_t*	heap)		/*!< in: heap where to allocate */
 {
-	byte*	buf = mem_heap_alloc(heap, *len);
+	byte*	buf = static_cast<byte*>(mem_heap_alloc(heap, *len));
+
+	*len = btr_copy_externally_stored_field_prefix(
+		buf, *len, zip_size, data, local_len);
 
-	*len = btr_copy_externally_stored_field_prefix(buf, *len,
-						       zip_size,
-						       data, local_len);
 	/* We should never update records containing a half-deleted BLOB. */
 	ut_a(*len);
 
@@ -972,7 +974,7 @@ row_upd_index_replace_new_col_val(
 	}
 
 	len = dfield_get_len(dfield);
-	data = dfield_get_data(dfield);
+	data = static_cast<const byte*>(dfield_get_data(dfield));
 
 	if (field->prefix_len > 0) {
 		ibool		fetch_ext = dfield_is_ext(dfield)
@@ -1023,10 +1025,12 @@ row_upd_index_replace_new_col_val(
 		stored part of the column.  The data
 		will have to be copied. */
 		ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
-		buf = mem_heap_alloc(heap, uf->orig_len);
+		buf = static_cast<byte*>(mem_heap_alloc(heap, uf->orig_len));
+
 		/* Copy the locally stored prefix. */
 		memcpy(buf, data,
 		       uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE);
+
 		/* Copy the BLOB pointer. */
 		memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE,
 		       data + len - BTR_EXTERN_FIELD_REF_SIZE,
@@ -1173,7 +1177,9 @@ row_upd_replace(
 	table = index->table;
 	ut_ad(n_cols == dict_table_get_n_cols(table));
 
-	ext_cols = mem_heap_alloc(heap, n_cols * sizeof *ext_cols);
+	ext_cols = static_cast<ulint*>(
+		mem_heap_alloc(heap, n_cols * sizeof *ext_cols));
+
 	n_ext_cols = 0;
 
 	dtuple_set_info_bits(row, update->info_bits);
@@ -1294,7 +1300,7 @@ row_upd_changes_ord_field_binary_func(
 		if (UNIV_LIKELY(ind_field->prefix_len == 0)
 		    || dfield_is_null(dfield)) {
 			/* do nothing special */
-		} else if (UNIV_LIKELY_NULL(ext)) {
+		} else if (ext) {
 			/* Silence a compiler warning without
 			silencing a Valgrind error. */
 			dfield_len = 0;
@@ -1325,7 +1331,8 @@ row_upd_changes_ord_field_binary_func(
 			dfield_len -= BTR_EXTERN_FIELD_REF_SIZE;
 			ut_a(dict_index_is_clust(index)
 			     || ind_field->prefix_len <= dfield_len);
-			buf = dfield_get_data(dfield);
+
+			buf = static_cast<byte*>(dfield_get_data(dfield));
 copy_dfield:
 			ut_a(dfield_len > 0);
 			dfield_copy(&dfield_ext, dfield);
@@ -1378,6 +1385,107 @@ row_upd_changes_some_index_ord_field_binary(
 }
 
 /***********************************************************//**
+Checks if an FTS Doc ID column is affected by an UPDATE.
+@return TRUE if the Doc ID column is changed */
+UNIV_INTERN
+ulint
+row_upd_changes_doc_id(
+/*===================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	upd_field)	/*!< in: field to check */
+{
+	ulint		col_no;
+	dict_index_t*	clust_index;
+	fts_t*		fts = table->fts;
+
+	clust_index = dict_table_get_first_index(table);
+
+	/* Convert from index-specific column number to table-global
+	column number. */
+	col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no);
+
+	return(col_no == fts->doc_col);
+}
+/***********************************************************//**
+Checks if an FTS indexed column is affected by an UPDATE.
+@return offset within fts_t::indexes if FTS indexed column updated else
+ULINT_UNDEFINED */
+UNIV_INTERN
+ulint
+row_upd_changes_fts_column(
+/*=======================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_field_t*	upd_field)	/*!< in: field to check */
+{
+	ulint		col_no;
+	dict_index_t*	clust_index;
+	fts_t*		fts = table->fts;
+
+	clust_index = dict_table_get_first_index(table);
+
+	/* Convert from index-specific column number to table-global
+	column number. */
+	col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no);
+
+	return(dict_table_is_fts_column(fts->indexes, col_no));
+}
+
+/***********************************************************//**
+Checks if an update vector changes the table's FTS-indexed columns.
+NOTE: must not be called for tables which do not have an FTS-index.
+Also, the vector returned must be explicitly freed as it's allocated
+using the ut_malloc() allocator.
+@return vector of FTS indexes that were affected by the update */
+UNIV_INTERN
+ib_vector_t*
+row_upd_changes_fts_columns(
+/*========================*/
+	dict_table_t*	table,		/*!< in: table */
+	upd_t*		update)		/*!< in: update vector for the row */
+{
+	ulint		i;
+	ulint		offset;
+	fts_t*		fts = table->fts;
+	ib_vector_t*	updated_fts_indexes = NULL;
+
+	for (i = 0; i < upd_get_n_fields(update); ++i) {
+		upd_field_t*	upd_field = upd_get_nth_field(update, i);
+
+		offset = row_upd_changes_fts_column(table, upd_field);
+
+		if (offset != ULINT_UNDEFINED) {
+
+			dict_index_t*	index;
+
+			/* TODO: Investigate if we can check whether the
+			existing set of affected indexes matches the new
+			affected set. If matched then we don't need to
+			do the extra malloc()/free(). */
+
+			/* This vector is created from the ut_malloc()
+			allocator because we only want to keep one instance
+			around not matter how many times this row is
+			updated. The old entry should be deleted when
+			we update the FTS row info with this new vector. */
+			if (updated_fts_indexes == NULL) {
+				ib_alloc_t*	ut_alloc;
+
+				ut_alloc = ib_ut_allocator_create();
+
+				updated_fts_indexes = ib_vector_create(
+					ut_alloc, sizeof(dict_index_t*), 2);
+			}
+
+			index = static_cast<dict_index_t*>(
+				ib_vector_getp(fts->indexes, offset));
+			ib_vector_push(updated_fts_indexes, &index);
+		}
+	}
+
+	return(updated_fts_indexes);
+}
+
+/***********************************************************//**
 Checks if an update vector changes some of the first ordering fields of an
 index record. This is only used in foreign key checks and we can assume
 that index does not contain column prefixes.
@@ -1511,7 +1619,7 @@ row_upd_store_row(
 	offsets = rec_get_offsets(rec, clust_index, offsets_,
 				  ULINT_UNDEFINED, &heap);
 
-	if (dict_table_get_format(node->table) >= DICT_TF_FORMAT_ZIP) {
+	if (dict_table_get_format(node->table) >= UNIV_FORMAT_B) {
 		/* In DYNAMIC or COMPRESSED format, there is no prefix
 		of externally stored columns in the clustered index
 		record. Build a cache of column prefixes. */
@@ -1613,9 +1721,7 @@ row_upd_sec_index_entry(
 		      "InnoDB: record ", stderr);
 		rec_print(stderr, rec, index);
 		putc('\n', stderr);
-
 		trx_print(stderr, trx, 0);
-
 		fputs("\n"
 		      "InnoDB: Submit a detailed bug report"
 		      " to http://bugs.mysql.com\n", stderr);
@@ -1764,7 +1870,9 @@ row_upd_clust_rec_by_insert_inherit_func(
 		len = dfield_get_len(dfield);
 		ut_a(len != UNIV_SQL_NULL);
 		ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
-		data = dfield_get_data(dfield);
+
+		data = static_cast<byte*>(dfield_get_data(dfield));
+
 		data += len - BTR_EXTERN_FIELD_REF_SIZE;
 		/* The pointer must not be zero. */
 		ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
@@ -2246,8 +2354,7 @@ exit_func:
 
 	if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
 
-		err = row_upd_clust_rec(node, index, thr, mtr);
-		return(err);
+		return(row_upd_clust_rec(node, index, thr, mtr));
 	}
 
 	row_upd_store_row(node);
@@ -2322,26 +2429,27 @@ row_upd(
 		}
 	}
 
-	if (node->state == UPD_NODE_UPDATE_CLUSTERED
-	    || node->state == UPD_NODE_INSERT_CLUSTERED
-	    || node->state == UPD_NODE_INSERT_BLOB) {
-
+	switch (node->state) {
+	case UPD_NODE_UPDATE_CLUSTERED:
+	case UPD_NODE_INSERT_CLUSTERED:
+	case UPD_NODE_INSERT_BLOB:
 		log_free_check();
 		err = row_upd_clust_step(node, thr);
 
 		if (err != DB_SUCCESS) {
 
-			goto function_exit;
+			return(err);
 		}
 	}
 
-	if (!node->is_delete && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+	if (node->index == NULL
+	    || (!node->is_delete
+		&& (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) {
 
-		goto function_exit;
+		return(DB_SUCCESS);
 	}
 
-	while (node->index != NULL) {
-
+	do {
 		/* Skip corrupted index */
 		dict_table_skip_corrupt_index(node->index);
 
@@ -2349,32 +2457,33 @@ row_upd(
 			break;
 		}
 
-		log_free_check();
-		err = row_upd_sec_step(node, thr);
+		if (node->index->type != DICT_FTS) {
+			log_free_check();
+			err = row_upd_sec_step(node, thr);
 
-		if (err != DB_SUCCESS) {
+			if (err != DB_SUCCESS) {
 
-			goto function_exit;
+				return(err);
+			}
 		}
 
 		node->index = dict_table_get_next_index(node->index);
-	}
+	} while (node->index != NULL);
 
-function_exit:
-	if (err == DB_SUCCESS) {
-		/* Do some cleanup */
+	ut_ad(err == DB_SUCCESS);
 
-		if (node->row != NULL) {
-			node->row = NULL;
-			node->ext = NULL;
-			node->upd_row = NULL;
-			node->upd_ext = NULL;
-			mem_heap_empty(node->heap);
-		}
+	/* Do some cleanup */
 
-		node->state = UPD_NODE_UPDATE_CLUSTERED;
+	if (node->row != NULL) {
+		node->row = NULL;
+		node->ext = NULL;
+		node->upd_row = NULL;
+		node->upd_ext = NULL;
+		mem_heap_empty(node->heap);
 	}
 
+	node->state = UPD_NODE_UPDATE_CLUSTERED;
+
 	return(err);
 }
 
@@ -2398,9 +2507,9 @@ row_upd_step(
 
 	trx = thr_get_trx(thr);
 
-	trx_start_if_not_started(trx);
+	trx_start_if_not_started_xa(trx);
 
-	node = thr->run_node;
+	node = static_cast<upd_node_t*>(thr->run_node);
 
 	sel_node = node->select;
 
@@ -2470,7 +2579,7 @@ row_upd_step(
 	err = row_upd(node, thr);
 
 error_handling:
-	trx->error_state = err;
+	trx->error_state = static_cast<enum db_err>(err);
 
 	if (err != DB_SUCCESS) {
 		return(NULL);
diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.cc
index 5fd7d082194..98ec4a3a13b 100644
--- a/storage/innobase/row/row0vers.c
+++ b/storage/innobase/row/row0vers.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file row/row0vers.c
+@file row/row0vers.cc
 Row versions
 
 Created 2/6/1997 Heikki Tuuri
@@ -48,91 +48,51 @@ Created 2/6/1997 Heikki Tuuri
 
 /*****************************************************************//**
 Finds out if an active transaction has inserted or modified a secondary
-index record. NOTE: the kernel mutex is temporarily released in this
-function!
-@return NULL if committed, else the active transaction */
-UNIV_INTERN
-trx_t*
-row_vers_impl_x_locked_off_kernel(
-/*==============================*/
-	const rec_t*	rec,	/*!< in: record in a secondary index */
-	dict_index_t*	index,	/*!< in: the secondary index */
-	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+index record.
+@return 0 if committed, else the active transaction id;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active() while holding lock_sys->mutex. */
+UNIV_INLINE
+trx_id_t
+row_vers_impl_x_locked_low(
+/*=======================*/
+	const rec_t*	clust_rec,	/*!< in: clustered index record */
+	dict_index_t*	clust_index,	/*!< in: the clustered index */
+	const rec_t*	rec,		/*!< in: secondary index record */
+	dict_index_t*	index,		/*!< in: the secondary index */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
-	dict_index_t*	clust_index;
-	rec_t*		clust_rec;
-	ulint*		clust_offsets;
-	rec_t*		version;
 	trx_id_t	trx_id;
-	mem_heap_t*	heap;
-	mem_heap_t*	heap2;
-	dtuple_t*	row;
-	dtuple_t*	entry	= NULL; /* assignment to eliminate compiler
-					warning */
-	trx_t*		trx;
-	ulint		rec_del;
-#ifdef UNIV_DEBUG
-	ulint		err;
-#endif /* UNIV_DEBUG */
-	mtr_t		mtr;
+	ibool		corrupt;
 	ulint		comp;
+	ulint		rec_del;
+	const rec_t*	version;
+	rec_t*		prev_version = NULL;
+	ulint*		clust_offsets;
+	mem_heap_t*	heap;
 
-	ut_ad(mutex_own(&kernel_mutex));
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
-
-	mutex_exit(&kernel_mutex);
-
-	mtr_start(&mtr);
-
-	/* Search for the clustered index record: this is a time-consuming
-	operation: therefore we release the kernel mutex; also, the release
-	is required by the latching order convention. The latch on the
-	clustered index locks the top of the stack of versions. We also
-	reserve purge_latch to lock the bottom of the version stack. */
-
-	clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
-				      &clust_index, &mtr);
-	if (!clust_rec) {
-		/* In a rare case it is possible that no clust rec is found
-		for a secondary index record: if in row0umod.c
-		row_undo_mod_remove_clust_low() we have already removed the
-		clust rec, while purge is still cleaning and removing
-		secondary index records associated with earlier versions of
-		the clustered index record. In that case there cannot be
-		any implicit lock on the secondary index record, because
-		an active transaction which has modified the secondary index
-		record has also modified the clustered index record. And in
-		a rollback we always undo the modifications to secondary index
-		records before the clustered index record. */
-
-		mutex_enter(&kernel_mutex);
-		mtr_commit(&mtr);
-
-		return(NULL);
-	}
+	ut_ad(rec_offs_validate(rec, index, offsets));
 
 	heap = mem_heap_create(1024);
-	clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL,
-					ULINT_UNDEFINED, &heap);
-	trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
 
-	mtr_s_lock(&(purge_sys->latch), &mtr);
+	clust_offsets = rec_get_offsets(
+		clust_rec, clust_index, NULL, ULINT_UNDEFINED, &heap);
 
-	mutex_enter(&kernel_mutex);
+	trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
+	corrupt = FALSE;
 
-	trx = NULL;
-	if (!trx_is_active(trx_id)) {
+	if (!trx_rw_is_active(trx_id, &corrupt)) {
 		/* The transaction that modified or inserted clust_rec is no
-		longer active: no implicit lock on rec */
-		goto exit_func;
-	}
-
-	if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index,
-				      clust_offsets, TRUE)) {
-		/* Corruption noticed: try to avoid a crash by returning */
-		goto exit_func;
+		longer active, or it is corrupt: no implicit lock on rec */
+		if (corrupt) {
+			lock_report_trx_id_insanity(
+				trx_id, clust_rec, clust_index, clust_offsets,
+				trx_sys_get_max_trx_id());
+		}
+		mem_heap_free(heap);
+		return(0);
 	}
 
 	comp = page_rec_is_comp(rec);
@@ -140,125 +100,126 @@ row_vers_impl_x_locked_off_kernel(
 	ut_ad(!!comp == dict_table_is_comp(index->table));
 	ut_ad(!comp == !page_rec_is_comp(clust_rec));
 
-	/* We look up if some earlier version, which was modified by the trx_id
-	transaction, of the clustered index record would require rec to be in
-	a different state (delete marked or unmarked, or have different field
-	values, or not existing). If there is such a version, then rec was
-	modified by the trx_id transaction, and it has an implicit x-lock on
-	rec. Note that if clust_rec itself would require rec to be in a
-	different state, then the trx_id transaction has not yet had time to
-	modify rec, and does not necessarily have an implicit x-lock on rec. */
-
 	rec_del = rec_get_deleted_flag(rec, comp);
-	trx = NULL;
-
-	version = clust_rec;
 
-	for (;;) {
-		rec_t*		prev_version;
-		ulint		vers_del;
+	/* We look up if some earlier version, which was modified by
+	the trx_id transaction, of the clustered index record would
+	require rec to be in a different state (delete marked or
+	unmarked, or have different field values, or not existing). If
+	there is such a version, then rec was modified by the trx_id
+	transaction, and it has an implicit x-lock on rec. Note that
+	if clust_rec itself would require rec to be in a different
+	state, then the trx_id transaction has not yet had time to
+	modify rec, and does not necessarily have an implicit x-lock
+	on rec. */
+
+	for (version = clust_rec;; version = prev_version) {
+		ulint		err;
 		row_ext_t*	ext;
+		const dtuple_t*	row;
+		dtuple_t*	entry;
+		ulint		vers_del;
 		trx_id_t	prev_trx_id;
+		mem_heap_t*	old_heap = heap;
 
-		mutex_exit(&kernel_mutex);
+		/* We keep the semaphore in mtr on the clust_rec page, so
+		that no other transaction can update it and get an
+		implicit x-lock on rec until mtr_commit(mtr). */
 
-		/* While we retrieve an earlier version of clust_rec, we
-		release the kernel mutex, because it may take time to access
-		the disk. After the release, we have to check if the trx_id
-		transaction is still active. We keep the semaphore in mtr on
-		the clust_rec page, so that no other transaction can update
-		it and get an implicit x-lock on rec. */
-
-		heap2 = heap;
 		heap = mem_heap_create(1024);
-#ifdef UNIV_DEBUG
-		err =
-#endif /* UNIV_DEBUG */
-		trx_undo_prev_version_build(clust_rec, &mtr, version,
-					    clust_index, clust_offsets,
-					    heap, &prev_version);
-		mem_heap_free(heap2); /* free version and clust_offsets */
 
-		if (prev_version == NULL) {
-			mutex_enter(&kernel_mutex);
+		err = trx_undo_prev_version_build(
+			clust_rec, mtr, version, clust_index, clust_offsets,
+			heap, &prev_version);
 
-			if (!trx_is_active(trx_id)) {
-				/* Transaction no longer active: no
-				implicit x-lock */
+ 		/* Free version and clust_offsets. */
 
-				break;
-			}
+		mem_heap_free(old_heap);
 
-			/* If the transaction is still active,
-			clust_rec must be a fresh insert, because no
-			previous version was found. */
-			ut_ad(err == DB_SUCCESS);
+		if (prev_version == NULL) {
 
-			/* It was a freshly inserted version: there is an
-			implicit x-lock on rec */
+			/* clust_rec must be a fresh insert, because
+			no previous version was found or the transaction
+			has committed. The caller has to recheck as the
+			synopsis of this function states, whether trx_id
+			is active or not. */
 
-			trx = trx_get_on_id(trx_id);
+			ut_a(err == DB_SUCCESS || err == DB_MISSING_HISTORY);
 
 			break;
 		}
 
-		clust_offsets = rec_get_offsets(prev_version, clust_index,
-						NULL, ULINT_UNDEFINED, &heap);
+		clust_offsets = rec_get_offsets(
+			prev_version, clust_index, NULL, ULINT_UNDEFINED,
+			&heap);
 
 		vers_del = rec_get_deleted_flag(prev_version, comp);
-		prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
-						 clust_offsets);
-
-		/* If the trx_id and prev_trx_id are different and if
-		the prev_version is marked deleted then the
-		prev_trx_id must have already committed for the trx_id
-		to be able to modify the row. Therefore, prev_trx_id
-		cannot hold any implicit lock. */
+
+		prev_trx_id = row_get_rec_trx_id(
+			prev_version, clust_index, clust_offsets);
+
+		/* If trx_id differs from prev_trx_id and if the
+		prev_version is marked deleted then the prev_trx_id
+		must have already committed for the trx_id to be able
+		to modify the row. Therefore, prev_trx_id cannot hold
+		any implicit lock. */
+
 		if (vers_del && trx_id != prev_trx_id) {
 
-			mutex_enter(&kernel_mutex);
+			trx_id = 0;
 			break;
 		}
 
 		/* The stack of versions is locked by mtr.  Thus, it
 		is safe to fetch the prefixes for externally stored
 		columns. */
+
 		row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
 				clust_offsets, NULL, &ext, heap);
+
 		entry = row_build_index_entry(row, ext, index, heap);
+
 		/* entry may be NULL if a record was inserted in place
 		of a deleted record, and the BLOB pointers of the new
 		record were not initialized yet.  But in that case,
 		prev_version should be NULL. */
-		ut_a(entry);
 
-		mutex_enter(&kernel_mutex);
-
-		if (!trx_is_active(trx_id)) {
-			/* Transaction no longer active: no implicit x-lock */
-
-			break;
-		}
+		ut_a(entry != NULL);
 
-		/* If we get here, we know that the trx_id transaction is
-		still active and it has modified prev_version. Let us check
-		if prev_version would require rec to be in a different
-		state. */
+		/* If we get here, we know that the trx_id transaction
+		modified prev_version. Let us check if prev_version
+		would require rec to be in a different state. */
 
 		/* The previous version of clust_rec must be
-		accessible, because the transaction is still active
-		and clust_rec was not a fresh insert. */
+		accessible, because clust_rec was not a fresh insert.
+		There is no guarantee that the transaction is still
+		active. */
+
 		ut_ad(err == DB_SUCCESS);
 
 		/* We check if entry and rec are identified in the alphabetical
 		ordering */
-		if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
+
+		if (!trx_rw_is_active(trx_id, &corrupt)) {
+			/* Transaction no longer active: no implicit
+			x-lock. This situation should only be possible
+			because we are not holding lock_sys->mutex. */
+			ut_ad(!lock_mutex_own());
+			if (corrupt) {
+				lock_report_trx_id_insanity(
+					trx_id,
+					prev_version, clust_index,
+					clust_offsets,
+					trx_sys_get_max_trx_id());
+			}
+			trx_id = 0;
+			break;
+		} else if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
 			/* The delete marks of rec and prev_version should be
 			equal for rec to be in the state required by
 			prev_version */
 
 			if (rec_del != vers_del) {
-				trx = trx_get_on_id(trx_id);
 
 				break;
 			}
@@ -268,38 +229,91 @@ row_vers_impl_x_locked_off_kernel(
 			alphabetical ordering, but the field values changed
 			still. For example, 'abc' -> 'ABC'. Check also that. */
 
-			dtuple_set_types_binary(entry,
-						dtuple_get_n_fields(entry));
-			if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
+			dtuple_set_types_binary(
+				entry, dtuple_get_n_fields(entry));
 
-				trx = trx_get_on_id(trx_id);
+			if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
 
 				break;
 			}
+
 		} else if (!rec_del) {
 			/* The delete mark should be set in rec for it to be
 			in the state required by prev_version */
 
-			trx = trx_get_on_id(trx_id);
-
 			break;
 		}
 
 		if (trx_id != prev_trx_id) {
-			/* The versions modified by the trx_id transaction end
-			to prev_version: no implicit x-lock */
+			/* prev_version was the first version modified by
+			the trx_id transaction: no implicit x-lock */
 
+			trx_id = 0;
 			break;
 		}
+	}
 
-		version = prev_version;
-	}/* for (;;) */
+	mem_heap_free(heap);
+	return(trx_id);
+}
+
+/*****************************************************************//**
+Finds out if an active transaction has inserted or modified a secondary
+index record.
+@return 0 if committed, else the active transaction id;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active() while holding lock_sys->mutex. */
+UNIV_INTERN
+trx_id_t
+row_vers_impl_x_locked(
+/*===================*/
+	const rec_t*	rec,	/*!< in: record in a secondary index */
+	dict_index_t*	index,	/*!< in: the secondary index */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	dict_index_t*	clust_index;
+	const rec_t*	clust_rec;
+	trx_id_t	trx_id;
+	mtr_t		mtr;
+
+	ut_ad(!lock_mutex_own());
+	ut_ad(!mutex_own(&trx_sys->mutex));
+
+	mtr_start(&mtr);
+
+	/* Search for the clustered index record. The latch on the
+	page of clust_rec locks the top of the stack of versions. The
+	bottom of the version stack is not locked; oldest versions may
+	disappear by the fact that transactions may be committed and
+	collected by the purge. This is not a problem, because we are
+	only interested in active transactions. */
+
+	clust_rec = row_get_clust_rec(
+		BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr);
+
+	if (UNIV_UNLIKELY(!clust_rec)) {
+		/* In a rare case it is possible that no clust rec is found
+		for a secondary index record: if in row0umod.cc
+		row_undo_mod_remove_clust_low() we have already removed the
+		clust rec, while purge is still cleaning and removing
+		secondary index records associated with earlier versions of
+		the clustered index record. In that case there cannot be
+		any implicit lock on the secondary index record, because
+		an active transaction which has modified the secondary index
+		record has also modified the clustered index record. And in
+		a rollback we always undo the modifications to secondary index
+		records before the clustered index record. */
+
+		trx_id = 0;
+	} else {
+		trx_id = row_vers_impl_x_locked_low(
+			clust_rec, clust_index, rec, index, offsets, &mtr);
+	}
 
-exit_func:
 	mtr_commit(&mtr);
-	mem_heap_free(heap);
 
-	return(trx);
+	return(trx_id);
 }
 
 /*****************************************************************//**
@@ -321,15 +335,7 @@ row_vers_must_preserve_del_marked(
 
 	mtr_s_lock(&(purge_sys->latch), mtr);
 
-	if (trx_purge_update_undo_must_exist(trx_id)) {
-
-		/* A purge operation is not yet allowed to remove this
-		delete marked record */
-
-		return(TRUE);
-	}
-
-	return(FALSE);
+	return(!read_view_sees_trx_id(purge_sys->view, trx_id));
 }
 
 /*****************************************************************//**
@@ -369,7 +375,6 @@ row_vers_old_has_index_entry(
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
 #endif /* UNIV_SYNC_DEBUG */
-	mtr_s_lock(&(purge_sys->latch), mtr);
 
 	clust_index = dict_table_get_first_index(index->table);
 
@@ -382,7 +387,11 @@ row_vers_old_has_index_entry(
 	if (also_curr && !rec_get_deleted_flag(rec, comp)) {
 		row_ext_t*	ext;
 
-		/* The stack of versions is locked by mtr.
+		/* The top of the stack of versions is locked by the
+		mtr holding a latch on the page containing the
+		clustered index record. The bottom of the stack is
+		locked by the fact that the purge_sys->view must
+		'overtake' any read view of an active transaction.
 		Thus, it is safe to fetch the prefixes for
 		externally stored columns. */
 		row = row_build(ROW_COPY_POINTERS, clust_index,
@@ -522,7 +531,6 @@ row_vers_build_for_consistent_read(
 
 	ut_ad(!read_view_sees_trx_id(view, trx_id));
 
-	rw_lock_s_lock(&(purge_sys->latch));
 	version = rec;
 
 	for (;;) {
@@ -550,13 +558,14 @@ row_vers_build_for_consistent_read(
 				/* The view already sees this version: we can
 				copy it to in_heap and return */
 
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 				ut_a(!rec_offs_any_null_extern(
 					     version, *offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+				buf = static_cast<byte*>(mem_heap_alloc(
+					in_heap, rec_offs_size(*offsets)));
 
-				buf = mem_heap_alloc(in_heap,
-						     rec_offs_size(*offsets));
 				*old_vers = rec_copy(buf, version, *offsets);
 				rec_offs_make_valid(*old_vers, index,
 						    *offsets);
@@ -588,9 +597,9 @@ row_vers_build_for_consistent_read(
 		*offsets = rec_get_offsets(prev_version, index, *offsets,
 					   ULINT_UNDEFINED, offset_heap);
 
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 		ut_a(!rec_offs_any_null_extern(prev_version, *offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 		trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
 
@@ -599,7 +608,10 @@ row_vers_build_for_consistent_read(
 			/* The view already sees this version: we can copy
 			it to in_heap and return */
 
-			buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+			buf = static_cast<byte*>(
+				mem_heap_alloc(
+					in_heap, rec_offs_size(*offsets)));
+
 			*old_vers = rec_copy(buf, prev_version, *offsets);
 			rec_offs_make_valid(*old_vers, index, *offsets);
 			err = DB_SUCCESS;
@@ -611,7 +623,6 @@ row_vers_build_for_consistent_read(
 	}/* for (;;) */
 
 	mem_heap_free(heap);
-	rw_lock_s_unlock(&(purge_sys->latch));
 
 	return(err);
 }
@@ -657,16 +668,10 @@ row_vers_build_for_semi_consistent_read(
 
 	ut_ad(rec_offs_validate(rec, index, *offsets));
 
-	rw_lock_s_lock(&(purge_sys->latch));
-	/* The S-latch on purge_sys prevents the purge view from
-	changing.  Thus, if we have an uncommitted transaction at
-	this point, then purge cannot remove its undo log even if
-	the transaction could commit now. */
-
 	version = rec;
 
 	for (;;) {
-		trx_t*		version_trx;
+		const trx_t*	version_trx;
 		mem_heap_t*	heap2;
 		rec_t*		prev_version;
 		trx_id_t	version_trx_id;
@@ -676,24 +681,27 @@ row_vers_build_for_semi_consistent_read(
 			rec_trx_id = version_trx_id;
 		}
 
-		mutex_enter(&kernel_mutex);
-		version_trx = trx_get_on_id(version_trx_id);
+		mutex_enter(&trx_sys->mutex);
+		version_trx = trx_get_rw_trx_by_id(version_trx_id);
+		/* Because version_trx is a read-write transaction,
+		its state cannot change from or to NOT_STARTED while
+		we are holding the trx_sys->mutex.  It may change from
+		ACTIVE to PREPARED or COMMITTED. */
 		if (version_trx
-		    && (version_trx->conc_state == TRX_COMMITTED_IN_MEMORY
-			|| version_trx->conc_state == TRX_NOT_STARTED)) {
-
+		    && trx_state_eq(version_trx,
+				    TRX_STATE_COMMITTED_IN_MEMORY)) {
 			version_trx = NULL;
 		}
-		mutex_exit(&kernel_mutex);
+		mutex_exit(&trx_sys->mutex);
 
 		if (!version_trx) {
 
 			/* We found a version that belongs to a
 			committed transaction: return it. */
 
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 			ut_a(!rec_offs_any_null_extern(version, *offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 			if (rec == version) {
 				*old_vers = rec;
@@ -702,7 +710,7 @@ row_vers_build_for_semi_consistent_read(
 			}
 
 			/* We assume that a rolled-back transaction stays in
-			TRX_ACTIVE state until all the changes have been
+			TRX_STATE_ACTIVE state until all the changes have been
 			rolled back and the transaction is removed from
 			the global list of transactions. */
 
@@ -719,7 +727,10 @@ row_vers_build_for_semi_consistent_read(
 							   offset_heap);
 			}
 
-			buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+			buf = static_cast<byte*>(
+				mem_heap_alloc(
+					in_heap, rec_offs_size(*offsets)));
+
 			*old_vers = rec_copy(buf, version, *offsets);
 			rec_offs_make_valid(*old_vers, index, *offsets);
 			err = DB_SUCCESS;
@@ -752,15 +763,14 @@ row_vers_build_for_semi_consistent_read(
 		version = prev_version;
 		*offsets = rec_get_offsets(version, index, *offsets,
 					   ULINT_UNDEFINED, offset_heap);
-#ifdef UNIV_BLOB_NULL_DEBUG
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 		ut_a(!rec_offs_any_null_extern(version, *offsets));
-#endif /* UNIV_BLOB_NULL_DEBUG */
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 	}/* for (;;) */
 
 	if (heap) {
 		mem_heap_free(heap);
 	}
-	rw_lock_s_unlock(&(purge_sys->latch));
 
 	return(err);
 }
diff --git a/storage/innobase/srv/srv0conc.cc b/storage/innobase/srv/srv0conc.cc
new file mode 100644
index 00000000000..d5c949f3a06
--- /dev/null
+++ b/storage/innobase/srv/srv0conc.cc
@@ -0,0 +1,606 @@
+/*****************************************************************************
+
+Copyright (c) 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0conc.cc
+
+InnoDB concurrency manager
+
+Created 2011/04/18 Sunny Bains
+*******************************************************/
+
+#include "srv0srv.h"
+#include "sync0sync.h"
+#include "trx0trx.h"
+#include "ha_prototypes.h"
+
+#include "mysql/plugin.h"
+
+/** Number of times a thread is allowed to enter InnoDB within the same
+SQL query after it has once got the ticket. */
+UNIV_INTERN ulong	srv_n_free_tickets_to_enter = 500;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+/** Maximum sleep delay (in micro-seconds), value of 0 disables it. */
+UNIV_INTERN ulong	srv_adaptive_max_sleep_delay = 150000;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+UNIV_INTERN ulong	srv_thread_sleep_delay	= 10000;
+
+
+/** We are prepared for a situation that we have this many threads waiting for
+a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
+value. */
+
+UNIV_INTERN ulint	srv_max_n_threads	= 0;
+
+/** The following controls how many threads we let inside InnoDB concurrently:
+threads waiting for locks are not counted into the number because otherwise
+we could get a deadlock. Value of 0 will disable the concurrency check. */
+
+UNIV_INTERN ulong	srv_thread_concurrency	= 0;
+
+#ifndef HAVE_ATOMIC_BUILTINS
+
+/** This mutex protects srv_conc data structures */
+static os_fast_mutex_t	srv_conc_mutex;
+
+/** Slot for a thread waiting in the concurrency control queue. */
+typedef struct srv_conc_slot_struct	srv_conc_slot_t;
+
+/** Concurrency list node */
+typedef UT_LIST_NODE_T(srv_conc_slot_t)	srv_conc_node_t;
+
+struct srv_conc_slot_struct{
+	os_event_t	event;		/*!< event to wait */
+	ibool		reserved;	/*!< TRUE if slot
+					reserved */
+	ibool		wait_ended;	/*!< TRUE when another thread has
+					already set the event and the thread
+					in this slot is free to proceed; but
+					reserved may still be TRUE at that
+					point */
+	srv_conc_node_t	srv_conc_queue;	/*!< queue node */
+};
+
+/** Queue of threads waiting to get in */
+typedef UT_LIST_BASE_NODE_T(srv_conc_slot_t)	srv_conc_queue_t;
+
+static srv_conc_queue_t	srv_conc_queue;
+
+/** Array of wait slots */
+static srv_conc_slot_t*	srv_conc_slots;
+
+#if defined(UNIV_PFS_MUTEX)
+/* Key to register srv_conc_mutex_key with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_conc_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+typedef struct srv_conc_struct srv_conc_t;
+
+/** Variables tracking the active and waiting threads. */
+struct srv_conc_struct {
+	char		pad[64  - (sizeof(ulint) + sizeof(lint))];
+
+	/** Number of transactions that have declared_to_be_inside_innodb set.
+	It used to be a non-error for this value to drop below zero temporarily.
+	This is no longer true. We'll, however, keep the lint datatype to add
+	assertions to catch any corner cases that we may have missed. */
+
+	volatile lint	n_active;
+
+	/** Number of OS threads waiting in the FIFO for permission to
+	enter InnoDB */
+	volatile lint	n_waiting;
+};
+
+/* Control variables for tracking concurrency. */
+static srv_conc_t	srv_conc;
+
+/*********************************************************************//**
+Initialise the concurrency management data structures */
+void
+srv_conc_init(void)
+/*===============*/
+{
+#ifndef HAVE_ATOMIC_BUILTINS
+	ulint		i;
+
+	/* Init the server concurrency restriction data structures */
+
+	os_fast_mutex_init(srv_conc_mutex_key, &srv_conc_mutex);
+
+	UT_LIST_INIT(srv_conc_queue);
+
+	srv_conc_slots = static_cast<srv_conc_slot_t*>(
+		mem_zalloc(OS_THREAD_MAX_N * sizeof(*srv_conc_slots)));
+
+	for (i = 0; i < OS_THREAD_MAX_N; i++) {
+		srv_conc_slot_t*	conc_slot = &srv_conc_slots[i];
+
+		conc_slot->event = os_event_create(NULL);
+		ut_a(conc_slot->event);
+	}
+#endif /* !HAVE_ATOMIC_BUILTINS */
+}
+
+/*********************************************************************//**
+Free the concurrency management data structures */
+void
+srv_conc_free(void)
+/*===============*/
+{
+#ifndef HAVE_ATOMIC_BUILTINS
+	os_fast_mutex_free(&srv_conc_mutex);
+	mem_free(srv_conc_slots);
+	srv_conc_slots = NULL;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+}
+
+#ifdef HAVE_ATOMIC_BUILTINS
+/*********************************************************************//**
+Note that a user thread is entering InnoDB. */
+static
+void
+srv_enter_innodb_with_tickets(
+/*==========================*/
+	trx_t*	trx)			/*!< in/out: transaction that wants
+					to enter InnoDB */
+{
+	trx->declared_to_be_inside_innodb = TRUE;
+	trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter;
+}
+
+/*********************************************************************//**
+Handle the scheduling of a user thread that wants to enter InnoDB.  Setting
+srv_adaptive_max_sleep_delay > 0 switches the adaptive sleep calibration to
+ON. When set, we want to wait in the queue for as little time as possible.
+However, very short waits will result in a lot of context switches and that
+is also not desirable. When threads need to sleep multiple times we increment
+os_thread_sleep_delay by one. When we see threads getting a slot without
+waiting and there are no other threads waiting in the queue, we try and reduce
+the wait as much as we can. Currently we reduce it by half each time. If the
+thread only had to wait for one turn before it was able to enter InnoDB we
+decrement it by one. This is to try and keep the sleep time stable around the
+"optimum" sleep time. */
+static
+void
+srv_conc_enter_innodb_with_atomics(
+/*===============================*/
+	trx_t*	trx)			/*!< in/out: transaction that wants
+					to enter InnoDB */
+{
+	ulint	n_sleeps = 0;
+	ibool	notified_mysql = FALSE;
+
+	ut_a(!trx->declared_to_be_inside_innodb);
+
+	for (;;) {
+		ulint	sleep_in_us;
+
+		if (srv_conc.n_active < (lint) srv_thread_concurrency) {
+			ulint	n_active;
+
+			/* Check if there are any free tickets. */
+			n_active = os_atomic_increment_lint(
+				&srv_conc.n_active, 1);
+
+			if (n_active <= srv_thread_concurrency) {
+
+				srv_enter_innodb_with_tickets(trx);
+
+				if (notified_mysql) {
+
+					(void) os_atomic_decrement_lint(
+						&srv_conc.n_waiting, 1);
+
+					thd_wait_end(
+						static_cast<THD*>(
+							trx->mysql_thd));
+				}
+
+				if (srv_adaptive_max_sleep_delay > 0) {
+					if (srv_thread_sleep_delay > 20
+					    && n_sleeps == 1) {
+
+						--srv_thread_sleep_delay;
+					}
+
+					if (srv_conc.n_waiting == 0) {
+						srv_thread_sleep_delay >>= 1;
+					}
+				}
+
+				return;
+			}
+
+			/* Since there were no free seats, we relinquish
+			the overbooked ticket. */
+
+			(void) os_atomic_decrement_lint(
+				&srv_conc.n_active, 1);
+		}
+
+		if (!notified_mysql) {
+			(void) os_atomic_increment_lint(
+				&srv_conc.n_waiting, 1);
+
+			/* Release possible search system latch this
+			thread has */
+
+			if (trx->has_search_latch) {
+				trx_search_latch_release_if_reserved(trx);
+			}
+
+			thd_wait_begin(
+				static_cast<THD*>(trx->mysql_thd),
+				THD_WAIT_USER_LOCK);
+
+			notified_mysql = TRUE;
+		}
+
+		trx->op_info = "sleeping before entering InnoDB";
+
+		sleep_in_us = srv_thread_sleep_delay;
+
+		/* Guard against overflow when adaptive sleep delay is on. */
+
+		if (srv_adaptive_max_sleep_delay > 0
+		    && sleep_in_us > srv_adaptive_max_sleep_delay) {
+
+			sleep_in_us = srv_adaptive_max_sleep_delay;
+			srv_thread_sleep_delay = sleep_in_us;
+		}
+
+		os_thread_sleep(sleep_in_us);
+
+		trx->op_info = "";
+
+		++n_sleeps;
+
+		if (srv_adaptive_max_sleep_delay > 0 && n_sleeps > 1) {
+			++srv_thread_sleep_delay;
+		}
+	}
+}
+
+/*********************************************************************//**
+Note that a user thread is leaving InnoDB code. */
+static
+void
+srv_conc_exit_innodb_with_atomics(
+/*==============================*/
+	trx_t*	trx)		/*!< in/out: transaction */
+{
+	trx->n_tickets_to_enter_innodb = 0;
+	trx->declared_to_be_inside_innodb = FALSE;
+
+	(void) os_atomic_decrement_lint(&srv_conc.n_active, 1);
+}
+#else
+/*********************************************************************//**
+Note that a user thread is leaving InnoDB code. */
+static
+void
+srv_conc_exit_innodb_without_atomics(
+/*=================================*/
+	trx_t*	trx)		/*!< in/out: transaction */
+{
+	srv_conc_slot_t*	slot;
+
+	os_fast_mutex_lock(&srv_conc_mutex);
+
+	ut_ad(srv_conc.n_active > 0);
+	srv_conc.n_active--;
+	trx->declared_to_be_inside_innodb = FALSE;
+	trx->n_tickets_to_enter_innodb = 0;
+
+	slot = NULL;
+
+	if (srv_conc.n_active < (lint) srv_thread_concurrency) {
+		/* Look for a slot where a thread is waiting and no other
+		thread has yet released the thread */
+
+		for (slot = UT_LIST_GET_FIRST(srv_conc_queue);
+		     slot != NULL && slot->wait_ended == TRUE;
+		     slot = UT_LIST_GET_NEXT(srv_conc_queue, slot)) {
+
+			/* No op */
+		}
+
+		if (slot != NULL) {
+			slot->wait_ended = TRUE;
+
+			/* We increment the count on behalf of the released
+			thread */
+
+			srv_conc.n_active++;
+		}
+	}
+
+	os_fast_mutex_unlock(&srv_conc_mutex);
+
+	if (slot != NULL) {
+		os_event_set(slot->event);
+	}
+}
+
+/*********************************************************************//**
+Handle the scheduling of a user thread that wants to enter InnoDB. */
+static
+void
+srv_conc_enter_innodb_without_atomics(
+/*==================================*/
+	trx_t*	trx)			/*!< in/out: transaction that wants
+					to enter InnoDB */
+{
+	ulint			i;
+	srv_conc_slot_t*	slot = NULL;
+	ibool			has_slept = FALSE;
+
+	os_fast_mutex_lock(&srv_conc_mutex);
+retry:
+	if (UNIV_UNLIKELY(trx->declared_to_be_inside_innodb)) {
+		os_fast_mutex_unlock(&srv_conc_mutex);
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: trying to declare trx"
+		      " to enter InnoDB, but\n"
+		      "InnoDB: it already is declared.\n", stderr);
+		trx_print(stderr, trx, 0);
+		putc('\n', stderr);
+		return;
+	}
+
+	ut_ad(srv_conc.n_active >= 0);
+
+	if (srv_conc.n_active < (lint) srv_thread_concurrency) {
+
+		srv_conc.n_active++;
+		trx->declared_to_be_inside_innodb = TRUE;
+		trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter;
+
+		os_fast_mutex_unlock(&srv_conc_mutex);
+
+		return;
+	}
+
+	/* If the transaction is not holding resources, let it sleep
+	for srv_thread_sleep_delay microseconds, and try again then */
+
+	if (!has_slept && !trx->has_search_latch
+	    && NULL == UT_LIST_GET_FIRST(trx->lock.trx_locks)) {
+
+		has_slept = TRUE; /* We let it sleep only once to avoid
+				starvation */
+
+		srv_conc.n_waiting++;
+
+		os_fast_mutex_unlock(&srv_conc_mutex);
+
+		trx->op_info = "sleeping before joining InnoDB queue";
+
+		/* Peter Zaitsev suggested that we take the sleep away
+		altogether. But the sleep may be good in pathological
+		situations of lots of thread switches. Simply put some
+		threads aside for a while to reduce the number of thread
+		switches. */
+		if (srv_thread_sleep_delay > 0) {
+			os_thread_sleep(srv_thread_sleep_delay);
+		}
+
+		trx->op_info = "";
+
+		os_fast_mutex_lock(&srv_conc_mutex);
+
+		srv_conc.n_waiting--;
+
+		goto retry;
+	}
+
+	/* Too many threads inside: put the current thread to a queue */
+
+	for (i = 0; i < OS_THREAD_MAX_N; i++) {
+		slot = srv_conc_slots + i;
+
+		if (!slot->reserved) {
+
+			break;
+		}
+	}
+
+	if (i == OS_THREAD_MAX_N) {
+		/* Could not find a free wait slot, we must let the
+		thread enter */
+
+		srv_conc.n_active++;
+		trx->declared_to_be_inside_innodb = TRUE;
+		trx->n_tickets_to_enter_innodb = 0;
+
+		os_fast_mutex_unlock(&srv_conc_mutex);
+
+		return;
+	}
+
+	/* Release possible search system latch this thread has */
+	if (trx->has_search_latch) {
+		trx_search_latch_release_if_reserved(trx);
+	}
+
+	/* Add to the queue */
+	slot->reserved = TRUE;
+	slot->wait_ended = FALSE;
+
+	UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot);
+
+	os_event_reset(slot->event);
+
+	srv_conc.n_waiting++;
+
+	os_fast_mutex_unlock(&srv_conc_mutex);
+
+	/* Go to wait for the event; when a thread leaves InnoDB it will
+	release this thread */
+
+	ut_ad(!trx->has_search_latch);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+	trx->op_info = "waiting in InnoDB queue";
+
+	thd_wait_begin(static_cast<THD*>(trx->mysql_thd), THD_WAIT_USER_LOCK);
+
+	os_event_wait(slot->event);
+	thd_wait_end(static_cast<THD*>(trx->mysql_thd));
+
+	trx->op_info = "";
+
+	os_fast_mutex_lock(&srv_conc_mutex);
+
+	srv_conc.n_waiting--;
+
+	/* NOTE that the thread which released this thread already
+	incremented the thread counter on behalf of this thread */
+
+	slot->reserved = FALSE;
+
+	UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot);
+
+	trx->declared_to_be_inside_innodb = TRUE;
+	trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter;
+
+	os_fast_mutex_unlock(&srv_conc_mutex);
+}
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+/*********************************************************************//**
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+	trx_t*	trx)	/*!< in: transaction object associated with the
+			thread */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	srv_conc_enter_innodb_with_atomics(trx);
+#else
+	srv_conc_enter_innodb_without_atomics(trx);
+#endif /* HAVE_ATOMIC_BUILTINS */
+}
+
+/*********************************************************************//**
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+	trx_t*	trx)	/*!< in: transaction object associated with the
+			thread */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (!srv_thread_concurrency) {
+
+		return;
+	}
+
+	ut_ad(srv_conc.n_active >= 0);
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	(void) os_atomic_increment_lint(&srv_conc.n_active, 1);
+#else
+	os_fast_mutex_lock(&srv_conc_mutex);
+	++srv_conc.n_active;
+	os_fast_mutex_unlock(&srv_conc_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+	trx->n_tickets_to_enter_innodb = 1;
+	trx->declared_to_be_inside_innodb = TRUE;
+}
+
+/*********************************************************************//**
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. */
+UNIV_INTERN
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+	trx_t*	trx)	/*!< in: transaction object associated with the
+			thread */
+{
+	if ((trx->mysql_thd != NULL
+	     && thd_is_replication_slave_thread(trx->mysql_thd))
+	    || trx->declared_to_be_inside_innodb == FALSE) {
+
+		return;
+	}
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	srv_conc_exit_innodb_with_atomics(trx);
+#else
+	srv_conc_exit_innodb_without_atomics(trx);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+}
+
+/*********************************************************************//**
+Get the count of threads waiting inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_waiting_threads(void)
+/*==============================*/
+{
+	return(srv_conc.n_waiting);
+}
+
+/*********************************************************************//**
+Get the count of threads active inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_active_threads(void)
+/*==============================*/
+{
+	return(srv_conc.n_active);
+ }
+
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
new file mode 100644
index 00000000000..9c6e56bcb9d
--- /dev/null
+++ b/storage/innobase/srv/srv0mon.cc
@@ -0,0 +1,1833 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0mon.cc
+Database monitor counter interfaces
+
+Created 12/9/2009 Jimmy Yang
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "os0file.h"
+#include "mach0data.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "buf0buf.h"
+#include "trx0sys.h"
+#include "trx0rseg.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#include "btr0cur.h"
+#ifdef UNIV_NONINL
+#include "srv0mon.ic"
+#endif
+
+/* Macro to standardize the counter names for counters in the
+"monitor_buf_page" module as they have very structured defines */
+#define	MONITOR_BUF_PAGE(name, description, code, op, op_code)	\
+	{"buffer_page_"op"_"name, "buffer_page_io",		\
+	 "Number of "description" Pages "op,			\
+	 MONITOR_GROUP_MODULE, MONITOR_DEFAULT_START,		\
+	 MONITOR_##code##_##op_code}
+
+#define MONITOR_BUF_PAGE_READ(name, description, code)		\
+	 MONITOR_BUF_PAGE(name, description, code, "read", PAGE_READ)
+
+#define MONITOR_BUF_PAGE_WRITTEN(name, description, code)	\
+	 MONITOR_BUF_PAGE(name, description, code, "written", PAGE_WRITTEN)
+
+
+/** This array defines basic static information of monitor counters,
+including each monitor's name, module it belongs to, a short
+description and its property/type and corresponding monitor_id.
+Please note: If you add a monitor here, please add its corresponding
+monitor_id to "enum monitor_id_value" structure in srv0mon.h file. */
+
+static monitor_info_t	innodb_counter_info[] =
+{
+	/* A dummy item to mark the module start, this is
+	to accomodate the default value (0) set for the
+	global variables with the control system. */
+	{"module_start", "module_start", "module_start",
+	MONITOR_MODULE,
+	MONITOR_DEFAULT_START, MONITOR_DEFAULT_START},
+
+	/* ========== Counters for Server Metadata ========== */
+	{"module_metadata", "metadata", "Server Metadata",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_METADATA},
+
+	{"metadata_table_handles_opened", "metadata",
+	 "Number of table handles opened",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLE_OPEN},
+
+	{"metadata_table_handles_closed", "metadata",
+	 "Number of table handles closed",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLE_CLOSE},
+
+	{"metadata_table_reference_count", "metadata",
+	 "Table reference counter",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLE_REFERENCE},
+
+	{"metadata_mem_pool_size", "metadata",
+	 "Size of a memory pool InnoDB uses to store data dictionary"
+	 " and internal data structures in bytes",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_META_MEM_POOL},
+
+	/* ========== Counters for Lock Module ========== */
+	{"module_lock", "lock", "Lock Module",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_LOCK},
+
+	{"lock_deadlocks", "lock", "Number of deadlocks",
+	 MONITOR_DEFAULT_ON,
+	 MONITOR_DEFAULT_START, MONITOR_DEADLOCK},
+
+	{"lock_timeouts", "lock", "Number of lock timeouts",
+	 MONITOR_DEFAULT_ON,
+	 MONITOR_DEFAULT_START, MONITOR_TIMEOUT},
+
+	{"lock_rec_lock_waits", "lock",
+	 "Number of times enqueued into record lock wait queue",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_LOCKREC_WAIT},
+
+	{"lock_table_lock_waits", "lock",
+	 "Number of times enqueued into table lock wait queue",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLELOCK_WAIT},
+
+	{"lock_rec_lock_requests", "lock",
+	 "Number of record locks requested",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_RECLOCK_REQ},
+
+	{"lock_rec_lock_created", "lock", "Number of record locks created",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_RECLOCK_CREATED},
+
+	{"lock_rec_lock_removed", "lock",
+	 "Number of record locks removed from the lock queue",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_RECLOCK_REMOVED},
+
+	{"lock_rec_locks", "lock",
+	 "Current number of record locks on tables",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_RECLOCK},
+
+	{"lock_table_lock_created", "lock", "Number of table locks created",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLELOCK_CREATED},
+
+	{"lock_table_lock_removed", "lock",
+	 "Number of table locks removed from the lock queue",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TABLELOCK_REMOVED},
+
+	{"lock_table_locks", "lock",
+	 "Current number of table locks on tables",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_TABLELOCK},
+
+	{"lock_row_lock_current_waits", "lock",
+	 "Number of row locks currently being waited for"
+	 " (innodb_row_lock_current_waits)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT},
+
+	{"lock_row_lock_time", "lock",
+	 "Time spent in acquiring row locks, in milliseconds"
+	 " (innodb_row_lock_time)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_WAIT_TIME},
+
+	{"lock_row_lock_time_max", "lock",
+	 "The maximum time to acquire a row lock, in milliseconds"
+	 " (innodb_row_lock_time_max)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_MAX_WAIT_TIME},
+
+	{"lock_row_lock_waits", "lock",
+	 "Number of times a row lock had to be waited for"
+	 " (innodb_row_lock_waits)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_ROW_LOCK_WAIT},
+
+	{"lock_row_lock_time_avg", "lock",
+	 "The average time to acquire a row lock, in milliseconds"
+	 " (innodb_row_lock_time_avg)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_AVG_WAIT_TIME},
+
+	/* ========== Counters for Buffer Manager and I/O ========== */
+	{"module_buffer", "buffer", "Buffer Manager Module",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_BUFFER},
+
+	{"buffer_pool_size", "server",
+	 "Server buffer pool size (all buffer pools) in bytes",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUFFER_POOL_SIZE},
+
+	{"buffer_pool_reads", "buffer",
+	 "Number of reads directly from disk (innodb_buffer_pool_reads)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READS},
+
+	{"buffer_pool_read_requests", "buffer",
+	 "Number of logical read requests (innodb_buffer_pool_read_requests)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_REQUESTS},
+
+	{"buffer_pool_write_requests", "buffer",
+	 "Number of write requests (innodb_buffer_pool_write_requests)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_WRITE_REQUEST},
+
+	{"buffer_pool_pages_in_flush", "buffer",
+	 "Number of pages in flush list",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PAGE_INFLUSH},
+
+	{"buffer_pool_wait_free", "buffer",
+	 "Number of times waited for free buffer"
+	 " (innodb_buffer_pool_wait_free)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_WAIT_FREE},
+
+	{"buffer_pool_read_ahead", "buffer",
+	 "Number of pages read as read ahead (innodb_buffer_pool_read_ahead)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_AHEAD},
+
+	{"buffer_pool_read_ahead_evicted", "buffer",
+	 "Read-ahead pages evicted without being accessed"
+	 " (innodb_buffer_pool_read_ahead_evicted)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED},
+
+	{"buffer_pool_pages_total", "buffer",
+	 "Total buffer pool size in pages (innodb_buffer_pool_pages_total)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGE_TOTAL},
+
+	{"buffer_pool_pages_misc", "buffer",
+	 "Buffer pages for misc use such as row locks or the adaptive"
+	 " hash index (innodb_buffer_pool_pages_misc)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGE_MISC},
+
+	{"buffer_pool_pages_data", "buffer",
+	 "Buffer pages containing data (innodb_buffer_pool_pages_data)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DATA},
+
+	{"buffer_pool_pages_dirty", "buffer",
+	 "Buffer pages currently dirty (innodb_buffer_pool_pages_dirty)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DIRTY},
+
+	{"buffer_pool_pages_free", "buffer",
+	 "Buffer pages currently free (innodb_buffer_pool_pages_free)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_FREE},
+
+	{"buffer_pages_created", "buffer",
+	 "Number of pages created (innodb_pages_created)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_CREATED},
+
+	{"buffer_pages_written", "buffer",
+	 "Number of pages written (innodb_pages_written)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN},
+
+	{"buffer_pages_read", "buffer",
+	 "Number of pages read (innodb_pages_read)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ},
+
+	{"buffer_data_reads", "buffer",
+	 "Amount of data read in bytes (innodb_data_reads)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BYTE_READ},
+
+	{"buffer_data_written", "buffer",
+	 "Amount of data written in bytes (innodb_data_written)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BYTE_WRITTEN},
+
+	/* Cumulative counter for scanning in flush batches */
+	{"buffer_flush_batch_scanned", "buffer",
+	 "Total pages scanned as part of flush batch",
+	 MONITOR_SET_OWNER,
+	 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+	 MONITOR_FLUSH_BATCH_SCANNED},
+
+	{"buffer_flush_batch_num_scan", "buffer",
+	 "Number of times buffer flush list flush is called",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_SCANNED,
+	 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL},
+
+	{"buffer_flush_batch_scanned_per_call", "buffer",
+	 "Pages scanned per flush batch scan",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_SCANNED,
+	 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL},
+
+	/* Cumulative counter for pages flushed in flush batches */
+	{"buffer_flush_batch_total_pages", "buffer",
+	 "Total pages flushed as part of flush batch",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_BATCH_COUNT,
+	 MONITOR_FLUSH_BATCH_TOTAL_PAGE},
+
+	{"buffer_flush_batches", "buffer",
+	 "Number of flush batches",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+	 MONITOR_FLUSH_BATCH_COUNT},
+
+	{"buffer_flush_batch_pages", "buffer",
+	 "Pages queued as a flush batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+	 MONITOR_FLUSH_BATCH_PAGES},
+
+	/* Cumulative counter for flush batches because of neighbor */
+	{"buffer_flush_neighbor_total_pages", "buffer",
+	 "Total neighbors flushed as part of neighbor flush",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_NEIGHBOR_COUNT,
+	 MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE},
+
+	{"buffer_flush_neighbor", "buffer",
+	 "Number of times neighbors flushing is invoked",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+	 MONITOR_FLUSH_NEIGHBOR_COUNT},
+
+	{"buffer_flush_neighbor_pages", "buffer",
+	 "Pages queued as a neighbor batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+	 MONITOR_FLUSH_NEIGHBOR_PAGES},
+
+	/* Cumulative counter for flush batches because of max_dirty */
+	{"buffer_flush_max_dirty_total_pages", "buffer",
+	 "Total pages flushed as part of max_dirty batches",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_MAX_DIRTY_COUNT,
+	 MONITOR_FLUSH_MAX_DIRTY_TOTAL_PAGE},
+
+	{"buffer_flush_max_dirty", "buffer",
+	 "Number of max_dirty batches",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_MAX_DIRTY_TOTAL_PAGE,
+	 MONITOR_FLUSH_MAX_DIRTY_COUNT},
+
+	{"buffer_flush_max_dirty_pages", "buffer",
+	 "Pages queued as a max_dirty batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_MAX_DIRTY_TOTAL_PAGE,
+	 MONITOR_FLUSH_MAX_DIRTY_PAGES},
+
+	/* Cumulative counter for flush batches because of adaptive */
+	{"buffer_flush_adaptive_total_pages", "buffer",
+	 "Total pages flushed as part of adaptive batches",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_ADAPTIVE_COUNT,
+	 MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE},
+
+	{"buffer_flush_adaptive", "buffer",
+	 "Number of adaptive batches",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+	 MONITOR_FLUSH_ADAPTIVE_COUNT},
+
+	{"buffer_flush_adaptive_pages", "buffer",
+	 "Pages queued as an adaptive batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+	 MONITOR_FLUSH_ADAPTIVE_PAGES},
+
+	/* Cumulative counter for flush batches because of async */
+	{"buffer_flush_async_total_pages", "buffer",
+	 "Total pages flushed as part of async batches",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_ASYNC_COUNT,
+	 MONITOR_FLUSH_ASYNC_TOTAL_PAGE},
+
+	{"buffer_flush_async", "buffer",
+	 "Number of async batches",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_ASYNC_TOTAL_PAGE,
+	 MONITOR_FLUSH_ASYNC_COUNT},
+
+	{"buffer_flush_async_pages", "buffer",
+	 "Pages queued as an async batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_ASYNC_TOTAL_PAGE,
+	 MONITOR_FLUSH_ASYNC_PAGES},
+
+	/* Cumulative counter for flush batches because of sync */
+	{"buffer_flush_sync_total_pages", "buffer",
+	 "Total pages flushed as part of sync batches",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_SYNC_COUNT,
+	 MONITOR_FLUSH_SYNC_TOTAL_PAGE},
+
+	{"buffer_flush_sync", "buffer",
+	 "Number of sync batches",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+	 MONITOR_FLUSH_SYNC_COUNT},
+
+	{"buffer_flush_sync_pages", "buffer",
+	 "Pages queued as a sync batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+	 MONITOR_FLUSH_SYNC_PAGES},
+
+	/* Cumulative counter for flush batches because of background */
+	{"buffer_flush_background_total_pages", "buffer",
+	 "Total pages flushed as part of background batches",
+	 MONITOR_SET_OWNER, MONITOR_FLUSH_BACKGROUND_COUNT,
+	 MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE},
+
+	{"buffer_flush_background", "buffer",
+	 "Number of background batches",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+	 MONITOR_FLUSH_BACKGROUND_COUNT},
+
+	{"buffer_flush_background_pages", "buffer",
+	 "Pages queued as a background batch",
+	 MONITOR_SET_MEMBER, MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+	 MONITOR_FLUSH_BACKGROUND_PAGES},
+
+	/* Cumulative counter for LRU batch scan */
+	{"buffer_LRU_batch_scanned", "buffer",
+	 "Total pages scanned as part of LRU batch",
+	 MONITOR_SET_OWNER, MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+	 MONITOR_LRU_BATCH_SCANNED},
+
+	{"buffer_LRU_batch_num_scan", "buffer",
+	 "Number of times LRU batch is called",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_SCANNED,
+	 MONITOR_LRU_BATCH_SCANNED_NUM_CALL},
+
+	{"buffer_LRU_batch_scanned_per_call", "buffer",
+	 "Pages scanned per LRU batch call",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_SCANNED,
+	 MONITOR_LRU_BATCH_SCANNED_PER_CALL},
+
+	/* Cumulative counter for LRU batch pages flushed */
+	{"buffer_LRU_batch_total_pages", "buffer",
+	 "Total pages flushed as part of LRU batches",
+	 MONITOR_SET_OWNER, MONITOR_LRU_BATCH_COUNT,
+	 MONITOR_LRU_BATCH_TOTAL_PAGE},
+
+	{"buffer_LRU_batches", "buffer",
+	 "Number of LRU batches",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE,
+	 MONITOR_LRU_BATCH_COUNT},
+
+	{"buffer_LRU_batch_pages", "buffer",
+	 "Pages queued as an LRU batch",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE,
+	 MONITOR_LRU_BATCH_PAGES},
+
+	/* Cumulative counter for single page LRU scans */
+	{"buffer_LRU_single_flush_scanned", "buffer",
+	 "Total pages scanned as part of single page LRU flush",
+	 MONITOR_SET_OWNER,
+	 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+	 MONITOR_LRU_SINGLE_FLUSH_SCANNED},
+
+	{"buffer_LRU_single_flush_num_scan", "buffer",
+	 "Number of times single page LRU flush is called",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+	 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL},
+
+	{"buffer_LRU_single_flush_scanned_per_call", "buffer",
+	 "Page scanned per single LRU flush",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+	 MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL},
+
+	{"buffer_LRU_single_flush_failure_count", "Buffer",
+	 "Number of times attempt to flush a single page from LRU failed",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT},
+
+	{"buffer_LRU_get_free_search", "Buffer",
+	 "Number of searches performed for a clean page",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_LRU_GET_FREE_SEARCH},
+
+	/* Cumulative counter for LRU search scans */
+	{"buffer_LRU_search_scanned", "buffer",
+	 "Total pages scanned as part of LRU search",
+	 MONITOR_SET_OWNER,
+	 MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+	 MONITOR_LRU_SEARCH_SCANNED},
+
+	{"buffer_LRU_search_num_scan", "buffer",
+	 "Number of times LRU search is performed",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_SEARCH_SCANNED,
+	 MONITOR_LRU_SEARCH_SCANNED_NUM_CALL},
+
+	{"buffer_LRU_search_scanned_per_call", "buffer",
+	 "Page scanned per single LRU search",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_SEARCH_SCANNED,
+	 MONITOR_LRU_SEARCH_SCANNED_PER_CALL},
+
+	/* Cumulative counter for LRU unzip search scans */
+	{"buffer_LRU_unzip_search_scanned", "buffer",
+	 "Total pages scanned as part of LRU unzip search",
+	 MONITOR_SET_OWNER,
+	 MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+	 MONITOR_LRU_UNZIP_SEARCH_SCANNED},
+
+	{"buffer_LRU_unzip_search_num_scan", "buffer",
+	 "Number of times LRU unzip search is performed",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+	 MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL},
+
+	{"buffer_LRU_unzip_search_scanned_per_call", "buffer",
+	 "Page scanned per single LRU unzip search",
+	 MONITOR_SET_MEMBER, MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+	 MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL},
+
+	/* ========== Counters for Buffer Page I/O ========== */
+	{"module_buffer_page", "buffer_page_io", "Buffer Page I/O Module",
+	 static_cast<monitor_type_t>(
+	 MONITOR_MODULE | MONITOR_GROUP_MODULE),
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_BUF_PAGE},
+
+	MONITOR_BUF_PAGE_READ("index_leaf","Index Leaf", INDEX_LEAF),
+
+	MONITOR_BUF_PAGE_READ("index_non_leaf","Index Non-leaf",
+			      INDEX_NON_LEAF),
+
+	MONITOR_BUF_PAGE_READ("index_ibuf_leaf", "Insert Buffer Index Leaf",
+			      INDEX_IBUF_LEAF),
+
+	MONITOR_BUF_PAGE_READ("index_ibuf_non_leaf",
+			      "Insert Buffer Index Non-Leaf",
+			       INDEX_IBUF_NON_LEAF),
+
+	MONITOR_BUF_PAGE_READ("undo_log", "Undo Log", UNDO_LOG),
+
+	MONITOR_BUF_PAGE_READ("index_inode", "Index Inode", INODE),
+
+	MONITOR_BUF_PAGE_READ("ibuf_free_list", "Insert Buffer Free List",
+			      IBUF_FREELIST),
+
+	MONITOR_BUF_PAGE_READ("ibuf_bitmap", "Insert Buffer Bitmap",
+			      IBUF_BITMAP),
+
+	MONITOR_BUF_PAGE_READ("system_page", "System", SYSTEM),
+
+	MONITOR_BUF_PAGE_READ("trx_system", "Transaction System", TRX_SYSTEM),
+
+	MONITOR_BUF_PAGE_READ("fsp_hdr", "File Space Header", FSP_HDR),
+
+	MONITOR_BUF_PAGE_READ("xdes", "Extent Descriptor", XDES),
+
+	MONITOR_BUF_PAGE_READ("blob", "Uncompressed BLOB", BLOB),
+
+	MONITOR_BUF_PAGE_READ("zblob", "First Compressed BLOB", ZBLOB),
+
+	MONITOR_BUF_PAGE_READ("zblob2", "Subsequent Compressed BLOB", ZBLOB2),
+
+	MONITOR_BUF_PAGE_READ("other", "other/unknown (old version of InnoDB)",
+			      OTHER),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_leaf","Index Leaf", INDEX_LEAF),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_non_leaf","Index Non-leaf",
+				 INDEX_NON_LEAF),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_ibuf_leaf", "Insert Buffer Index Leaf",
+				 INDEX_IBUF_LEAF),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_ibuf_non_leaf",
+				 "Insert Buffer Index Non-Leaf",
+				 INDEX_IBUF_NON_LEAF),
+
+	MONITOR_BUF_PAGE_WRITTEN("undo_log", "Undo Log", UNDO_LOG),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_inode", "Index Inode", INODE),
+
+	MONITOR_BUF_PAGE_WRITTEN("ibuf_free_list", "Insert Buffer Free List",
+				 IBUF_FREELIST),
+
+	MONITOR_BUF_PAGE_WRITTEN("ibuf_bitmap", "Insert Buffer Bitmap",
+				 IBUF_BITMAP),
+
+	MONITOR_BUF_PAGE_WRITTEN("system_page", "System", SYSTEM),
+
+	MONITOR_BUF_PAGE_WRITTEN("trx_system", "Transaction System",
+				 TRX_SYSTEM),
+
+	MONITOR_BUF_PAGE_WRITTEN("fsp_hdr", "File Space Header", FSP_HDR),
+
+	MONITOR_BUF_PAGE_WRITTEN("xdes", "Extent Descriptor", XDES),
+
+	MONITOR_BUF_PAGE_WRITTEN("blob", "Uncompressed BLOB", BLOB),
+
+	MONITOR_BUF_PAGE_WRITTEN("zblob", "First Compressed BLOB", ZBLOB),
+
+	MONITOR_BUF_PAGE_WRITTEN("zblob2", "Subsequent Compressed BLOB",
+				 ZBLOB2),
+
+	MONITOR_BUF_PAGE_WRITTEN("other", "other/unknown (old version InnoDB)",
+				 OTHER),
+
+	/* ========== Counters for OS level operations ========== */
+	{"module_os", "os", "OS Level Operation",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_OS},
+
+	{"os_data_reads", "os",
+	 "Number of reads initiated (innodb_data_reads)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FILE_READ},
+
+	{"os_data_writes", "os",
+	 "Number of writes initiated (innodb_data_writes)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FILE_WRITE},
+
+	{"os_data_fsyncs", "os",
+	 "Number of fsync() calls (innodb_data_fsyncs)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FSYNC},
+
+	{"os_pending_reads", "os", "Number of reads pending",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_OS_PENDING_READS},
+
+	{"os_pending_writes", "os", "Number of writes pending",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_OS_PENDING_WRITES},
+
+	{"os_log_bytes_written", "os",
+	 "Bytes of log written (innodb_os_log_written)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_WRITTEN},
+
+	{"os_log_fsyncs", "os",
+	 "Number of fsync log writes (innodb_os_log_fsyncs)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_FSYNC},
+
+	{"os_log_pending_fsyncs", "os",
+	 "Number of pending fsync write (innodb_os_log_pending_fsyncs)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_FSYNC},
+
+	{"os_log_pending_writes", "os",
+	 "Number of pending log file writes (innodb_os_log_pending_writes)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_WRITES},
+
+	/* ========== Counters for Transaction Module ========== */
+	{"module_trx", "transaction", "Transaction Manager",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_TRX},
+
+	{"trx_rw_commits", "transaction", "Number of read-write transactions "
+	  "committed",
+	 MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_RW_COMMIT},
+
+	{"trx_ro_commits", "transaction", "Number of read-only transactions "
+	  "committed",
+	 MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_RO_COMMIT},
+
+	{"trx_nl_ro_commits", "transaction", "Number of non-locking "
+	 "auto-commit read-only transactions committed",
+	 MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_NL_RO_COMMIT},
+
+	{"trx_commits_insert_update", "transaction",
+	 "Number of transactions committed with inserts and updates",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TRX_COMMIT_UNDO},
+
+	{"trx_rollbacks", "transaction",
+	 "Number of transactions rolled back",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK},
+
+	{"trx_rollbacks_savepoint", "transaction",
+	 "Number of transactions rolled back to savepoint",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_SAVEPOINT},
+
+	{"trx_rollback_active", "transaction",
+	 "Number of resurrected active transactions rolled back",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_ACTIVE},
+
+	{"trx_active_transactions", "transaction",
+	 "Number of active transactions",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_TRX_ACTIVE},
+
+	{"trx_rseg_history_len", "transaction",
+	 "Length of the TRX_RSEG_HISTORY list",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_RSEG_HISTORY_LEN},
+
+	{"trx_undo_slots_used", "transaction", "Number of undo slots used",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_UNDO_SLOT_USED},
+
+	{"trx_undo_slots_cached", "transaction",
+	 "Number of undo slots cached",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_UNDO_SLOT_CACHED},
+
+	{"trx_rseg_current_size", "transaction",
+	 "Current rollback segment size in pages",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_RSEG_CUR_SIZE},
+
+	/* ========== Counters for Purge Module ========== */
+	{"module_purge", "purge", "Purge Module",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_PURGE},
+
+	{"purge_del_mark_records", "purge",
+	 "Number of delete-marked rows purged",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_N_DEL_ROW_PURGE},
+
+	{"purge_upd_exist_or_extern_records", "purge",
+	 "Number of purges on updates of existing records and "
+	 " updates on delete marked record with externally stored field",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_N_UPD_EXIST_EXTERN},
+
+	{"purge_invoked", "purge",
+	 "Number of times purge was invoked",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PURGE_INVOKED},
+
+	{"purge_undo_log_pages", "purge",
+	 "Number of undo log pages handled by the purge",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PURGE_N_PAGE_HANDLED},
+
+	{"purge_dml_delay_usec", "purge",
+	 "Microseconds DML to be delayed due to purge lagging",
+	 MONITOR_DISPLAY_CURRENT,
+	 MONITOR_DEFAULT_START, MONITOR_DML_PURGE_DELAY},
+
+	{"purge_stop_count", "purge",
+	 "Number of times purge was stopped",
+	 MONITOR_DISPLAY_CURRENT,
+	 MONITOR_DEFAULT_START, MONITOR_PURGE_STOP_COUNT},
+
+	{"purge_resume_count", "purge",
+	 "Number of times purge was resumed",
+	 MONITOR_DISPLAY_CURRENT,
+	 MONITOR_DEFAULT_START, MONITOR_PURGE_RESUME_COUNT},
+
+	/* ========== Counters for Recovery Module ========== */
+	{"module_log", "recovery", "Recovery Module",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_RECOVERY},
+
+	{"log_checkpoints", "recovery", "Number of checkpoints",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_NUM_CHECKPOINT},
+
+	{"log_lsn_last_flush", "recovery", "LSN of Last flush",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_FLUSHDISK},
+
+	{"log_lsn_last_checkpoint", "recovery", "LSN at last checkpoint",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_CHECKPOINT},
+
+	{"log_lsn_current", "recovery", "Current LSN value",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_CURRENT},
+
+	{"log_lsn_checkpoint_age", "recovery",
+	 "Current LSN value minus LSN at last checkpoint",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_LSN_CHECKPOINT_AGE},
+
+	{"log_lsn_buf_pool_oldest", "recovery",
+	 "The oldest modified block LSN in the buffer pool",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_OLDEST_LSN},
+
+	{"log_max_modified_age_async", "recovery",
+	 "Maximum LSN difference; when exceeded, start asynchronous preflush",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_ASYNC},
+
+	{"log_max_modified_age_sync", "recovery",
+	 "Maximum LSN difference; when exceeded, start synchronous preflush",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_SYNC},
+
+	{"log_pending_log_writes", "recovery", "Pending log writes",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PENDING_LOG_WRITE},
+
+	{"log_pending_checkpoint_writes", "recovery", "Pending checkpoints",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PENDING_CHECKPOINT_WRITE},
+
+	{"log_num_log_io", "recovery", "Number of log I/Os",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_LOG_IO},
+
+	{"log_waits", "recovery",
+	 "Number of log waits due to small log buffer (innodb_log_waits)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WAITS},
+
+	{"log_write_requests", "recovery",
+	 "Number of log write requests (innodb_log_write_requests)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WRITE_REQUEST},
+
+	{"log_writes", "recovery",
+	 "Number of log writes (innodb_log_writes)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WRITES},
+
+	/* ========== Counters for Page Compression ========== */
+	{"module_compress", "compression", "Page Compression Info",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_PAGE},
+
+	{"compress_pages_compressed", "compression",
+	 "Number of pages compressed", MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PAGE_COMPRESS},
+
+	{"compress_pages_decompressed", "compression",
+	 "Number of pages decompressed",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_PAGE_DECOMPRESS},
+
+	/* ========== Counters for Index ========== */
+	{"module_index", "index", "Index Manager",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_INDEX},
+
+	{"index_splits", "index", "Number of index splits",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_INDEX_SPLIT},
+
+	{"index_merges", "index", "Number of index merges",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_INDEX_MERGE},
+
+	/* ========== Counters for Adaptive Hash Index ========== */
+	{"module_adaptive_hash", "adaptive_hash_index", "Adpative Hash Index",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_ADAPTIVE_HASH},
+
+	{"adaptive_hash_searches", "adaptive_hash_index",
+	 "Number of successful searches using Adaptive Hash Index",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH},
+
+	{"adaptive_hash_searches_btree", "adaptive_hash_index",
+	 "Number of searches using B-tree on an index search",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE},
+
+	{"adaptive_hash_pages_added", "adaptive_hash_index",
+	 "Number of index pages on which the Adaptive Hash Index is built",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_PAGE_ADDED},
+
+	{"adaptive_hash_pages_removed", "adaptive_hash_index",
+	 "Number of index pages whose corresponding Adaptive Hash Index"
+	 " entries were removed",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_PAGE_REMOVED},
+
+	{"adaptive_hash_rows_added", "adaptive_hash_index",
+	 "Number of Adaptive Hash Index rows added",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_ADDED},
+
+	{"adaptive_hash_rows_removed", "adaptive_hash_index",
+	 "Number of Adaptive Hash Index rows removed",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_REMOVED},
+
+	{"adaptive_hash_rows_deleted_no_hash_entry", "adaptive_hash_index",
+	 "Number of rows deleted that did not have corresponding Adaptive Hash"
+	 " Index entries",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND},
+
+	{"adaptive_hash_rows_updated", "adaptive_hash_index",
+	 "Number of Adaptive Hash Index rows updated",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_UPDATED},
+
+	/* ========== Counters for tablespace ========== */
+	{"module_file", "file_system", "Tablespace and File System Manager",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_FIL_SYSTEM},
+
+	{"file_num_open_files", "file_system",
+	 "Number of files currently open (innodb_num_open_files)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_N_FILE_OPENED},
+
+	/* ========== Counters for Change Buffer ========== */
+	{"module_ibuf_system", "change_buffer", "InnoDB Change Buffer",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_IBUF_SYSTEM},
+
+	{"ibuf_merges_insert", "change_buffer",
+	 "Number of inserted records merged by change buffering",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_INSERT},
+
+	{"ibuf_merges_delete_mark", "change_buffer",
+	 "Number of deleted records merged by change buffering",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DELETE},
+
+	{"ibuf_merges_delete", "change_buffer",
+	 "Number of purge records merged by change buffering",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_PURGE},
+
+	{"ibuf_merges_discard_insert", "change_buffer",
+	 "Number of insert merged operations discarded",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT},
+
+	{"ibuf_merges_discard_delete_mark", "change_buffer",
+	 "Number of deleted merged operations discarded",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE},
+
+	{"ibuf_merges_discard_delete", "change_buffer",
+	 "Number of purge merged  operations discarded",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE},
+
+	{"ibuf_merges", "change_buffer", "Number of change buffer merges",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGES},
+
+	{"ibuf_size", "change_buffer", "Change buffer size in pages",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_SIZE},
+
+	/* ========== Counters for server operations ========== */
+	{"module_innodb", "innodb",
+	 "Counter for general InnoDB server wide operations and properties",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_SERVER},
+
+	{"innodb_master_thread_sleeps", "server",
+	 "Number of times (seconds) master thread sleeps",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_MASTER_THREAD_SLEEP},
+
+	{"innodb_activity_count", "server", "Current server activity count",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_SERVER_ACTIVITY},
+
+	{"innodb_master_active_loops", "server",
+	 "Number of times master thread performs its tasks when"
+	 " server is active",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_MASTER_ACTIVE_LOOPS},
+
+	{"innodb_master_idle_loops", "server",
+	 "Number of times master thread performs its tasks when server is idle",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_MASTER_IDLE_LOOPS},
+
+	{"innodb_background_drop_table_usec", "server",
+	 "Time (in microseconds) spent to process drop table list",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND},
+
+	{"innodb_ibuf_merge_usec", "server",
+	 "Time (in microseconds) spent to process change buffer merge",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_IBUF_MERGE_MICROSECOND},
+
+	{"innodb_log_flush_usec", "server",
+	 "Time (in microseconds) spent to flush log records",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_LOG_FLUSH_MICROSECOND},
+
+	{"innodb_mem_validate_usec", "server",
+	 "Time (in microseconds) spent to do memory validation",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_MEM_VALIDATE_MICROSECOND},
+
+	{"innodb_master_purge_usec", "server",
+	 "Time (in microseconds) spent by master thread to purge records",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_PURGE_MICROSECOND},
+
+	{"innodb_dict_lru_usec", "server",
+	 "Time (in microseconds) spent to process DICT LRU list",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_DICT_LRU_MICROSECOND},
+
+	{"innodb_checkpoint_usec", "server",
+	 "Time (in microseconds) spent by master thread to do checkpoint",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_SRV_CHECKPOINT_MICROSECOND},
+
+	{"innodb_dblwr_writes", "server",
+	 "Number of doublewrite operations that have been performed"
+	 " (innodb_dblwr_writes)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_DBLWR_WRITES},
+
+	{"innodb_dblwr_pages_written", "server",
+	 "Number of pages that have been written for doublewrite operations"
+	 " (innodb_dblwr_pages_written)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN},
+
+	{"innodb_page_size", "server",
+	 "InnoDB page size in bytes (innodb_page_size)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_PAGE_SIZE},
+
+	{"innodb_rwlock_s_spin_waits", "server",
+	 "Number of rwlock spin waits due to shared latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_WAITS},
+
+	{"innodb_rwlock_x_spin_waits", "server",
+	 "Number of rwlock spin waits due to exclusive latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_WAITS},
+
+	{"innodb_rwlock_s_spin_rounds", "server",
+	 "Number of rwlock spin loop rounds due to shared latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS},
+
+	{"innodb_rwlock_x_spin_rounds", "server",
+	 "Number of rwlock spin loop rounds due to exclusive latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS},
+
+	{"innodb_rwlock_s_os_waits", "server",
+	 "Number of OS waits due to shared latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_OS_WAITS},
+
+	{"innodb_rwlock_x_os_waits", "server",
+	 "Number of OS waits due to exclusive latch request",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_OS_WAITS},
+
+	/* ========== Counters for DML operations ========== */
+	{"module_dml", "dml", "Statistics for DMLs",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_DML_STATS},
+
+	{"dml_reads", "dml", "Number of rows read",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_READ},
+
+	{"dml_inserts", "dml", "Number of rows inserted",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_INSERTED},
+
+	{"dml_deletes", "dml", "Number of rows deleted",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_DELETED},
+
+	{"dml_updates", "dml", "Number of rows updated",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_UPDTATED},
+
+	/* ========== Counters for DDL operations ========== */
+	{"module_ddl", "ddl", "Statistics for DDLs",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_DDL_STATS},
+
+	{"ddl_background_drop_tables", "ddl",
+	 "Number of tables in background drop table list",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_TABLE},
+
+	/* ===== Counters for ICP (Index Condition Pushdown) Module ===== */
+	{"module_icp", "icp", "Index Condition Pushdown",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_MODULE_ICP},
+
+	{"icp_attempts", "icp",
+	 "Number of attempts for index push-down condition checks",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ICP_ATTEMPTS},
+
+	{"icp_no_match", "icp", "Index push-down condition does not match",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ICP_NO_MATCH},
+
+	{"icp_out_of_range", "icp", "Index push-down condition out of range",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ICP_OUT_OF_RANGE},
+
+	{"icp_match", "icp", "Index push-down condition matches",
+	 MONITOR_NONE,
+	 MONITOR_DEFAULT_START, MONITOR_ICP_MATCH},
+
+	/* ========== To turn on/off reset all counters ========== */
+	{"all", "All Counters", "Turn on/off and reset all counters",
+	 MONITOR_MODULE,
+	 MONITOR_DEFAULT_START, MONITOR_ALL_COUNTER}
+};
+
+/* The "innodb_counter_value" array stores actual counter values */
+UNIV_INTERN monitor_value_t	innodb_counter_value[NUM_MONITOR];
+
+/* monitor_set_tbl is used to record and determine whether a monitor
+has been turned on/off. */
+UNIV_INTERN ulint		monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT
+						- 1) / NUM_BITS_ULINT];
+
+/****************************************************************//**
+Get a monitor's "monitor_info" by its monitor id (index into the
+innodb_counter_info array.
+@return	Point to corresponding monitor_info_t, or NULL if no such
+monitor */
+UNIV_INTERN
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+	monitor_id_t	monitor_id)	/*!< id indexing into the
+					innodb_counter_info array */
+{
+	ut_a(monitor_id < NUM_MONITOR);
+
+	return((monitor_id < NUM_MONITOR)
+			? &innodb_counter_info[monitor_id]
+			: NULL);
+}
+
+/****************************************************************//**
+Get monitor's name by its monitor id (indexing into the
+innodb_counter_info array.
+@return	corresponding monitor name, or NULL if no such
+monitor */
+UNIV_INTERN
+const char*
+srv_mon_get_name(
+/*=============*/
+	monitor_id_t	monitor_id)	/*!< id index into the
+					innodb_counter_info array */
+{
+	ut_a(monitor_id < NUM_MONITOR);
+
+	return((monitor_id < NUM_MONITOR)
+			? innodb_counter_info[monitor_id].monitor_name
+			: NULL);
+}
+
+/****************************************************************//**
+Turn on/off, reset monitor counters in a module. If module_id
+is MONITOR_ALL_COUNTER then turn on all monitor counters.
+turned on because it has already been turned on. */
+UNIV_INTERN
+void
+srv_mon_set_module_control(
+/*=======================*/
+	monitor_id_t	module_id,	/*!< in: Module ID as in
+					monitor_counter_id. If it is
+					set to MONITOR_ALL_COUNTER, this means
+					we shall turn on all the counters */
+	mon_option_t	set_option)	/*!< in: Turn on/off reset the
+					counter */
+{
+	ulint	ix;
+	ulint	start_id;
+	ibool	set_current_module = FALSE;
+
+	ut_a(module_id <= NUM_MONITOR);
+	ut_a(UT_ARR_SIZE(innodb_counter_info) == NUM_MONITOR);
+
+	/* The module_id must be an ID of MONITOR_MODULE type */
+	ut_a(innodb_counter_info[module_id].monitor_type & MONITOR_MODULE);
+
+	/* start with the first monitor in the module. If module_id
+	is MONITOR_ALL_COUNTER, this means we need to turn on all
+	monitor counters. */
+	if (module_id == MONITOR_ALL_COUNTER) {
+		start_id = 1;
+	} else if (innodb_counter_info[module_id].monitor_type
+		   & MONITOR_GROUP_MODULE) {
+		/* Counters in this module are set as a group together
+		and cannot be turned on/off individually. Need to set
+		the on/off bit in the module counter */
+		start_id = module_id;
+		set_current_module = TRUE;
+
+	} else {
+		start_id = module_id + 1;
+	}
+
+	for (ix = start_id; ix < NUM_MONITOR; ix++) {
+		/* if we hit the next module counter, we will
+		continue if we want to turn on all monitor counters,
+		and break if just turn on the counters in the
+		current module. */
+		if (innodb_counter_info[ix].monitor_type & MONITOR_MODULE) {
+
+			if (set_current_module) {
+				/* Continue to set on/off bit on current
+				module */
+				set_current_module = FALSE;
+			} else if (module_id == MONITOR_ALL_COUNTER) {
+				continue;
+			} else {
+				/* Hitting the next module, stop */
+				break;
+			}
+		}
+
+		/* Cannot turn on a monitor already been turned on. User
+		should be aware some counters are already on before
+		turn them on again (which could reset counter value) */
+		if (MONITOR_IS_ON(ix) && (set_option == MONITOR_TURN_ON)) {
+			fprintf(stderr, "Monitor '%s' is already enabled.\n",
+				srv_mon_get_name((monitor_id_t) ix));
+			continue;
+		}
+
+		/* For some existing counters (server status variables),
+		we will get its counter value at the start/stop time
+		to calculate the actual value during the time. */
+		if (innodb_counter_info[ix].monitor_type & MONITOR_EXISTING) {
+			srv_mon_process_existing_counter(
+				static_cast<monitor_id_t>(ix), set_option);
+		}
+
+		/* Currently support 4 operations on the monitor counters:
+		turn on, turn off, reset and reset all operations. */
+		switch (set_option) {
+		case MONITOR_TURN_ON:
+			MONITOR_ON(ix);
+			MONITOR_INIT(ix);
+			MONITOR_SET_START(ix);
+			break;
+
+		case MONITOR_TURN_OFF:
+			MONITOR_OFF(ix);
+			MONITOR_SET_OFF(ix);
+			break;
+
+		case MONITOR_RESET_VALUE:
+			srv_mon_reset(static_cast<monitor_id_t>(ix));
+			break;
+
+		case MONITOR_RESET_ALL_VALUE:
+			srv_mon_reset_all(static_cast<monitor_id_t>(ix));
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+}
+
+/****************************************************************//**
+Get transaction system's rollback segment size in pages
+@return size in pages */
+static
+ulint
+srv_mon_get_rseg_size(void)
+/*=======================*/
+{
+	ulint		i;
+	ulint		value = 0;
+
+	/* rseg_array is a static array, so we can go through it without
+	mutex protection. In addition, we provide an estimate of the
+	total rollback segment size and to avoid mutex contention we
+	don't acquire the rseg->mutex" */
+	for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+		const trx_rseg_t*	rseg = trx_sys->rseg_array[i];
+
+		if (rseg != NULL) {
+			value += rseg->curr_size;
+		}
+	}
+
+	return(value);
+}
+
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and do appropriate
+mathematics to deduct the actual value. Please also refer to
+srv_export_innodb_status() for related global counters used by
+the existing status variables.*/
+UNIV_INTERN
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+	monitor_id_t	monitor_id,	/*!< in: the monitor's ID as in
+					monitor_counter_id */
+	mon_option_t	set_option)	/*!< in: Turn on/off reset the
+					counter */
+{
+	mon_type_t	value;
+	monitor_info_t*	monitor_info;
+	ibool		update_min = FALSE;
+	buf_pool_stat_t	stat;
+	ulint		LRU_len;
+	ulint		free_len;
+	ulint		flush_list_len;
+
+	monitor_info = srv_mon_get_info(monitor_id);
+
+	ut_a(monitor_info->monitor_type & MONITOR_EXISTING);
+	ut_a(monitor_id < NUM_MONITOR);
+
+	/* Get the value from corresponding global variable */
+	switch (monitor_id) {
+	case MONITOR_OVLD_META_MEM_POOL:
+		value = srv_mem_pool_size;
+		break;
+
+	/* export_vars.innodb_buffer_pool_reads. Num Reads from
+	disk (page not in buffer) */
+	case MONITOR_OVLD_BUF_POOL_READS:
+		value = srv_buf_pool_reads;
+		break;
+
+	/* innodb_buffer_pool_read_requests, the number of logical
+	read requests */
+	case MONITOR_OVLD_BUF_POOL_READ_REQUESTS:
+		buf_get_total_stat(&stat);
+		value = stat.n_page_gets;
+		break;
+
+	/* innodb_buffer_pool_write_requests, the number of
+	write request */
+	case MONITOR_OVLD_BUF_POOL_WRITE_REQUEST:
+		value = srv_buf_pool_write_requests;
+		break;
+
+	/* innodb_buffer_pool_wait_free */
+	case MONITOR_OVLD_BUF_POOL_WAIT_FREE:
+		value = srv_buf_pool_wait_free;
+		break;
+
+	/* innodb_buffer_pool_read_ahead */
+	case MONITOR_OVLD_BUF_POOL_READ_AHEAD:
+		buf_get_total_stat(&stat);
+		value = stat.n_ra_pages_read;
+		break;
+
+	/* innodb_buffer_pool_read_ahead_evicted */
+	case MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED:
+		buf_get_total_stat(&stat);
+		value = stat.n_ra_pages_evicted;
+		break;
+
+	/* innodb_buffer_pool_pages_total */
+	case MONITOR_OVLD_BUF_POOL_PAGE_TOTAL:
+		value = buf_pool_get_n_pages();
+		break;
+
+	/* innodb_buffer_pool_pages_misc */
+	case MONITOR_OVLD_BUF_POOL_PAGE_MISC:
+		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+		value = buf_pool_get_n_pages() - LRU_len - free_len;
+		break;
+
+	/* innodb_buffer_pool_pages_data */
+	case MONITOR_OVLD_BUF_POOL_PAGES_DATA:
+		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+		value = LRU_len;
+		break;
+
+	/* innodb_buffer_pool_pages_dirty */
+	case MONITOR_OVLD_BUF_POOL_PAGES_DIRTY:
+		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+		value = flush_list_len;
+		break;
+
+	/* innodb_buffer_pool_pages_free */
+	case MONITOR_OVLD_BUF_POOL_PAGES_FREE:
+		buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+		value = free_len;
+		break;
+
+	/* innodb_pages_created, the number of pages created */
+	case MONITOR_OVLD_PAGE_CREATED:
+		buf_get_total_stat(&stat);
+		value = stat.n_pages_created;
+		break;
+
+	/* innodb_pages_written, the number of page written */
+	case MONITOR_OVLD_PAGES_WRITTEN:
+		buf_get_total_stat(&stat);
+		value = stat.n_pages_written;
+		break;
+
+	/* innodb_pages_read */
+	case MONITOR_OVLD_PAGES_READ:
+		buf_get_total_stat(&stat);
+		value = stat.n_pages_read;
+		break;
+
+	/* innodb_data_reads, the total number of data reads */
+	case MONITOR_OVLD_BYTE_READ:
+		value = srv_data_read;
+		break;
+
+	/* innodb_data_writes, the total number of data writes. */
+	case MONITOR_OVLD_BYTE_WRITTEN:
+		value = srv_data_written;
+		break;
+
+	/* innodb_data_reads, the total number of data reads. */
+	case MONITOR_OVLD_OS_FILE_READ:
+		value = os_n_file_reads;
+		break;
+
+	/* innodb_data_writes, the total number of data writes*/
+	case MONITOR_OVLD_OS_FILE_WRITE:
+		value = os_n_file_writes;
+		break;
+
+	/* innodb_data_fsyncs, number of fsync() operations so far. */
+	case MONITOR_OVLD_OS_FSYNC:
+		value = os_n_fsyncs;
+		break;
+
+	/* innodb_os_log_written */
+	case MONITOR_OVLD_OS_LOG_WRITTEN:
+		value = (mon_type_t) srv_os_log_written;
+		break;
+
+	/* innodb_os_log_fsyncs */
+	case MONITOR_OVLD_OS_LOG_FSYNC:
+		value = fil_n_log_flushes;
+		break;
+
+	/* innodb_os_log_pending_fsyncs */
+	case MONITOR_OVLD_OS_LOG_PENDING_FSYNC:
+		value = fil_n_pending_log_flushes;
+		update_min = TRUE;
+		break;
+
+	/* innodb_os_log_pending_writes */
+	case MONITOR_OVLD_OS_LOG_PENDING_WRITES:
+		value = srv_os_log_pending_writes;
+		update_min = TRUE;
+		break;
+
+	/* innodb_log_waits */
+	case MONITOR_OVLD_LOG_WAITS:
+		value = srv_log_waits;
+		break;
+
+	/* innodb_log_write_requests */
+	case MONITOR_OVLD_LOG_WRITE_REQUEST:
+		value = srv_log_write_requests;
+		break;
+
+	/* innodb_log_writes */
+	case MONITOR_OVLD_LOG_WRITES:
+		value = srv_log_writes;
+		break;
+
+	/* innodb_dblwr_writes */
+	case MONITOR_OVLD_SRV_DBLWR_WRITES:
+		value = srv_dblwr_writes;
+		break;
+
+	/* innodb_dblwr_pages_written */
+	case MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN:
+		value = srv_dblwr_pages_written;
+		break;
+
+	/* innodb_page_size */
+	case MONITOR_OVLD_SRV_PAGE_SIZE:
+		value = UNIV_PAGE_SIZE;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_S_SPIN_WAITS:
+		value = rw_s_spin_wait_count;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_X_SPIN_WAITS:
+		value = rw_x_os_wait_count;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS:
+		value = rw_s_spin_round_count;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS:
+		value = rw_x_spin_round_count;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_S_OS_WAITS:
+		value = rw_s_os_wait_count;
+		break;
+
+	case MONITOR_OVLD_RWLOCK_X_OS_WAITS:
+		value = rw_x_os_wait_count;
+		break;
+
+	case MONITOR_OVLD_BUFFER_POOL_SIZE:
+		value = srv_buf_pool_size;
+		break;
+
+	/* innodb_rows_read */
+	case MONITOR_OLVD_ROW_READ:
+		value = srv_n_rows_read;
+		break;
+
+	/* innodb_rows_inserted */
+	case MONITOR_OLVD_ROW_INSERTED:
+		value = srv_n_rows_inserted;
+		break;
+
+	/* innodb_rows_deleted */
+	case MONITOR_OLVD_ROW_DELETED:
+		value = srv_n_rows_deleted;
+		break;
+
+	/* innodb_rows_updated */
+	case MONITOR_OLVD_ROW_UPDTATED:
+		value = srv_n_rows_updated;
+		break;
+
+	/* innodb_row_lock_current_waits */
+	case MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT:
+		value = srv_n_lock_wait_current_count;
+		break;
+
+	/* innodb_row_lock_time */
+	case MONITOR_OVLD_LOCK_WAIT_TIME:
+		value = srv_n_lock_wait_time / 1000;
+		break;
+
+	/* innodb_row_lock_time_max */
+	case MONITOR_OVLD_LOCK_MAX_WAIT_TIME:
+		value = srv_n_lock_max_wait_time / 1000;
+		break;
+
+	/* innodb_row_lock_time_avg */
+	case MONITOR_OVLD_LOCK_AVG_WAIT_TIME:
+		if (srv_n_lock_wait_count > 0) {
+			value = srv_n_lock_wait_time / 1000
+				/ srv_n_lock_wait_count;
+		} else {
+			value = 0;
+		}
+		break;
+
+	/* innodb_row_lock_waits */
+	case MONITOR_OVLD_ROW_LOCK_WAIT:
+		value = srv_n_lock_wait_count;
+		break;
+
+	case MONITOR_RSEG_HISTORY_LEN:
+		value = trx_sys->rseg_history_len;
+		break;
+
+	case MONITOR_RSEG_CUR_SIZE:
+		value = srv_mon_get_rseg_size();
+		break;
+
+	case MONITOR_OVLD_N_FILE_OPENED:
+		value = fil_n_file_opened;
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_INSERT:
+		value = ibuf->n_merged_ops[IBUF_OP_INSERT];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_DELETE:
+		value = ibuf->n_merged_ops[IBUF_OP_DELETE_MARK];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_PURGE:
+		value = ibuf->n_merged_ops[IBUF_OP_DELETE];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT:
+		value = ibuf->n_discarded_ops[IBUF_OP_INSERT];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE:
+		value = ibuf->n_discarded_ops[IBUF_OP_DELETE_MARK];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE:
+		value = ibuf->n_discarded_ops[IBUF_OP_DELETE];
+		break;
+
+	case MONITOR_OVLD_IBUF_MERGES:
+		value = ibuf->n_merges;
+		break;
+
+	case MONITOR_OVLD_IBUF_SIZE:
+		value = ibuf->size;
+		break;
+
+	case MONITOR_OVLD_SERVER_ACTIVITY:
+		value = srv_get_activity_count();
+		break;
+
+	case MONITOR_OVLD_LSN_FLUSHDISK:
+		value = (mon_type_t) log_sys->flushed_to_disk_lsn;
+		break;
+
+	case MONITOR_OVLD_LSN_CURRENT:
+		value = (mon_type_t) log_sys->lsn;
+		break;
+
+	case MONITOR_OVLD_BUF_OLDEST_LSN:
+		value = (mon_type_t) buf_pool_get_oldest_modification();
+		break;
+
+	case MONITOR_OVLD_LSN_CHECKPOINT:
+		value = (mon_type_t) log_sys->last_checkpoint_lsn;
+		break;
+
+	case MONITOR_OVLD_MAX_AGE_ASYNC:
+		value = log_sys->max_modified_age_async;
+		break;
+
+	case MONITOR_OVLD_MAX_AGE_SYNC:
+		value = log_sys->max_modified_age_sync;
+		break;
+
+	case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH:
+		value = btr_cur_n_sea;
+		break;
+
+	case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE:
+		value = btr_cur_n_non_sea;
+		break;
+
+	default:
+		ut_error;
+	}
+
+	switch (set_option) {
+	case MONITOR_TURN_ON:
+		/* Save the initial counter value in mon_start_value
+		field */
+		MONITOR_SAVE_START(monitor_id, value);
+		return;
+
+	case MONITOR_TURN_OFF:
+		/* Save the counter value to mon_last_value when we
+		turn off the monitor but not yet reset. Note the
+		counter has not yet been set to off in the bitmap
+		table for normal turn off. We need to check the
+		count status (on/off) to avoid reset the value
+		for an already off conte */
+		if (MONITOR_IS_ON(monitor_id)) {
+			srv_mon_process_existing_counter(monitor_id,
+							 MONITOR_GET_VALUE);
+			MONITOR_SAVE_LAST(monitor_id);
+		}
+		return;
+
+	case MONITOR_GET_VALUE:
+		if (MONITOR_IS_ON(monitor_id)) {
+
+			/* If MONITOR_DISPLAY_CURRENT bit is on, we
+			only record the current value, rather than
+			incremental value over a period. Most of
+`			this type of counters are resource related
+			counters such as number of buffer pages etc. */
+			if (monitor_info->monitor_type
+			    & MONITOR_DISPLAY_CURRENT) {
+				MONITOR_SET(monitor_id, value);
+			} else {
+				/* Most status counters are montonically
+				increasing, no need to update their
+				minimum values. Only do so
+				if "update_min" set to TRUE */
+				MONITOR_SET_DIFF(monitor_id, value);
+
+				if (update_min
+				    && (MONITOR_VALUE(monitor_id)
+					< MONITOR_MIN_VALUE(monitor_id))) {
+					MONITOR_MIN_VALUE(monitor_id) =
+						MONITOR_VALUE(monitor_id);
+				}
+			}
+		}
+		return;
+
+	case MONITOR_RESET_VALUE:
+		if (!MONITOR_IS_ON(monitor_id)) {
+			MONITOR_LAST_VALUE(monitor_id) = 0;
+		}
+		return;
+
+	/* Nothing special for reset all operation for these existing
+	counters */
+	case MONITOR_RESET_ALL_VALUE:
+		return;
+	}
+}
+
+/*************************************************************//**
+Reset a monitor, create a new base line with the current monitor
+value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */
+UNIV_INTERN
+void
+srv_mon_reset(
+/*==========*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	ibool	monitor_was_on;
+
+	monitor_was_on = MONITOR_IS_ON(monitor);
+
+	if (monitor_was_on) {
+		/* Temporarily turn off the counter for the resetting
+		operation */
+		MONITOR_OFF(monitor);
+	}
+
+	/* Before resetting the current monitor value, first
+	calculate and set the max/min value since monitor
+	start */
+	srv_mon_calc_max_since_start(monitor);
+	srv_mon_calc_min_since_start(monitor);
+
+	/* Monitors with MONITOR_DISPLAY_CURRENT bit
+	are not incremental, no need to remember
+	the reset value. */
+	if (innodb_counter_info[monitor].monitor_type
+	    & MONITOR_DISPLAY_CURRENT) {
+		MONITOR_VALUE_RESET(monitor) = 0;
+	} else {
+		/* Remember the new baseline */
+		MONITOR_VALUE_RESET(monitor) = MONITOR_VALUE_RESET(monitor)
+					       + MONITOR_VALUE(monitor);
+	}
+
+	/* Reset the counter value */
+	MONITOR_VALUE(monitor) = 0;
+	MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;
+	MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;
+
+	MONITOR_FIELD((monitor), mon_reset_time) = time(NULL);
+
+	if (monitor_was_on) {
+		MONITOR_ON(monitor);
+	}
+}
+
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+UNIV_INTERN
+void
+srv_mon_default_on(void)
+/*====================*/
+{
+	ulint   ix;
+
+	for (ix = 0; ix < NUM_MONITOR; ix++) {
+		if (innodb_counter_info[ix].monitor_type
+		    & MONITOR_DEFAULT_ON) {
+			/* Turn on monitor counters that are default on */
+			MONITOR_ON(ix);
+			MONITOR_INIT(ix);
+			MONITOR_SET_START(ix);
+		}
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
deleted file mode 100644
index df89156baae..00000000000
--- a/storage/innobase/srv/srv0srv.c
+++ /dev/null
@@ -1,3280 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2008, 2009 Google Inc.
-Copyright (c) 2009, Percona Inc.
-
-Portions of this file contain modifications contributed and copyrighted by
-Google, Inc. Those modifications are gratefully acknowledged and are described
-briefly in the InnoDB documentation. The contributions by Google are
-incorporated with their permission, and subject to the conditions contained in
-the file COPYING.Google.
-
-Portions of this file contain modifications contributed and copyrighted
-by Percona Inc.. Those modifications are
-gratefully acknowledged and are described briefly in the InnoDB
-documentation. The contributions by Percona Inc. are incorporated with
-their permission, and subject to the conditions contained in the file
-COPYING.Percona.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file srv/srv0srv.c
-The database server main program
-
-NOTE: SQL Server 7 uses something which the documentation
-calls user mode scheduled threads (UMS threads). One such
-thread is usually allocated per processor. Win32
-documentation does not know any UMS threads, which suggests
-that the concept is internal to SQL Server 7. It may mean that
-SQL Server 7 does all the scheduling of threads itself, even
-in i/o waits. We should maybe modify InnoDB to use the same
-technique, because thread switches within NT may be too slow.
-
-SQL Server 7 also mentions fibers, which are cooperatively
-scheduled threads. They can boost performance by 5 %,
-according to the Delaney and Soukup's book.
-
-Windows 2000 will have something called thread pooling
-(see msdn website), which we could possibly use.
-
-Another possibility could be to use some very fast user space
-thread library. This might confuse NT though.
-
-Created 10/8/1995 Heikki Tuuri
-*******************************************************/
-
-/* Dummy comment */
-#include "srv0srv.h"
-
-#include "ut0mem.h"
-#include "ut0ut.h"
-#include "os0proc.h"
-#include "mem0mem.h"
-#include "mem0pool.h"
-#include "sync0sync.h"
-#include "que0que.h"
-#include "log0recv.h"
-#include "pars0pars.h"
-#include "usr0sess.h"
-#include "lock0lock.h"
-#include "trx0purge.h"
-#include "ibuf0ibuf.h"
-#include "buf0flu.h"
-#include "buf0lru.h"
-#include "btr0sea.h"
-#include "dict0load.h"
-#include "dict0boot.h"
-#include "srv0start.h"
-#include "row0mysql.h"
-#include "ha_prototypes.h"
-#include "trx0i_s.h"
-#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
-#include "mysql/plugin.h"
-#include "mysql/service_thd_wait.h"
-
-/* The following counter is incremented whenever there is some user activity
-in the server */
-UNIV_INTERN ulint	srv_activity_count	= 0;
-
-/* The following is the maximum allowed duration of a lock wait. */
-UNIV_INTERN ulint	srv_fatal_semaphore_wait_threshold = 600;
-
-/* How much data manipulation language (DML) statements need to be delayed,
-in microseconds, in order to reduce the lagging of the purge thread. */
-UNIV_INTERN ulint	srv_dml_needed_delay = 0;
-
-UNIV_INTERN ibool	srv_lock_timeout_active = FALSE;
-UNIV_INTERN ibool	srv_monitor_active = FALSE;
-UNIV_INTERN ibool	srv_error_monitor_active = FALSE;
-
-UNIV_INTERN const char*	srv_main_thread_op_info = "";
-
-/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
-UNIV_INTERN const char	srv_mysql50_table_name_prefix[9] = "#mysql50#";
-
-/* Server parameters which are read from the initfile */
-
-/* The following three are dir paths which are catenated before file
-names, where the file name itself may also contain a path */
-
-UNIV_INTERN char*	srv_data_home	= NULL;
-#ifdef UNIV_LOG_ARCHIVE
-UNIV_INTERN char*	srv_arch_dir	= NULL;
-#endif /* UNIV_LOG_ARCHIVE */
-
-/** store to its own file each table created by an user; data
-dictionary tables are in the system tablespace 0 */
-UNIV_INTERN my_bool	srv_file_per_table;
-/** The file format to use on new *.ibd files. */
-UNIV_INTERN ulint	srv_file_format = 0;
-/** Whether to check file format during startup.  A value of
-DICT_TF_FORMAT_MAX + 1 means no checking ie. FALSE.  The default is to
-set it to the highest format we support. */
-UNIV_INTERN ulint	srv_max_file_format_at_startup = DICT_TF_FORMAT_MAX;
-
-#if DICT_TF_FORMAT_51
-# error "DICT_TF_FORMAT_51 must be 0!"
-#endif
-/** Place locks to records only i.e. do not use next-key locking except
-on duplicate key checking and foreign key checking */
-UNIV_INTERN ibool	srv_locks_unsafe_for_binlog = FALSE;
-
-/* If this flag is TRUE, then we will use the native aio of the
-OS (provided we compiled Innobase with it in), otherwise we will
-use simulated aio we build below with threads.
-Currently we support native aio on windows and linux */
-UNIV_INTERN my_bool	srv_use_native_aio = TRUE;
-
-#ifdef __WIN__
-/* Windows native condition variables. We use runtime loading / function
-pointers, because they are not available on Windows Server 2003 and
-Windows XP/2000.
-
-We use condition for events on Windows if possible, even if os_event
-resembles Windows kernel event object well API-wise. The reason is
-performance, kernel objects are heavyweights and WaitForSingleObject() is a
-performance killer causing calling thread to context switch. Besides, Innodb
-is preallocating large number (often millions) of os_events. With kernel event
-objects it takes a big chunk out of non-paged pool, which is better suited
-for tasks like IO than for storing idle event objects. */
-UNIV_INTERN ibool	srv_use_native_conditions = FALSE;
-#endif /* __WIN__ */
-
-UNIV_INTERN ulint	srv_n_data_files = 0;
-UNIV_INTERN char**	srv_data_file_names = NULL;
-/* size in database pages */
-UNIV_INTERN ulint*	srv_data_file_sizes = NULL;
-
-/* if TRUE, then we auto-extend the last data file */
-UNIV_INTERN ibool	srv_auto_extend_last_data_file	= FALSE;
-/* if != 0, this tells the max size auto-extending may increase the
-last data file size */
-UNIV_INTERN ulint	srv_last_file_size_max	= 0;
-/* If the last data file is auto-extended, we add this
-many pages to it at a time */
-UNIV_INTERN ulong	srv_auto_extend_increment = 8;
-UNIV_INTERN ulint*	srv_data_file_is_raw_partition = NULL;
-
-/* If the following is TRUE we do not allow inserts etc. This protects
-the user from forgetting the 'newraw' keyword to my.cnf */
-
-UNIV_INTERN ibool	srv_created_new_raw	= FALSE;
-
-UNIV_INTERN char**	srv_log_group_home_dirs = NULL;
-
-UNIV_INTERN ulint	srv_n_log_groups	= ULINT_MAX;
-UNIV_INTERN ulint	srv_n_log_files		= ULINT_MAX;
-/* size in database pages */
-UNIV_INTERN ulint	srv_log_file_size	= ULINT_MAX;
-/* size in database pages */
-UNIV_INTERN ulint	srv_log_buffer_size	= ULINT_MAX;
-UNIV_INTERN ulong	srv_flush_log_at_trx_commit = 1;
-
-/* Try to flush dirty pages so as to avoid IO bursts at
-the checkpoints. */
-UNIV_INTERN char	srv_adaptive_flushing	= TRUE;
-
-/** Maximum number of times allowed to conditionally acquire
-mutex before switching to blocking wait on the mutex */
-#define MAX_MUTEX_NOWAIT	20
-
-/** Check whether the number of failed nonblocking mutex
-acquisition attempts exceeds maximum allowed value. If so,
-srv_printf_innodb_monitor() will request mutex acquisition
-with mutex_enter(), which will wait until it gets the mutex. */
-#define MUTEX_NOWAIT(mutex_skipped)	((mutex_skipped) < MAX_MUTEX_NOWAIT)
-
-/** The sort order table of the MySQL latin1_swedish_ci character set
-collation */
-UNIV_INTERN const byte*	srv_latin1_ordering;
-
-/* use os/external memory allocator */
-UNIV_INTERN my_bool	srv_use_sys_malloc	= TRUE;
-/* requested size in kilobytes */
-UNIV_INTERN ulint	srv_buf_pool_size	= ULINT_MAX;
-/* requested number of buffer pool instances */
-UNIV_INTERN ulint       srv_buf_pool_instances  = 1;
-/* previously requested size */
-UNIV_INTERN ulint	srv_buf_pool_old_size;
-/* current size in kilobytes */
-UNIV_INTERN ulint	srv_buf_pool_curr_size	= 0;
-/* size in bytes */
-UNIV_INTERN ulint	srv_mem_pool_size	= ULINT_MAX;
-UNIV_INTERN ulint	srv_lock_table_size	= ULINT_MAX;
-
-/* This parameter is deprecated. Use srv_n_io_[read|write]_threads
-instead. */
-UNIV_INTERN ulint	srv_n_file_io_threads	= ULINT_MAX;
-UNIV_INTERN ulint	srv_n_read_io_threads	= ULINT_MAX;
-UNIV_INTERN ulint	srv_n_write_io_threads	= ULINT_MAX;
-
-/* Switch to enable random read ahead. */
-UNIV_INTERN my_bool	srv_random_read_ahead	= FALSE;
-/* User settable value of the number of pages that must be present
-in the buffer cache and accessed sequentially for InnoDB to trigger a
-readahead request. */
-UNIV_INTERN ulong	srv_read_ahead_threshold	= 56;
-
-#ifdef UNIV_LOG_ARCHIVE
-UNIV_INTERN ibool		srv_log_archive_on	= FALSE;
-UNIV_INTERN ibool		srv_archive_recovery	= 0;
-UNIV_INTERN ib_uint64_t	srv_archive_recovery_limit_lsn;
-#endif /* UNIV_LOG_ARCHIVE */
-
-/* This parameter is used to throttle the number of insert buffers that are
-merged in a batch. By increasing this parameter on a faster disk you can
-possibly reduce the number of I/O operations performed to complete the
-merge operation. The value of this parameter is used as is by the
-background loop when the system is idle (low load), on a busy system
-the parameter is scaled down by a factor of 4, this is to avoid putting
-a heavier load on the I/O sub system. */
-
-UNIV_INTERN ulong	srv_insert_buffer_batch_size = 20;
-
-UNIV_INTERN char*	srv_file_flush_method_str = NULL;
-UNIV_INTERN ulint	srv_unix_file_flush_method = SRV_UNIX_FSYNC;
-UNIV_INTERN ulint	srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
-
-UNIV_INTERN ulint	srv_max_n_open_files	  = 300;
-
-/* Number of IO operations per second the server can do */
-UNIV_INTERN ulong	srv_io_capacity         = 200;
-
-/* The InnoDB main thread tries to keep the ratio of modified pages
-in the buffer pool to all database pages in the buffer pool smaller than
-the following number. But it is not guaranteed that the value stays below
-that during a time of heavy update/insert activity. */
-
-UNIV_INTERN ulong	srv_max_buf_pool_modified_pct	= 75;
-
-/* the number of purge threads to use from the worker pool (currently 0 or 1).*/
-UNIV_INTERN ulong srv_n_purge_threads = 0;
-
-/* the number of pages to purge in one batch */
-UNIV_INTERN ulong srv_purge_batch_size = 20;
-
-/* the number of rollback segments to use */
-UNIV_INTERN ulong srv_rollback_segments = TRX_SYS_N_RSEGS;
-
-/* variable counts amount of data read in total (in bytes) */
-UNIV_INTERN ulint srv_data_read = 0;
-
-/* Internal setting for "innodb_stats_method". Decides how InnoDB treats
-NULL value when collecting statistics. By default, it is set to
-SRV_STATS_NULLS_EQUAL(0), ie. all NULL value are treated equal */
-ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL;
-
-/* here we count the amount of data written in total (in bytes) */
-UNIV_INTERN ulint srv_data_written = 0;
-
-/* the number of the log write requests done */
-UNIV_INTERN ulint srv_log_write_requests = 0;
-
-/* the number of physical writes to the log performed */
-UNIV_INTERN ulint srv_log_writes = 0;
-
-/* amount of data written to the log files in bytes */
-UNIV_INTERN ulint srv_os_log_written = 0;
-
-/* amount of writes being done to the log files */
-UNIV_INTERN ulint srv_os_log_pending_writes = 0;
-
-/* we increase this counter, when there we don't have enough space in the
-log buffer and have to flush it */
-UNIV_INTERN ulint srv_log_waits = 0;
-
-/* this variable counts the amount of times, when the doublewrite buffer
-was flushed */
-UNIV_INTERN ulint srv_dblwr_writes = 0;
-
-/* here we store the number of pages that have been flushed to the
-doublewrite buffer */
-UNIV_INTERN ulint srv_dblwr_pages_written = 0;
-
-/* in this variable we store the number of write requests issued */
-UNIV_INTERN ulint srv_buf_pool_write_requests = 0;
-
-/* here we store the number of times when we had to wait for a free page
-in the buffer pool. It happens when the buffer pool is full and we need
-to make a flush, in order to be able to read or create a page. */
-UNIV_INTERN ulint srv_buf_pool_wait_free = 0;
-
-/* variable to count the number of pages that were written from buffer
-pool to the disk */
-UNIV_INTERN ulint srv_buf_pool_flushed = 0;
-
-/** Number of buffer pool reads that led to the
-reading of a disk page */
-UNIV_INTERN ulint srv_buf_pool_reads = 0;
-
-/* structure to pass status variables to MySQL */
-UNIV_INTERN export_struc export_vars;
-
-/* If the following is != 0 we do not allow inserts etc. This protects
-the user from forgetting the innodb_force_recovery keyword to my.cnf */
-
-UNIV_INTERN ulint	srv_force_recovery	= 0;
-/*-----------------------*/
-/* We are prepared for a situation that we have this many threads waiting for
-a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
-value. */
-
-UNIV_INTERN ulint	srv_max_n_threads	= 0;
-
-/* The following controls how many threads we let inside InnoDB concurrently:
-threads waiting for locks are not counted into the number because otherwise
-we could get a deadlock. MySQL creates a thread for each user session, and
-semaphore contention and convoy problems can occur withput this restriction.
-Value 10 should be good if there are less than 4 processors + 4 disks in the
-computer. Bigger computers need bigger values. Value 0 will disable the
-concurrency check. */
-
-UNIV_INTERN ulong	srv_thread_concurrency	= 0;
-
-/* this mutex protects srv_conc data structures */
-UNIV_INTERN os_fast_mutex_t	srv_conc_mutex;
-/* number of transactions that have declared_to_be_inside_innodb set.
-It used to be a non-error for this value to drop below zero temporarily.
-This is no longer true. We'll, however, keep the lint datatype to add
-assertions to catch any corner cases that we may have missed. */
-UNIV_INTERN lint	srv_conc_n_threads	= 0;
-/* number of OS threads waiting in the FIFO for a permission to enter
-InnoDB */
-UNIV_INTERN ulint	srv_conc_n_waiting_threads = 0;
-
-typedef struct srv_conc_slot_struct	srv_conc_slot_t;
-struct srv_conc_slot_struct{
-	os_event_t			event;		/*!< event to wait */
-	ibool				reserved;	/*!< TRUE if slot
-							reserved */
-	ibool				wait_ended;	/*!< TRUE when another
-							thread has already set
-							the event and the
-							thread in this slot is
-							free to proceed; but
-							reserved may still be
-							TRUE at that point */
-	UT_LIST_NODE_T(srv_conc_slot_t)	srv_conc_queue;	/*!< queue node */
-};
-
-/* queue of threads waiting to get in */
-UNIV_INTERN UT_LIST_BASE_NODE_T(srv_conc_slot_t)	srv_conc_queue;
-/* array of wait slots */
-UNIV_INTERN srv_conc_slot_t* srv_conc_slots;
-
-/* Number of times a thread is allowed to enter InnoDB within the same
-SQL query after it has once got the ticket at srv_conc_enter_innodb */
-#define SRV_FREE_TICKETS_TO_ENTER srv_n_free_tickets_to_enter
-#define SRV_THREAD_SLEEP_DELAY srv_thread_sleep_delay
-/*-----------------------*/
-/* If the following is set to 1 then we do not run purge and insert buffer
-merge to completion before shutdown. If it is set to 2, do not even flush the
-buffer pool to data files at the shutdown: we effectively 'crash'
-InnoDB (but lose no committed transactions). */
-UNIV_INTERN ulint	srv_fast_shutdown	= 0;
-
-/* Generate a innodb_status.<pid> file */
-UNIV_INTERN ibool	srv_innodb_status	= FALSE;
-
-/* When estimating number of different key values in an index, sample
-this many index pages */
-UNIV_INTERN unsigned long long	srv_stats_sample_pages = 8;
-
-UNIV_INTERN ibool	srv_use_doublewrite_buf	= TRUE;
-UNIV_INTERN ibool	srv_use_checksums = TRUE;
-
-UNIV_INTERN ulong	srv_replication_delay		= 0;
-
-/*-------------------------------------------*/
-UNIV_INTERN ulong	srv_n_spin_wait_rounds	= 30;
-UNIV_INTERN ulong	srv_n_free_tickets_to_enter = 500;
-UNIV_INTERN ulong	srv_thread_sleep_delay = 10000;
-UNIV_INTERN ulong	srv_spin_wait_delay	= 6;
-UNIV_INTERN ibool	srv_priority_boost	= TRUE;
-
-#ifdef UNIV_DEBUG
-UNIV_INTERN ibool	srv_print_thread_releases	= FALSE;
-UNIV_INTERN ibool	srv_print_lock_waits		= FALSE;
-UNIV_INTERN ibool	srv_print_buf_io		= FALSE;
-UNIV_INTERN ibool	srv_print_log_io		= FALSE;
-UNIV_INTERN ibool	srv_print_latch_waits		= FALSE;
-#endif /* UNIV_DEBUG */
-
-UNIV_INTERN ulint		srv_n_rows_inserted		= 0;
-UNIV_INTERN ulint		srv_n_rows_updated		= 0;
-UNIV_INTERN ulint		srv_n_rows_deleted		= 0;
-UNIV_INTERN ulint		srv_n_rows_read			= 0;
-
-static ulint	srv_n_rows_inserted_old		= 0;
-static ulint	srv_n_rows_updated_old		= 0;
-static ulint	srv_n_rows_deleted_old		= 0;
-static ulint	srv_n_rows_read_old		= 0;
-
-UNIV_INTERN ulint		srv_n_lock_wait_count		= 0;
-UNIV_INTERN ulint		srv_n_lock_wait_current_count	= 0;
-UNIV_INTERN ib_int64_t	srv_n_lock_wait_time		= 0;
-UNIV_INTERN ulint		srv_n_lock_max_wait_time	= 0;
-
-UNIV_INTERN ulint		srv_truncated_status_writes	= 0;
-
-/*
-  Set the following to 0 if you want InnoDB to write messages on
-  stderr on startup/shutdown
-*/
-UNIV_INTERN ibool	srv_print_verbose_log		= TRUE;
-UNIV_INTERN ibool	srv_print_innodb_monitor	= FALSE;
-UNIV_INTERN ibool	srv_print_innodb_lock_monitor	= FALSE;
-UNIV_INTERN ibool	srv_print_innodb_tablespace_monitor = FALSE;
-UNIV_INTERN ibool	srv_print_innodb_table_monitor = FALSE;
-
-/* Array of English strings describing the current state of an
-i/o handler thread */
-
-UNIV_INTERN const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS];
-UNIV_INTERN const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS];
-
-UNIV_INTERN time_t	srv_last_monitor_time;
-
-UNIV_INTERN mutex_t	srv_innodb_monitor_mutex;
-
-/* Mutex for locking srv_monitor_file */
-UNIV_INTERN mutex_t	srv_monitor_file_mutex;
-
-#ifdef UNIV_PFS_MUTEX
-/* Key to register kernel_mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	kernel_mutex_key;
-/* Key to register srv_innodb_monitor_mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	srv_innodb_monitor_mutex_key;
-/* Key to register srv_monitor_file_mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	srv_monitor_file_mutex_key;
-/* Key to register srv_dict_tmpfile_mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	srv_dict_tmpfile_mutex_key;
-/* Key to register the mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	srv_misc_tmpfile_mutex_key;
-#endif /* UNIV_PFS_MUTEX */
-
-/* Temporary file for innodb monitor output */
-UNIV_INTERN FILE*	srv_monitor_file;
-/* Mutex for locking srv_dict_tmpfile.
-This mutex has a very high rank; threads reserving it should not
-be holding any InnoDB latches. */
-UNIV_INTERN mutex_t	srv_dict_tmpfile_mutex;
-/* Temporary file for output from the data dictionary */
-UNIV_INTERN FILE*	srv_dict_tmpfile;
-/* Mutex for locking srv_misc_tmpfile.
-This mutex has a very low rank; threads reserving it should not
-acquire any further latches or sleep before releasing this one. */
-UNIV_INTERN mutex_t	srv_misc_tmpfile_mutex;
-/* Temporary file for miscellanous diagnostic output */
-UNIV_INTERN FILE*	srv_misc_tmpfile;
-
-UNIV_INTERN ulint	srv_main_thread_process_no	= 0;
-UNIV_INTERN ulint	srv_main_thread_id		= 0;
-
-/* The following count work done by srv_master_thread. */
-
-/* Iterations by the 'once per second' loop. */
-static ulint   srv_main_1_second_loops		= 0;
-/* Calls to sleep by the 'once per second' loop. */
-static ulint   srv_main_sleeps			= 0;
-/* Iterations by the 'once per 10 seconds' loop. */
-static ulint   srv_main_10_second_loops		= 0;
-/* Iterations of the loop bounded by the 'background_loop' label. */
-static ulint   srv_main_background_loops	= 0;
-/* Iterations of the loop bounded by the 'flush_loop' label. */
-static ulint   srv_main_flush_loops		= 0;
-/* Log writes involving flush. */
-static ulint   srv_log_writes_and_flush		= 0;
-
-/* This is only ever touched by the master thread. It records the
-time when the last flush of log file has happened. The master
-thread ensures that we flush the log files at least once per
-second. */
-static time_t	srv_last_log_flush_time;
-
-/* The master thread performs various tasks based on the current
-state of IO activity and the level of IO utilization is past
-intervals. Following macros define thresholds for these conditions. */
-#define SRV_PEND_IO_THRESHOLD	(PCT_IO(3))
-#define SRV_RECENT_IO_ACTIVITY	(PCT_IO(5))
-#define SRV_PAST_IO_ACTIVITY	(PCT_IO(200))
-
-/*
-	IMPLEMENTATION OF THE SERVER MAIN PROGRAM
-	=========================================
-
-There is the following analogue between this database
-server and an operating system kernel:
-
-DB concept			equivalent OS concept
-----------			---------------------
-transaction		--	process;
-
-query thread		--	thread;
-
-lock			--	semaphore;
-
-transaction set to
-the rollback state	--	kill signal delivered to a process;
-
-kernel			--	kernel;
-
-query thread execution:
-(a) without kernel mutex
-reserved		--	process executing in user mode;
-(b) with kernel mutex reserved
-			--	process executing in kernel mode;
-
-The server is controlled by a master thread which runs at
-a priority higher than normal, that is, higher than user threads.
-It sleeps most of the time, and wakes up, say, every 300 milliseconds,
-to check whether there is anything happening in the server which
-requires intervention of the master thread. Such situations may be,
-for example, when flushing of dirty blocks is needed in the buffer
-pool or old version of database rows have to be cleaned away.
-
-The threads which we call user threads serve the queries of
-the clients and input from the console of the server.
-They run at normal priority. The server may have several
-communications endpoints. A dedicated set of user threads waits
-at each of these endpoints ready to receive a client request.
-Each request is taken by a single user thread, which then starts
-processing and, when the result is ready, sends it to the client
-and returns to wait at the same endpoint the thread started from.
-
-So, we do not have dedicated communication threads listening at
-the endpoints and dealing the jobs to dedicated worker threads.
-Our architecture saves one thread swithch per request, compared
-to the solution with dedicated communication threads
-which amounts to 15 microseconds on 100 MHz Pentium
-running NT. If the client
-is communicating over a network, this saving is negligible, but
-if the client resides in the same machine, maybe in an SMP machine
-on a different processor from the server thread, the saving
-can be important as the threads can communicate over shared
-memory with an overhead of a few microseconds.
-
-We may later implement a dedicated communication thread solution
-for those endpoints which communicate over a network.
-
-Our solution with user threads has two problems: for each endpoint
-there has to be a number of listening threads. If there are many
-communication endpoints, it may be difficult to set the right number
-of concurrent threads in the system, as many of the threads
-may always be waiting at less busy endpoints. Another problem
-is queuing of the messages, as the server internally does not
-offer any queue for jobs.
-
-Another group of user threads is intended for splitting the
-queries and processing them in parallel. Let us call these
-parallel communication threads. These threads are waiting for
-parallelized tasks, suspended on event semaphores.
-
-A single user thread waits for input from the console,
-like a command to shut the database.
-
-Utility threads are a different group of threads which takes
-care of the buffer pool flushing and other, mainly background
-operations, in the server.
-Some of these utility threads always run at a lower than normal
-priority, so that they are always in background. Some of them
-may dynamically boost their priority by the pri_adjust function,
-even to higher than normal priority, if their task becomes urgent.
-The running of utilities is controlled by high- and low-water marks
-of urgency. The urgency may be measured by the number of dirty blocks
-in the buffer pool, in the case of the flush thread, for example.
-When the high-water mark is exceeded, an utility starts running, until
-the urgency drops under the low-water mark. Then the utility thread
-suspend itself to wait for an event. The master thread is
-responsible of signaling this event when the utility thread is
-again needed.
-
-For each individual type of utility, some threads always remain
-at lower than normal priority. This is because pri_adjust is implemented
-so that the threads at normal or higher priority control their
-share of running time by calling sleep. Thus, if the load of the
-system sudenly drops, these threads cannot necessarily utilize
-the system fully. The background priority threads make up for this,
-starting to run when the load drops.
-
-When there is no activity in the system, also the master thread
-suspends itself to wait for an event making
-the server totally silent. The responsibility to signal this
-event is on the user thread which again receives a message
-from a client.
-
-There is still one complication in our server design. If a
-background utility thread obtains a resource (e.g., mutex) needed by a user
-thread, and there is also some other user activity in the system,
-the user thread may have to wait indefinitely long for the
-resource, as the OS does not schedule a background thread if
-there is some other runnable user thread. This problem is called
-priority inversion in real-time programming.
-
-One solution to the priority inversion problem would be to
-keep record of which thread owns which resource and
-in the above case boost the priority of the background thread
-so that it will be scheduled and it can release the resource.
-This solution is called priority inheritance in real-time programming.
-A drawback of this solution is that the overhead of acquiring a mutex
-increases slightly, maybe 0.2 microseconds on a 100 MHz Pentium, because
-the thread has to call os_thread_get_curr_id.
-This may be compared to 0.5 microsecond overhead for a mutex lock-unlock
-pair. Note that the thread
-cannot store the information in the resource, say mutex, itself,
-because competing threads could wipe out the information if it is
-stored before acquiring the mutex, and if it stored afterwards,
-the information is outdated for the time of one machine instruction,
-at least. (To be precise, the information could be stored to
-lock_word in mutex if the machine supports atomic swap.)
-
-The above solution with priority inheritance may become actual in the
-future, but at the moment we plan to implement a more coarse solution,
-which could be called a global priority inheritance. If a thread
-has to wait for a long time, say 300 milliseconds, for a resource,
-we just guess that it may be waiting for a resource owned by a background
-thread, and boost the priority of all runnable background threads
-to the normal level. The background threads then themselves adjust
-their fixed priority back to background after releasing all resources
-they had (or, at some fixed points in their program code).
-
-What is the performance of the global priority inheritance solution?
-We may weigh the length of the wait time 300 milliseconds, during
-which the system processes some other thread
-to the cost of boosting the priority of each runnable background
-thread, rescheduling it, and lowering the priority again.
-On 100 MHz Pentium + NT this overhead may be of the order 100
-microseconds per thread. So, if the number of runnable background
-threads is not very big, say < 100, the cost is tolerable.
-Utility threads probably will access resources used by
-user threads not very often, so collisions of user threads
-to preempted utility threads should not happen very often.
-
-The thread table contains
-information of the current status of each thread existing in the system,
-and also the event semaphores used in suspending the master thread
-and utility and parallel communication threads when they have nothing to do.
-The thread table can be seen as an analogue to the process table
-in a traditional Unix implementation.
-
-The thread table is also used in the global priority inheritance
-scheme. This brings in one additional complication: threads accessing
-the thread table must have at least normal fixed priority,
-because the priority inheritance solution does not work if a background
-thread is preempted while possessing the mutex protecting the thread table.
-So, if a thread accesses the thread table, its priority has to be
-boosted at least to normal. This priority requirement can be seen similar to
-the privileged mode used when processing the kernel calls in traditional
-Unix.*/
-
-/* Thread slot in the thread table */
-struct srv_slot_struct{
-	unsigned	type:1;		/*!< thread type: user, utility etc. */
-	unsigned	in_use:1;	/*!< TRUE if this slot is in use */
-	unsigned	suspended:1;	/*!< TRUE if the thread is waiting
-					for the event of this slot */
-	ib_time_t	suspend_time;	/*!< time when the thread was
-					suspended */
-	os_event_t	event;		/*!< event used in suspending the
-					thread when it has nothing to do */
-	que_thr_t*	thr;		/*!< suspended query thread (only
-					used for MySQL threads) */
-};
-
-/* Table for MySQL threads where they will be suspended to wait for locks */
-UNIV_INTERN srv_slot_t*	srv_mysql_table = NULL;
-
-UNIV_INTERN os_event_t	srv_timeout_event;
-
-UNIV_INTERN os_event_t	srv_monitor_event;
-
-UNIV_INTERN os_event_t	srv_error_event;
-
-UNIV_INTERN os_event_t	srv_lock_timeout_thread_event;
-
-UNIV_INTERN srv_sys_t*	srv_sys	= NULL;
-
-/* padding to prevent other memory update hotspots from residing on
-the same memory cache line */
-UNIV_INTERN byte	srv_pad1[64];
-/* mutex protecting the server, trx structs, query threads, and lock table */
-UNIV_INTERN mutex_t*	kernel_mutex_temp;
-/* padding to prevent other memory update hotspots from residing on
-the same memory cache line */
-UNIV_INTERN byte	srv_pad2[64];
-
-#if 0
-/* The following three values measure the urgency of the jobs of
-buffer, version, and insert threads. They may vary from 0 - 1000.
-The server mutex protects all these variables. The low-water values
-tell that the server can acquiesce the utility when the value
-drops below this low-water mark. */
-
-static ulint	srv_meter[SRV_MASTER + 1];
-static ulint	srv_meter_low_water[SRV_MASTER + 1];
-static ulint	srv_meter_high_water[SRV_MASTER + 1];
-static ulint	srv_meter_high_water2[SRV_MASTER + 1];
-static ulint	srv_meter_foreground[SRV_MASTER + 1];
-#endif
-
-/* The following values give info about the activity going on in
-the database. They are protected by the server mutex. The arrays
-are indexed by the type of the thread. */
-
-UNIV_INTERN ulint	srv_n_threads_active[SRV_MASTER + 1];
-UNIV_INTERN ulint	srv_n_threads[SRV_MASTER + 1];
-
-/*********************************************************************//**
-Asynchronous purge thread.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_purge_thread(
-/*=============*/
-	void*	arg __attribute__((unused))); /*!< in: a dummy parameter
-					      required by os_thread_create */
-
-/***********************************************************************
-Prints counters for work done by srv_master_thread. */
-static
-void
-srv_print_master_thread_info(
-/*=========================*/
-	FILE  *file)    /* in: output stream */
-{
-	fprintf(file, "srv_master_thread loops: %lu 1_second, %lu sleeps, "
-		"%lu 10_second, %lu background, %lu flush\n",
-		srv_main_1_second_loops, srv_main_sleeps,
-		srv_main_10_second_loops, srv_main_background_loops,
-		srv_main_flush_loops);
-	fprintf(file, "srv_master_thread log flush and writes: %lu\n",
-		      srv_log_writes_and_flush);
-}
-
-/*********************************************************************//**
-Sets the info describing an i/o thread current state. */
-UNIV_INTERN
-void
-srv_set_io_thread_op_info(
-/*======================*/
-	ulint		i,	/*!< in: the 'segment' of the i/o thread */
-	const char*	str)	/*!< in: constant char string describing the
-				state */
-{
-	ut_a(i < SRV_MAX_N_IO_THREADS);
-
-	srv_io_thread_op_info[i] = str;
-}
-
-/*********************************************************************//**
-Accessor function to get pointer to n'th slot in the server thread
-table.
-@return	pointer to the slot */
-static
-srv_slot_t*
-srv_table_get_nth_slot(
-/*===================*/
-	ulint	index)		/*!< in: index of the slot */
-{
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_a(index < OS_THREAD_MAX_N);
-
-	return(srv_sys->threads + index);
-}
-
-/*********************************************************************//**
-Gets the number of threads in the system.
-@return	sum of srv_n_threads[] */
-UNIV_INTERN
-ulint
-srv_get_n_threads(void)
-/*===================*/
-{
-	ulint	i;
-	ulint	n_threads	= 0;
-
-	mutex_enter(&kernel_mutex);
-
-	for (i = 0; i < SRV_MASTER + 1; i++) {
-
-		n_threads += srv_n_threads[i];
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	return(n_threads);
-}
-
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Validates the type of a thread table slot.
-@return TRUE if ok */
-static
-ibool
-srv_thread_type_validate(
-/*=====================*/
-	enum srv_thread_type	type)	/*!< in: thread type */
-{
-	switch (type) {
-	case SRV_WORKER:
-	case SRV_MASTER:
-		return(TRUE);
-	}
-	ut_error;
-	return(FALSE);
-}
-#endif /* UNIV_DEBUG */
-
-/*********************************************************************//**
-Gets the type of a thread table slot.
-@return thread type */
-static
-enum srv_thread_type
-srv_slot_get_type(
-/*==============*/
-	const srv_slot_t*	slot)	/*!< in: thread slot */
-{
-	enum srv_thread_type	type	= (enum srv_thread_type) slot->type;
-	ut_ad(srv_thread_type_validate(type));
-	return(type);
-}
-
-/*********************************************************************//**
-Reserves a slot in the thread table for the current thread.
-NOTE! The server mutex has to be reserved by the caller!
-@return	reserved slot */
-static
-srv_slot_t*
-srv_table_reserve_slot(
-/*===================*/
-	enum srv_thread_type	type)	/*!< in: type of the thread */
-{
-	srv_slot_t*	slot;
-	ulint		i;
-
-	ut_ad(srv_thread_type_validate(type));
-	ut_ad(mutex_own(&kernel_mutex));
-
-	i = 0;
-	slot = srv_table_get_nth_slot(i);
-
-	while (slot->in_use) {
-		i++;
-		slot = srv_table_get_nth_slot(i);
-	}
-
-	slot->in_use = TRUE;
-	slot->suspended = FALSE;
-	slot->type = type;
-	ut_ad(srv_slot_get_type(slot) == type);
-
-	return(slot);
-}
-
-/*********************************************************************//**
-Suspends the calling thread to wait for the event in its thread slot.
-NOTE! The server mutex has to be reserved by the caller! */
-static
-void
-srv_suspend_thread(
-/*===============*/
-	srv_slot_t*	slot)	/*!< in/out: thread slot */
-{
-	enum srv_thread_type	type;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(slot->in_use);
-	ut_ad(!slot->suspended);
-
-	if (srv_print_thread_releases) {
-		fprintf(stderr,
-			"Suspending thread %lu to slot %lu\n",
-			(ulong) os_thread_get_curr_id(),
-			(ulong) (slot - srv_sys->threads));
-	}
-
-	type = srv_slot_get_type(slot);
-
-	slot->suspended = TRUE;
-
-	ut_ad(srv_n_threads_active[type] > 0);
-
-	srv_n_threads_active[type]--;
-
-	os_event_reset(slot->event);
-}
-
-/*********************************************************************//**
-Releases threads of the type given from suspension in the thread table.
-NOTE! The server mutex has to be reserved by the caller!
-@return number of threads released: this may be less than n if not
-enough threads were suspended at the moment */
-UNIV_INTERN
-ulint
-srv_release_threads(
-/*================*/
-	enum srv_thread_type	type,	/*!< in: thread type */
-	ulint			n)	/*!< in: number of threads to release */
-{
-	srv_slot_t*	slot;
-	ulint		i;
-	ulint		count	= 0;
-
-	ut_ad(srv_thread_type_validate(type));
-	ut_ad(n > 0);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-
-		slot = srv_table_get_nth_slot(i);
-
-		if (slot->in_use && slot->suspended
-		    && srv_slot_get_type(slot) == type) {
-
-			slot->suspended = FALSE;
-
-			srv_n_threads_active[type]++;
-
-			os_event_set(slot->event);
-
-			if (srv_print_thread_releases) {
-				fprintf(stderr,
-					"Releasing thread type %lu"
-					" from slot %lu\n",
-					(ulong) type, (ulong) i);
-			}
-
-			count++;
-
-			if (count == n) {
-				break;
-			}
-		}
-	}
-
-	return(count);
-}
-
-/*********************************************************************//**
-Check whether thread type has reserved a slot. Return the first slot that
-is found. This works because we currently have only 1 thread of each type.
-@return	slot number or ULINT_UNDEFINED if not found*/
-UNIV_INTERN
-ulint
-srv_thread_has_reserved_slot(
-/*=========================*/
-	enum srv_thread_type	type)	/*!< in: thread type to check */
-{
-	ulint			i;
-	ulint			slot_no = ULINT_UNDEFINED;
-
-	ut_ad(srv_thread_type_validate(type));
-	mutex_enter(&kernel_mutex);
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-		srv_slot_t*	slot;
-
-		slot = srv_table_get_nth_slot(i);
-
-		if (slot->in_use && slot->type == type) {
-			slot_no = i;
-			break;
-		}
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	return(slot_no);
-}
-
-/*********************************************************************//**
-Initializes the server. */
-UNIV_INTERN
-void
-srv_init(void)
-/*==========*/
-{
-	srv_conc_slot_t*	conc_slot;
-	srv_slot_t*		slot;
-	ulint			i;
-
-	srv_sys = mem_alloc(sizeof(srv_sys_t));
-
-	kernel_mutex_temp = mem_alloc(sizeof(mutex_t));
-	mutex_create(kernel_mutex_key, &kernel_mutex, SYNC_KERNEL);
-
-	mutex_create(srv_innodb_monitor_mutex_key,
-		     &srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK);
-
-	srv_sys->threads = mem_zalloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-		slot = srv_sys->threads + i;
-		slot->event = os_event_create(NULL);
-		ut_a(slot->event);
-	}
-
-	srv_mysql_table = mem_zalloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-		slot = srv_mysql_table + i;
-		slot->event = os_event_create(NULL);
-		ut_a(slot->event);
-	}
-
-	srv_error_event = os_event_create(NULL);
-
-	srv_timeout_event = os_event_create(NULL);
-
-	srv_monitor_event = os_event_create(NULL);
-
-	srv_lock_timeout_thread_event = os_event_create(NULL);
-
-	for (i = 0; i < SRV_MASTER + 1; i++) {
-		srv_n_threads_active[i] = 0;
-		srv_n_threads[i] = 0;
-#if 0
-		srv_meter[i] = 30;
-		srv_meter_low_water[i] = 50;
-		srv_meter_high_water[i] = 100;
-		srv_meter_high_water2[i] = 200;
-		srv_meter_foreground[i] = 250;
-#endif
-	}
-
-	UT_LIST_INIT(srv_sys->tasks);
-
-	/* Create dummy indexes for infimum and supremum records */
-
-	dict_ind_init();
-
-	/* Init the server concurrency restriction data structures */
-
-	os_fast_mutex_init(&srv_conc_mutex);
-
-	UT_LIST_INIT(srv_conc_queue);
-
-	srv_conc_slots = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_conc_slot_t));
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-		conc_slot = srv_conc_slots + i;
-		conc_slot->reserved = FALSE;
-		conc_slot->event = os_event_create(NULL);
-		ut_a(conc_slot->event);
-	}
-
-	/* Initialize some INFORMATION SCHEMA internal structures */
-	trx_i_s_cache_init(trx_i_s_cache);
-}
-
-/*********************************************************************//**
-Frees the data structures created in srv_init(). */
-UNIV_INTERN
-void
-srv_free(void)
-/*==========*/
-{
-	os_fast_mutex_free(&srv_conc_mutex);
-	mem_free(srv_conc_slots);
-	srv_conc_slots = NULL;
-
-	mem_free(srv_sys->threads);
-	mem_free(srv_sys);
-	srv_sys = NULL;
-
-	mem_free(kernel_mutex_temp);
-	kernel_mutex_temp = NULL;
-	mem_free(srv_mysql_table);
-	srv_mysql_table = NULL;
-
-	trx_i_s_cache_free(trx_i_s_cache);
-}
-
-/*********************************************************************//**
-Initializes the synchronization primitives, memory system, and the thread
-local storage. */
-UNIV_INTERN
-void
-srv_general_init(void)
-/*==================*/
-{
-	ut_mem_init();
-	/* Reset the system variables in the recovery module. */
-	recv_sys_var_init();
-	os_sync_init();
-	sync_init();
-	mem_init(srv_mem_pool_size);
-}
-
-/*======================= InnoDB Server FIFO queue =======================*/
-
-/* Maximum allowable purge history length.  <=0 means 'infinite'. */
-UNIV_INTERN ulong	srv_max_purge_lag		= 0;
-
-/*********************************************************************//**
-Puts an OS thread to wait if there are too many concurrent threads
-(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
-UNIV_INTERN
-void
-srv_conc_enter_innodb(
-/*==================*/
-	trx_t*	trx)	/*!< in: transaction object associated with the
-			thread */
-{
-	ibool			has_slept = FALSE;
-	srv_conc_slot_t*	slot	  = NULL;
-	ulint			i;
-
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (trx->mysql_thd != NULL
-	    && thd_is_replication_slave_thread(trx->mysql_thd)) {
-
-		UT_WAIT_FOR(srv_conc_n_threads
-			    < (lint)srv_thread_concurrency,
-			    srv_replication_delay * 1000);
-
-		return;
-	}
-
-	/* If trx has 'free tickets' to enter the engine left, then use one
-	such ticket */
-
-	if (trx->n_tickets_to_enter_innodb > 0) {
-		trx->n_tickets_to_enter_innodb--;
-
-		return;
-	}
-
-	os_fast_mutex_lock(&srv_conc_mutex);
-retry:
-	if (trx->declared_to_be_inside_innodb) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: trying to declare trx"
-		      " to enter InnoDB, but\n"
-		      "InnoDB: it already is declared.\n", stderr);
-		trx_print(stderr, trx, 0);
-		putc('\n', stderr);
-		os_fast_mutex_unlock(&srv_conc_mutex);
-
-		return;
-	}
-
-	ut_ad(srv_conc_n_threads >= 0);
-
-	if (srv_conc_n_threads < (lint)srv_thread_concurrency) {
-
-		srv_conc_n_threads++;
-		trx->declared_to_be_inside_innodb = TRUE;
-		trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
-
-		os_fast_mutex_unlock(&srv_conc_mutex);
-
-		return;
-	}
-
-	/* If the transaction is not holding resources, let it sleep
-	for SRV_THREAD_SLEEP_DELAY microseconds, and try again then */
-
-	if (!has_slept && !trx->has_search_latch
-	    && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) {
-
-		has_slept = TRUE; /* We let it sleep only once to avoid
-				  starvation */
-
-		srv_conc_n_waiting_threads++;
-
-		os_fast_mutex_unlock(&srv_conc_mutex);
-
-		trx->op_info = "sleeping before joining InnoDB queue";
-
-		/* Peter Zaitsev suggested that we take the sleep away
-		altogether. But the sleep may be good in pathological
-		situations of lots of thread switches. Simply put some
-		threads aside for a while to reduce the number of thread
-		switches. */
-		if (SRV_THREAD_SLEEP_DELAY > 0) {
-			os_thread_sleep(SRV_THREAD_SLEEP_DELAY);
-		}
-
-		trx->op_info = "";
-
-		os_fast_mutex_lock(&srv_conc_mutex);
-
-		srv_conc_n_waiting_threads--;
-
-		goto retry;
-	}
-
-	/* Too many threads inside: put the current thread to a queue */
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-		slot = srv_conc_slots + i;
-
-		if (!slot->reserved) {
-
-			break;
-		}
-	}
-
-	if (i == OS_THREAD_MAX_N) {
-		/* Could not find a free wait slot, we must let the
-		thread enter */
-
-		srv_conc_n_threads++;
-		trx->declared_to_be_inside_innodb = TRUE;
-		trx->n_tickets_to_enter_innodb = 0;
-
-		os_fast_mutex_unlock(&srv_conc_mutex);
-
-		return;
-	}
-
-	/* Release possible search system latch this thread has */
-	if (trx->has_search_latch) {
-		trx_search_latch_release_if_reserved(trx);
-	}
-
-	/* Add to the queue */
-	slot->reserved = TRUE;
-	slot->wait_ended = FALSE;
-
-	UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot);
-
-	os_event_reset(slot->event);
-
-	srv_conc_n_waiting_threads++;
-
-	os_fast_mutex_unlock(&srv_conc_mutex);
-
-	/* Go to wait for the event; when a thread leaves InnoDB it will
-	release this thread */
-
-	ut_ad(!trx->has_search_latch);
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-	trx->op_info = "waiting in InnoDB queue";
-
-	thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK);
-	os_event_wait(slot->event);
-	thd_wait_end(trx->mysql_thd);
-
-	trx->op_info = "";
-
-	os_fast_mutex_lock(&srv_conc_mutex);
-
-	srv_conc_n_waiting_threads--;
-
-	/* NOTE that the thread which released this thread already
-	incremented the thread counter on behalf of this thread */
-
-	slot->reserved = FALSE;
-
-	UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot);
-
-	trx->declared_to_be_inside_innodb = TRUE;
-	trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
-
-	os_fast_mutex_unlock(&srv_conc_mutex);
-}
-
-/*********************************************************************//**
-This lets a thread enter InnoDB regardless of the number of threads inside
-InnoDB. This must be called when a thread ends a lock wait. */
-UNIV_INTERN
-void
-srv_conc_force_enter_innodb(
-/*========================*/
-	trx_t*	trx)	/*!< in: transaction object associated with the
-			thread */
-{
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (UNIV_LIKELY(!srv_thread_concurrency)) {
-
-		return;
-	}
-
-	ut_ad(srv_conc_n_threads >= 0);
-
-	os_fast_mutex_lock(&srv_conc_mutex);
-
-	srv_conc_n_threads++;
-	trx->declared_to_be_inside_innodb = TRUE;
-	trx->n_tickets_to_enter_innodb = 1;
-
-	os_fast_mutex_unlock(&srv_conc_mutex);
-}
-
-/*********************************************************************//**
-This must be called when a thread exits InnoDB in a lock wait or at the
-end of an SQL statement. */
-UNIV_INTERN
-void
-srv_conc_force_exit_innodb(
-/*=======================*/
-	trx_t*	trx)	/*!< in: transaction object associated with the
-			thread */
-{
-	srv_conc_slot_t*	slot	= NULL;
-
-	if (trx->mysql_thd != NULL
-	    && thd_is_replication_slave_thread(trx->mysql_thd)) {
-
-		return;
-	}
-
-	if (trx->declared_to_be_inside_innodb == FALSE) {
-
-		return;
-	}
-
-	os_fast_mutex_lock(&srv_conc_mutex);
-
-	ut_ad(srv_conc_n_threads > 0);
-	srv_conc_n_threads--;
-	trx->declared_to_be_inside_innodb = FALSE;
-	trx->n_tickets_to_enter_innodb = 0;
-
-	if (srv_conc_n_threads < (lint)srv_thread_concurrency) {
-		/* Look for a slot where a thread is waiting and no other
-		thread has yet released the thread */
-
-		slot = UT_LIST_GET_FIRST(srv_conc_queue);
-
-		while (slot && slot->wait_ended == TRUE) {
-			slot = UT_LIST_GET_NEXT(srv_conc_queue, slot);
-		}
-
-		if (slot != NULL) {
-			slot->wait_ended = TRUE;
-
-			/* We increment the count on behalf of the released
-			thread */
-
-			srv_conc_n_threads++;
-		}
-	}
-
-	os_fast_mutex_unlock(&srv_conc_mutex);
-
-	if (slot != NULL) {
-		os_event_set(slot->event);
-	}
-
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-}
-
-/*********************************************************************//**
-This must be called when a thread exits InnoDB. */
-UNIV_INTERN
-void
-srv_conc_exit_innodb(
-/*=================*/
-	trx_t*	trx)	/*!< in: transaction object associated with the
-			thread */
-{
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (trx->n_tickets_to_enter_innodb > 0) {
-		/* We will pretend the thread is still inside InnoDB though it
-		now leaves the InnoDB engine. In this way we save
-		a lot of semaphore operations. srv_conc_force_exit_innodb is
-		used to declare the thread definitely outside InnoDB. It
-		should be called when there is a lock wait or an SQL statement
-		ends. */
-
-		return;
-	}
-
-	srv_conc_force_exit_innodb(trx);
-}
-
-/*========================================================================*/
-
-/*********************************************************************//**
-Normalizes init parameter values to use units we use inside InnoDB.
-@return	DB_SUCCESS or error code */
-static
-ulint
-srv_normalize_init_values(void)
-/*===========================*/
-{
-	ulint	n;
-	ulint	i;
-
-	n = srv_n_data_files;
-
-	for (i = 0; i < n; i++) {
-		srv_data_file_sizes[i] = srv_data_file_sizes[i]
-			* ((1024 * 1024) / UNIV_PAGE_SIZE);
-	}
-
-	srv_last_file_size_max = srv_last_file_size_max
-		* ((1024 * 1024) / UNIV_PAGE_SIZE);
-
-	srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE;
-
-	srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE;
-
-	srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE);
-
-	return(DB_SUCCESS);
-}
-
-/*********************************************************************//**
-Boots the InnoDB server.
-@return	DB_SUCCESS or error code */
-UNIV_INTERN
-ulint
-srv_boot(void)
-/*==========*/
-{
-	ulint	err;
-
-	/* Transform the init parameter values given by MySQL to
-	use units we use inside InnoDB: */
-
-	err = srv_normalize_init_values();
-
-	if (err != DB_SUCCESS) {
-		return(err);
-	}
-
-	/* Initialize synchronization primitives, memory management, and thread
-	local storage */
-
-	srv_general_init();
-
-	/* Initialize this module */
-
-	srv_init();
-
-	return(DB_SUCCESS);
-}
-
-/*********************************************************************//**
-Reserves a slot in the thread table for the current MySQL OS thread.
-NOTE! The kernel mutex has to be reserved by the caller!
-@return	reserved slot */
-static
-srv_slot_t*
-srv_table_reserve_slot_for_mysql(void)
-/*==================================*/
-{
-	srv_slot_t*	slot;
-	ulint		i;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	i = 0;
-	slot = srv_mysql_table + i;
-
-	while (slot->in_use) {
-		i++;
-
-		if (UNIV_UNLIKELY(i >= OS_THREAD_MAX_N)) {
-
-			ut_print_timestamp(stderr);
-
-			fprintf(stderr,
-				"  InnoDB: There appear to be %lu MySQL"
-				" threads currently waiting\n"
-				"InnoDB: inside InnoDB, which is the"
-				" upper limit. Cannot continue operation.\n"
-				"InnoDB: We intentionally generate"
-				" a seg fault to print a stack trace\n"
-				"InnoDB: on Linux. But first we print"
-				" a list of waiting threads.\n", (ulong) i);
-
-			for (i = 0; i < OS_THREAD_MAX_N; i++) {
-
-				slot = srv_mysql_table + i;
-
-				fprintf(stderr,
-					"Slot %lu: thread type %lu,"
-					" in use %lu, susp %lu, time %lu\n",
-					(ulong) i,
-					(ulong) slot->type,
-					(ulong) slot->in_use,
-					(ulong) slot->suspended,
-					(ulong) difftime(ut_time(),
-							 slot->suspend_time));
-			}
-
-			ut_error;
-		}
-
-		slot = srv_mysql_table + i;
-	}
-
-	ut_a(slot->in_use == FALSE);
-
-	slot->in_use = TRUE;
-
-	return(slot);
-}
-
-/***************************************************************//**
-Puts a MySQL OS thread to wait for a lock to be released. If an error
-occurs during the wait trx->error_state associated with thr is
-!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
-are possible errors. DB_DEADLOCK is returned if selective deadlock
-resolution chose this transaction as a victim. */
-UNIV_INTERN
-void
-srv_suspend_mysql_thread(
-/*=====================*/
-	que_thr_t*	thr)	/*!< in: query thread associated with the MySQL
-				OS thread */
-{
-	srv_slot_t*	slot;
-	os_event_t	event;
-	double		wait_time;
-	trx_t*		trx;
-	ulint		had_dict_lock;
-	ibool		was_declared_inside_innodb	= FALSE;
-	ib_int64_t	start_time			= 0;
-	ib_int64_t	finish_time;
-	ulint		diff_time;
-	ulint		sec;
-	ulint		ms;
-	ulong		lock_wait_timeout;
-
-	ut_ad(!mutex_own(&kernel_mutex));
-
-	trx = thr_get_trx(thr);
-
-	os_event_set(srv_lock_timeout_thread_event);
-
-	mutex_enter(&kernel_mutex);
-
-	trx->error_state = DB_SUCCESS;
-
-	if (thr->state == QUE_THR_RUNNING) {
-
-		ut_ad(thr->is_active == TRUE);
-
-		/* The lock has already been released or this transaction
-		was chosen as a deadlock victim: no need to suspend */
-
-		if (trx->was_chosen_as_deadlock_victim) {
-
-			trx->error_state = DB_DEADLOCK;
-			trx->was_chosen_as_deadlock_victim = FALSE;
-		}
-
-		mutex_exit(&kernel_mutex);
-
-		return;
-	}
-
-	ut_ad(thr->is_active == FALSE);
-
-	slot = srv_table_reserve_slot_for_mysql();
-
-	event = slot->event;
-
-	slot->thr = thr;
-
-	os_event_reset(event);
-
-	slot->suspend_time = ut_time();
-
-	if (thr->lock_state == QUE_THR_LOCK_ROW) {
-		srv_n_lock_wait_count++;
-		srv_n_lock_wait_current_count++;
-
-		if (ut_usectime(&sec, &ms) == -1) {
-			start_time = -1;
-		} else {
-			start_time = (ib_int64_t) sec * 1000000 + ms;
-		}
-	}
-	/* Wake the lock timeout monitor thread, if it is suspended */
-
-	os_event_set(srv_lock_timeout_thread_event);
-
-	mutex_exit(&kernel_mutex);
-
-	had_dict_lock = trx->dict_operation_lock_mode;
-
-	switch (had_dict_lock) {
-	case RW_S_LATCH:
-		/* Release foreign key check latch */
-		row_mysql_unfreeze_data_dictionary(trx);
-		break;
-	case RW_X_LATCH:
-		/* There should never be a lock wait when the
-		dictionary latch is reserved in X mode.  Dictionary
-		transactions should only acquire locks on dictionary
-		tables, not other tables. All access to dictionary
-		tables should be covered by dictionary
-		transactions. */
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: dict X latch held in "
-		      "srv_suspend_mysql_thread\n", stderr);
-		/* This should never occur. This incorrect handling
-		was added in the early development of
-		ha_innobase::add_index() in InnoDB Plugin 1.0. */
-		/* Release fast index creation latch */
-		row_mysql_unlock_data_dictionary(trx);
-		break;
-	}
-
-	ut_a(trx->dict_operation_lock_mode == 0);
-
-	if (trx->declared_to_be_inside_innodb) {
-
-		was_declared_inside_innodb = TRUE;
-
-		/* We must declare this OS thread to exit InnoDB, since a
-		possible other thread holding a lock which this thread waits
-		for must be allowed to enter, sooner or later */
-
-		srv_conc_force_exit_innodb(trx);
-	}
-
-	/* Suspend this thread and wait for the event. */
-
-	thd_wait_begin(trx->mysql_thd, THD_WAIT_ROW_LOCK);
-	os_event_wait(event);
-	thd_wait_end(trx->mysql_thd);
-
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (was_declared_inside_innodb) {
-
-		/* Return back inside InnoDB */
-
-		srv_conc_force_enter_innodb(trx);
-	}
-
-	/* After resuming, reacquire the data dictionary latch if
-	necessary. */
-
-	switch (had_dict_lock) {
-	case RW_S_LATCH:
-		row_mysql_freeze_data_dictionary(trx);
-		break;
-	case RW_X_LATCH:
-		/* This should never occur. This incorrect handling
-		was added in the early development of
-		ha_innobase::add_index() in InnoDB Plugin 1.0. */
-		row_mysql_lock_data_dictionary(trx);
-		break;
-	}
-
-	mutex_enter(&kernel_mutex);
-
-	/* Release the slot for others to use */
-
-	slot->in_use = FALSE;
-
-	wait_time = ut_difftime(ut_time(), slot->suspend_time);
-
-	if (thr->lock_state == QUE_THR_LOCK_ROW) {
-		if (ut_usectime(&sec, &ms) == -1) {
-			finish_time = -1;
-		} else {
-			finish_time = (ib_int64_t) sec * 1000000 + ms;
-		}
-
-		diff_time = (ulint) (finish_time - start_time);
-
-		srv_n_lock_wait_current_count--;
-		srv_n_lock_wait_time = srv_n_lock_wait_time + diff_time;
-		if (diff_time > srv_n_lock_max_wait_time &&
-		    /* only update the variable if we successfully
-		    retrieved the start and finish times. See Bug#36819. */
-		    start_time != -1 && finish_time != -1) {
-			srv_n_lock_max_wait_time = diff_time;
-		}
-
-		/* Record the lock wait time for this thread */
-		thd_set_lock_wait_time(trx->mysql_thd, diff_time);
-	}
-
-	if (trx->was_chosen_as_deadlock_victim) {
-
-		trx->error_state = DB_DEADLOCK;
-		trx->was_chosen_as_deadlock_victim = FALSE;
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	/* InnoDB system transactions (such as the purge, and
-	incomplete transactions that are being rolled back after crash
-	recovery) will use the global value of
-	innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
-	lock_wait_timeout = thd_lock_wait_timeout(trx->mysql_thd);
-
-	if (lock_wait_timeout < 100000000
-	    && wait_time > (double) lock_wait_timeout) {
-
-		trx->error_state = DB_LOCK_WAIT_TIMEOUT;
-	}
-
-	if (trx_is_interrupted(trx)) {
-
-		trx->error_state = DB_INTERRUPTED;
-	}
-}
-
-/********************************************************************//**
-Releases a MySQL OS thread waiting for a lock to be released, if the
-thread is already suspended. */
-UNIV_INTERN
-void
-srv_release_mysql_thread_if_suspended(
-/*==================================*/
-	que_thr_t*	thr)	/*!< in: query thread associated with the
-				MySQL OS thread	 */
-{
-	srv_slot_t*	slot;
-	ulint		i;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-
-		slot = srv_mysql_table + i;
-
-		if (slot->in_use && slot->thr == thr) {
-			/* Found */
-
-			os_event_set(slot->event);
-
-			return;
-		}
-	}
-
-	/* not found */
-}
-
-/******************************************************************//**
-Refreshes the values used to calculate per-second averages. */
-static
-void
-srv_refresh_innodb_monitor_stats(void)
-/*==================================*/
-{
-	mutex_enter(&srv_innodb_monitor_mutex);
-
-	srv_last_monitor_time = time(NULL);
-
-	os_aio_refresh_stats();
-
-	btr_cur_n_sea_old = btr_cur_n_sea;
-	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
-
-	log_refresh_stats();
-
-	buf_refresh_io_stats_all();
-
-	srv_n_rows_inserted_old = srv_n_rows_inserted;
-	srv_n_rows_updated_old = srv_n_rows_updated;
-	srv_n_rows_deleted_old = srv_n_rows_deleted;
-	srv_n_rows_read_old = srv_n_rows_read;
-
-	mutex_exit(&srv_innodb_monitor_mutex);
-}
-
-/******************************************************************//**
-Outputs to a file the output of the InnoDB Monitor.
-@return FALSE if not all information printed
-due to failure to obtain necessary mutex */
-UNIV_INTERN
-ibool
-srv_printf_innodb_monitor(
-/*======================*/
-	FILE*	file,		/*!< in: output stream */
-	ibool	nowait,		/*!< in: whether to wait for kernel mutex */
-	ulint*	trx_start,	/*!< out: file position of the start of
-				the list of active transactions */
-	ulint*	trx_end)	/*!< out: file position of the end of
-				the list of active transactions */
-{
-	double	time_elapsed;
-	time_t	current_time;
-	ulint	n_reserved;
-	ibool	ret;
-
-	mutex_enter(&srv_innodb_monitor_mutex);
-
-	current_time = time(NULL);
-
-	/* We add 0.001 seconds to time_elapsed to prevent division
-	by zero if two users happen to call SHOW INNODB STATUS at the same
-	time */
-
-	time_elapsed = difftime(current_time, srv_last_monitor_time)
-		+ 0.001;
-
-	srv_last_monitor_time = time(NULL);
-
-	fputs("\n=====================================\n", file);
-
-	ut_print_timestamp(file);
-	fprintf(file,
-		" INNODB MONITOR OUTPUT\n"
-		"=====================================\n"
-		"Per second averages calculated from the last %lu seconds\n",
-		(ulong)time_elapsed);
-
-	fputs("-----------------\n"
-	      "BACKGROUND THREAD\n"
-	      "-----------------\n", file);
-	srv_print_master_thread_info(file);
-
-	fputs("----------\n"
-	      "SEMAPHORES\n"
-	      "----------\n", file);
-	sync_print(file);
-
-	/* Conceptually, srv_innodb_monitor_mutex has a very high latching
-	order level in sync0sync.h, while dict_foreign_err_mutex has a very
-	low level 135. Therefore we can reserve the latter mutex here without
-	a danger of a deadlock of threads. */
-
-	mutex_enter(&dict_foreign_err_mutex);
-
-	if (ftell(dict_foreign_err_file) != 0L) {
-		fputs("------------------------\n"
-		      "LATEST FOREIGN KEY ERROR\n"
-		      "------------------------\n", file);
-		ut_copy_file(file, dict_foreign_err_file);
-	}
-
-	mutex_exit(&dict_foreign_err_mutex);
-
-	/* Only if lock_print_info_summary proceeds correctly,
-	before we call the lock_print_info_all_transactions
-	to print all the lock information. */
-	ret = lock_print_info_summary(file, nowait);
-
-	if (ret) {
-		if (trx_start) {
-			long	t = ftell(file);
-			if (t < 0) {
-				*trx_start = ULINT_UNDEFINED;
-			} else {
-				*trx_start = (ulint) t;
-			}
-		}
-		lock_print_info_all_transactions(file);
-		if (trx_end) {
-			long	t = ftell(file);
-			if (t < 0) {
-				*trx_end = ULINT_UNDEFINED;
-			} else {
-				*trx_end = (ulint) t;
-			}
-		}
-	}
-
-	fputs("--------\n"
-	      "FILE I/O\n"
-	      "--------\n", file);
-	os_aio_print(file);
-
-	fputs("-------------------------------------\n"
-	      "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
-	      "-------------------------------------\n", file);
-	ibuf_print(file);
-
-	ha_print_info(file, btr_search_sys->hash_index);
-
-	fprintf(file,
-		"%.2f hash searches/s, %.2f non-hash searches/s\n",
-		(btr_cur_n_sea - btr_cur_n_sea_old)
-		/ time_elapsed,
-		(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
-		/ time_elapsed);
-	btr_cur_n_sea_old = btr_cur_n_sea;
-	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
-
-	fputs("---\n"
-	      "LOG\n"
-	      "---\n", file);
-	log_print(file);
-
-	fputs("----------------------\n"
-	      "BUFFER POOL AND MEMORY\n"
-	      "----------------------\n", file);
-	fprintf(file,
-		"Total memory allocated " ULINTPF
-		"; in additional pool allocated " ULINTPF "\n",
-		ut_total_allocated_memory,
-		mem_pool_get_reserved(mem_comm_pool));
-	fprintf(file, "Dictionary memory allocated " ULINTPF "\n",
-		dict_sys->size);
-
-	buf_print_io(file);
-
-	fputs("--------------\n"
-	      "ROW OPERATIONS\n"
-	      "--------------\n", file);
-	fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n",
-		(long) srv_conc_n_threads,
-		(ulong) srv_conc_n_waiting_threads);
-
-	fprintf(file, "%lu read views open inside InnoDB\n",
-		UT_LIST_GET_LEN(trx_sys->view_list));
-
-	n_reserved = fil_space_get_n_reserved_extents(0);
-	if (n_reserved > 0) {
-		fprintf(file,
-			"%lu tablespace extents now reserved for"
-			" B-tree split operations\n",
-			(ulong) n_reserved);
-	}
-
-#ifdef UNIV_LINUX
-	fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n",
-		(ulong) srv_main_thread_process_no,
-		(ulong) srv_main_thread_id,
-		srv_main_thread_op_info);
-#else
-	fprintf(file, "Main thread id %lu, state: %s\n",
-		(ulong) srv_main_thread_id,
-		srv_main_thread_op_info);
-#endif
-	fprintf(file,
-		"Number of rows inserted " ULINTPF
-		", updated " ULINTPF ", deleted " ULINTPF
-		", read " ULINTPF "\n",
-		srv_n_rows_inserted,
-		srv_n_rows_updated,
-		srv_n_rows_deleted,
-		srv_n_rows_read);
-	fprintf(file,
-		"%.2f inserts/s, %.2f updates/s,"
-		" %.2f deletes/s, %.2f reads/s\n",
-		(srv_n_rows_inserted - srv_n_rows_inserted_old)
-		/ time_elapsed,
-		(srv_n_rows_updated - srv_n_rows_updated_old)
-		/ time_elapsed,
-		(srv_n_rows_deleted - srv_n_rows_deleted_old)
-		/ time_elapsed,
-		(srv_n_rows_read - srv_n_rows_read_old)
-		/ time_elapsed);
-
-	srv_n_rows_inserted_old = srv_n_rows_inserted;
-	srv_n_rows_updated_old = srv_n_rows_updated;
-	srv_n_rows_deleted_old = srv_n_rows_deleted;
-	srv_n_rows_read_old = srv_n_rows_read;
-
-	fputs("----------------------------\n"
-	      "END OF INNODB MONITOR OUTPUT\n"
-	      "============================\n", file);
-	mutex_exit(&srv_innodb_monitor_mutex);
-	fflush(file);
-
-	return(ret);
-}
-
-/******************************************************************//**
-Function to pass InnoDB status variables to MySQL */
-UNIV_INTERN
-void
-srv_export_innodb_status(void)
-/*==========================*/
-{
-	buf_pool_stat_t	stat;
-	ulint		LRU_len;
-	ulint		free_len;
-	ulint		flush_list_len;
-
-	buf_get_total_stat(&stat);
-	buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
-
-	mutex_enter(&srv_innodb_monitor_mutex);
-
-	export_vars.innodb_data_pending_reads
-		= os_n_pending_reads;
-	export_vars.innodb_data_pending_writes
-		= os_n_pending_writes;
-	export_vars.innodb_data_pending_fsyncs
-		= fil_n_pending_log_flushes
-		+ fil_n_pending_tablespace_flushes;
-	export_vars.innodb_data_fsyncs = os_n_fsyncs;
-	export_vars.innodb_data_read = srv_data_read;
-	export_vars.innodb_data_reads = os_n_file_reads;
-	export_vars.innodb_data_writes = os_n_file_writes;
-	export_vars.innodb_data_written = srv_data_written;
-	export_vars.innodb_buffer_pool_read_requests = stat.n_page_gets;
-	export_vars.innodb_buffer_pool_write_requests
-		= srv_buf_pool_write_requests;
-	export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free;
-	export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed;
-	export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads;
-	export_vars.innodb_buffer_pool_read_ahead_rnd
-		= stat.n_ra_pages_read_rnd;
-	export_vars.innodb_buffer_pool_read_ahead
-		= stat.n_ra_pages_read;
-	export_vars.innodb_buffer_pool_read_ahead_evicted
-		= stat.n_ra_pages_evicted;
-	export_vars.innodb_buffer_pool_pages_data = LRU_len;
-	export_vars.innodb_buffer_pool_pages_dirty = flush_list_len;
-	export_vars.innodb_buffer_pool_pages_free = free_len;
-#ifdef UNIV_DEBUG
-	export_vars.innodb_buffer_pool_pages_latched
-		= buf_get_latched_pages_number();
-#endif /* UNIV_DEBUG */
-	export_vars.innodb_buffer_pool_pages_total = buf_pool_get_n_pages();
-
-	export_vars.innodb_buffer_pool_pages_misc
-	       	= buf_pool_get_n_pages() - LRU_len - free_len;
-#ifdef HAVE_ATOMIC_BUILTINS
-	export_vars.innodb_have_atomic_builtins = 1;
-#else
-	export_vars.innodb_have_atomic_builtins = 0;
-#endif
-	export_vars.innodb_page_size = UNIV_PAGE_SIZE;
-	export_vars.innodb_log_waits = srv_log_waits;
-	export_vars.innodb_os_log_written = srv_os_log_written;
-	export_vars.innodb_os_log_fsyncs = fil_n_log_flushes;
-	export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes;
-	export_vars.innodb_os_log_pending_writes = srv_os_log_pending_writes;
-	export_vars.innodb_log_write_requests = srv_log_write_requests;
-	export_vars.innodb_log_writes = srv_log_writes;
-	export_vars.innodb_dblwr_pages_written = srv_dblwr_pages_written;
-	export_vars.innodb_dblwr_writes = srv_dblwr_writes;
-	export_vars.innodb_pages_created = stat.n_pages_created;
-	export_vars.innodb_pages_read = stat.n_pages_read;
-	export_vars.innodb_pages_written = stat.n_pages_written;
-	export_vars.innodb_row_lock_waits = srv_n_lock_wait_count;
-	export_vars.innodb_row_lock_current_waits
-		= srv_n_lock_wait_current_count;
-	export_vars.innodb_row_lock_time = srv_n_lock_wait_time / 1000;
-	if (srv_n_lock_wait_count > 0) {
-		export_vars.innodb_row_lock_time_avg = (ulint)
-			(srv_n_lock_wait_time / 1000 / srv_n_lock_wait_count);
-	} else {
-		export_vars.innodb_row_lock_time_avg = 0;
-	}
-	export_vars.innodb_row_lock_time_max
-		= srv_n_lock_max_wait_time / 1000;
-	export_vars.innodb_rows_read = srv_n_rows_read;
-	export_vars.innodb_rows_inserted = srv_n_rows_inserted;
-	export_vars.innodb_rows_updated = srv_n_rows_updated;
-	export_vars.innodb_rows_deleted = srv_n_rows_deleted;
-	export_vars.innodb_truncated_status_writes = srv_truncated_status_writes;
-
-	mutex_exit(&srv_innodb_monitor_mutex);
-}
-
-/*********************************************************************//**
-A thread which prints the info output by various InnoDB monitors.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_monitor_thread(
-/*===============*/
-	void*	arg __attribute__((unused)))
-			/*!< in: a dummy parameter required by
-			os_thread_create */
-{
-	ib_int64_t	sig_count;
-	double		time_elapsed;
-	time_t		current_time;
-	time_t		last_table_monitor_time;
-	time_t		last_tablespace_monitor_time;
-	time_t		last_monitor_time;
-	ulint		mutex_skipped;
-	ibool		last_srv_print_monitor;
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "Lock timeout thread starts, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_monitor_thread_key);
-#endif
-
-	UT_NOT_USED(arg);
-	srv_last_monitor_time = ut_time();
-	last_table_monitor_time = ut_time();
-	last_tablespace_monitor_time = ut_time();
-	last_monitor_time = ut_time();
-	mutex_skipped = 0;
-	last_srv_print_monitor = srv_print_innodb_monitor;
-loop:
-	srv_monitor_active = TRUE;
-
-	/* Wake up every 5 seconds to see if we need to print
-	monitor information or if signalled at shutdown. */
-
-	sig_count = os_event_reset(srv_monitor_event);
-
-	os_event_wait_time_low(srv_monitor_event, 5000000, sig_count);
-
-	current_time = ut_time();
-
-	time_elapsed = difftime(current_time, last_monitor_time);
-
-	if (time_elapsed > 15) {
-		last_monitor_time = ut_time();
-
-		if (srv_print_innodb_monitor) {
-			/* Reset mutex_skipped counter everytime
-			srv_print_innodb_monitor changes. This is to
-			ensure we will not be blocked by kernel_mutex
-			for short duration information printing,
-			such as requested by sync_array_print_long_waits() */
-			if (!last_srv_print_monitor) {
-				mutex_skipped = 0;
-				last_srv_print_monitor = TRUE;
-			}
-
-			if (!srv_printf_innodb_monitor(stderr,
-						MUTEX_NOWAIT(mutex_skipped),
-						NULL, NULL)) {
-				mutex_skipped++;
-			} else {
-				/* Reset the counter */
-				mutex_skipped = 0;
-			}
-		} else {
-			last_srv_print_monitor = FALSE;
-		}
-
-
-		if (srv_innodb_status) {
-			mutex_enter(&srv_monitor_file_mutex);
-			rewind(srv_monitor_file);
-			if (!srv_printf_innodb_monitor(srv_monitor_file,
-						MUTEX_NOWAIT(mutex_skipped),
-						NULL, NULL)) {
-				mutex_skipped++;
-			} else {
-				mutex_skipped = 0;
-			}
-
-			os_file_set_eof(srv_monitor_file);
-			mutex_exit(&srv_monitor_file_mutex);
-		}
-
-		if (srv_print_innodb_tablespace_monitor
-		    && difftime(current_time,
-				last_tablespace_monitor_time) > 60) {
-			last_tablespace_monitor_time = ut_time();
-
-			fputs("========================"
-			      "========================\n",
-			      stderr);
-
-			ut_print_timestamp(stderr);
-
-			fputs(" INNODB TABLESPACE MONITOR OUTPUT\n"
-			      "========================"
-			      "========================\n",
-			      stderr);
-
-			fsp_print(0);
-			fputs("Validating tablespace\n", stderr);
-			fsp_validate(0);
-			fputs("Validation ok\n"
-			      "---------------------------------------\n"
-			      "END OF INNODB TABLESPACE MONITOR OUTPUT\n"
-			      "=======================================\n",
-			      stderr);
-		}
-
-		if (srv_print_innodb_table_monitor
-		    && difftime(current_time, last_table_monitor_time) > 60) {
-
-			last_table_monitor_time = ut_time();
-
-			fputs("===========================================\n",
-			      stderr);
-
-			ut_print_timestamp(stderr);
-
-			fputs(" INNODB TABLE MONITOR OUTPUT\n"
-			      "===========================================\n",
-			      stderr);
-			dict_print();
-
-			fputs("-----------------------------------\n"
-			      "END OF INNODB TABLE MONITOR OUTPUT\n"
-			      "==================================\n",
-			      stderr);
-		}
-	}
-
-	if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
-		goto exit_func;
-	}
-
-	if (srv_print_innodb_monitor
-	    || srv_print_innodb_lock_monitor
-	    || srv_print_innodb_tablespace_monitor
-	    || srv_print_innodb_table_monitor) {
-		goto loop;
-	}
-
-	srv_monitor_active = FALSE;
-
-	goto loop;
-
-exit_func:
-	srv_monitor_active = FALSE;
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-
-	os_thread_exit(NULL);
-
-	OS_THREAD_DUMMY_RETURN;
-}
-
-/*********************************************************************//**
-A thread which wakes up threads whose lock wait may have lasted too long.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_lock_timeout_thread(
-/*====================*/
-	void*	arg __attribute__((unused)))
-			/* in: a dummy parameter required by
-			os_thread_create */
-{
-	srv_slot_t*	slot;
-	ibool		some_waits;
-	double		wait_time;
-	ulint		i;
-	ib_int64_t	sig_count;
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_lock_timeout_thread_key);
-#endif
-
-loop:
-
-	/* When someone is waiting for a lock, we wake up every second
-	and check if a timeout has passed for a lock wait */
-
-	sig_count = os_event_reset(srv_timeout_event);
-
-	os_event_wait_time_low(srv_timeout_event, 1000000, sig_count);
-
-	srv_lock_timeout_active = TRUE;
-
-	mutex_enter(&kernel_mutex);
-
-	some_waits = FALSE;
-
-	/* Check of all slots if a thread is waiting there, and if it
-	has exceeded the time limit */
-
-	for (i = 0; i < OS_THREAD_MAX_N; i++) {
-
-		slot = srv_mysql_table + i;
-
-		if (slot->in_use) {
-			trx_t*	trx;
-			ulong	lock_wait_timeout;
-
-			some_waits = TRUE;
-
-			wait_time = ut_difftime(ut_time(), slot->suspend_time);
-
-			trx = thr_get_trx(slot->thr);
-			lock_wait_timeout = thd_lock_wait_timeout(
-				trx->mysql_thd);
-
-			if (trx_is_interrupted(trx)
-			    || (lock_wait_timeout < 100000000
-				&& (wait_time > (double) lock_wait_timeout
-				    || wait_time < 0))) {
-
-				/* Timeout exceeded or a wrap-around in system
-				time counter: cancel the lock request queued
-				by the transaction and release possible
-				other transactions waiting behind; it is
-				possible that the lock has already been
-				granted: in that case do nothing */
-
-				if (trx->wait_lock) {
-					lock_cancel_waiting_and_release(
-						trx->wait_lock);
-				}
-			}
-		}
-	}
-
-	os_event_reset(srv_lock_timeout_thread_event);
-
-	mutex_exit(&kernel_mutex);
-
-	if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
-		goto exit_func;
-	}
-
-	if (some_waits) {
-		goto loop;
-	}
-
-	srv_lock_timeout_active = FALSE;
-
-#if 0
-	/* The following synchronisation is disabled, since
-	the InnoDB monitor output is to be updated every 15 seconds. */
-	os_event_wait(srv_lock_timeout_thread_event);
-#endif
-	goto loop;
-
-exit_func:
-	srv_lock_timeout_active = FALSE;
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-
-	os_thread_exit(NULL);
-
-	OS_THREAD_DUMMY_RETURN;
-}
-
-/*********************************************************************//**
-A thread which prints warnings about semaphore waits which have lasted
-too long. These can be used to track bugs which cause hangs.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_error_monitor_thread(
-/*=====================*/
-	void*	arg __attribute__((unused)))
-			/*!< in: a dummy parameter required by
-			os_thread_create */
-{
-	/* number of successive fatal timeouts observed */
-	ulint		fatal_cnt	= 0;
-	ib_uint64_t	old_lsn;
-	ib_uint64_t	new_lsn;
-	ib_int64_t	sig_count;
-	/* longest waiting thread for a semaphore */
-	os_thread_id_t	waiter		= os_thread_get_curr_id();
-	os_thread_id_t	old_waiter	= waiter;
-	/* the semaphore that is being waited for */
-	const void*	sema		= NULL;
-	const void*	old_sema	= NULL;
-
-	old_lsn = srv_start_lsn;
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "Error monitor thread starts, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_error_monitor_thread_key);
-#endif
-
-loop:
-	srv_error_monitor_active = TRUE;
-
-	/* Try to track a strange bug reported by Harald Fuchs and others,
-	where the lsn seems to decrease at times */
-
-	new_lsn = log_get_lsn();
-
-	if (new_lsn < old_lsn) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: Error: old log sequence number %llu"
-			" was greater\n"
-			"InnoDB: than the new log sequence number %llu!\n"
-			"InnoDB: Please submit a bug report"
-			" to http://bugs.mysql.com\n",
-			old_lsn, new_lsn);
-		ut_ad(0);
-	}
-
-	old_lsn = new_lsn;
-
-	if (difftime(time(NULL), srv_last_monitor_time) > 60) {
-		/* We referesh InnoDB Monitor values so that averages are
-		printed from at most 60 last seconds */
-
-		srv_refresh_innodb_monitor_stats();
-	}
-
-	/* Update the statistics collected for deciding LRU
-	eviction policy. */
-	buf_LRU_stat_update();
-
-	/* Update the statistics collected for flush rate policy. */
-	buf_flush_stat_update();
-
-	/* In case mutex_exit is not a memory barrier, it is
-	theoretically possible some threads are left waiting though
-	the semaphore is already released. Wake up those threads: */
-
-	sync_arr_wake_threads_if_sema_free();
-
-	if (sync_array_print_long_waits(&waiter, &sema)
-	    && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
-		fatal_cnt++;
-		if (fatal_cnt > 10) {
-
-			fprintf(stderr,
-				"InnoDB: Error: semaphore wait has lasted"
-				" > %lu seconds\n"
-				"InnoDB: We intentionally crash the server,"
-				" because it appears to be hung.\n",
-				(ulong) srv_fatal_semaphore_wait_threshold);
-
-			ut_error;
-		}
-	} else {
-		fatal_cnt = 0;
-		old_waiter = waiter;
-		old_sema = sema;
-	}
-
-	/* Flush stderr so that a database user gets the output
-	to possible MySQL error file */
-
-	fflush(stderr);
-
-	sig_count = os_event_reset(srv_error_event);
-
-	os_event_wait_time_low(srv_error_event, 1000000, sig_count);
-
-	if (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP) {
-
-		goto loop;
-	}
-
-	srv_error_monitor_active = FALSE;
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-
-	os_thread_exit(NULL);
-
-	OS_THREAD_DUMMY_RETURN;
-}
-
-/**********************************************************************//**
-Check whether any background thread is active. If so return the thread
-type
-@return ULINT_UNDEFINED if all are suspended or have exited, thread
-type if any are still active. */
-UNIV_INTERN
-ulint
-srv_get_active_thread_type(void)
-/*============================*/
-{
-	ulint	i;
-	ibool	ret = ULINT_UNDEFINED;
-
-	mutex_enter(&kernel_mutex);
-
-	for (i = 0; i <= SRV_MASTER; ++i) {
-		if (srv_n_threads_active[i] != 0) {
-			ret = i;
-			break;
-		}
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	return(ret);
-}
-
-/*********************************************************************//**
-This function prints progress message every 60 seconds during server
-shutdown, for any activities that master thread is pending on. */
-static
-void
-srv_shutdown_print_master_pending(
-/*==============================*/
-	ib_time_t*	last_print_time,	/*!< last time the function
-						print the message */
-	ulint		n_tables_to_drop,	/*!< number of tables to
-						be dropped */
-	ulint		n_bytes_merged,		/*!< number of change buffer
-						just merged */
-	ulint		n_pages_flushed)	/*!< number of pages flushed */
-{
-	ib_time_t	current_time;
-	double		time_elapsed;
-
-	current_time = ut_time();
-	time_elapsed = ut_difftime(current_time, *last_print_time);
-
-	if (time_elapsed > 60) {
-		*last_print_time = ut_time();
-
-		if (n_tables_to_drop) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for "
-				"%lu table(s) to be dropped\n",
-				(ulong) n_tables_to_drop);
-		}
-
-		/* Check change buffer merge, we only wait for change buffer
-		merge if it is a slow shutdown */
-		if (!srv_fast_shutdown && n_bytes_merged) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for change "
-				"buffer merge to complete\n"
-				"  InnoDB: number of bytes of change buffer "
-				"just merged:  %lu\n",
-				n_bytes_merged);
-		}
-
-		if (n_pages_flushed) {
-			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: Waiting for "
-				"%lu pages to be flushed\n",
-				(ulong) n_pages_flushed);
-		}
-        }
-}
-
-/*******************************************************************//**
-Tells the InnoDB server that there has been activity in the database
-and wakes up the master thread if it is suspended (not sleeping). Used
-in the MySQL interface. Note that there is a small chance that the master
-thread stays suspended (we do not protect our operation with the
-srv_sys_t->mutex, for performance reasons). */
-UNIV_INTERN
-void
-srv_active_wake_master_thread(void)
-/*===============================*/
-{
-	srv_activity_count++;
-
-	if (srv_n_threads_active[SRV_MASTER] == 0) {
-
-		mutex_enter(&kernel_mutex);
-
-		srv_release_threads(SRV_MASTER, 1);
-
-		mutex_exit(&kernel_mutex);
-	}
-}
-
-/*******************************************************************//**
-Tells the purge thread that there has been activity in the database
-and wakes up the purge thread if it is suspended (not sleeping).  Note
-that there is a small chance that the purge thread stays suspended
-(we do not protect our operation with the kernel mutex, for
-performace reasons). */
-UNIV_INTERN
-void
-srv_wake_purge_thread_if_not_active(void)
-/*=====================================*/
-{
-	ut_ad(!mutex_own(&kernel_mutex));
-
-	if (srv_n_purge_threads > 0
-	    && srv_n_threads_active[SRV_WORKER] == 0) {
-
-		mutex_enter(&kernel_mutex);
-
-		srv_release_threads(SRV_WORKER, 1);
-
-		mutex_exit(&kernel_mutex);
-	}
-}
-
-/*******************************************************************//**
-Wakes up the master thread if it is suspended or being suspended. */
-UNIV_INTERN
-void
-srv_wake_master_thread(void)
-/*========================*/
-{
-	srv_activity_count++;
-
-	mutex_enter(&kernel_mutex);
-
-	srv_release_threads(SRV_MASTER, 1);
-
-	mutex_exit(&kernel_mutex);
-}
-
-/*******************************************************************//**
-Wakes up the purge thread if it's not already awake. */
-UNIV_INTERN
-void
-srv_wake_purge_thread(void)
-/*=======================*/
-{
-	ut_ad(!mutex_own(&kernel_mutex));
-
-	if (srv_n_purge_threads > 0) {
-
-		mutex_enter(&kernel_mutex);
-
-		srv_release_threads(SRV_WORKER, 1);
-
-		mutex_exit(&kernel_mutex);
-	}
-}
-
-/**********************************************************************
-The master thread is tasked to ensure that flush of log file happens
-once every second in the background. This is to ensure that not more
-than one second of trxs are lost in case of crash when
-innodb_flush_logs_at_trx_commit != 1 */
-static
-void
-srv_sync_log_buffer_in_background(void)
-/*===================================*/
-{
-	time_t	current_time = time(NULL);
-
-	srv_main_thread_op_info = "flushing log";
-	if (difftime(current_time, srv_last_log_flush_time) >= 1) {
-		log_buffer_sync_in_background(TRUE);
-		srv_last_log_flush_time = current_time;
-		srv_log_writes_and_flush++;
-	}
-}
-
-/********************************************************************//**
-Do a full purge, reconfigure the purge sub-system if a dynamic
-change is detected. */
-static
-void
-srv_master_do_purge(void)
-/*=====================*/
-{
-	ulint	n_pages_purged;
-
-	ut_ad(!mutex_own(&kernel_mutex));
-
-	ut_a(srv_n_purge_threads == 0);
-
-	do {
-		/* Check for shutdown and change in purge config. */
-		if (srv_fast_shutdown && srv_shutdown_state > 0) {
-			/* Nothing to purge. */
-			n_pages_purged = 0;
-		} else {
-			n_pages_purged = trx_purge(srv_purge_batch_size);
-		}
-
-		srv_sync_log_buffer_in_background();
-
-	} while (n_pages_purged > 0);
-}
-
-/*********************************************************************//**
-The master thread controlling the server.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_master_thread(
-/*==============*/
-	void*	arg __attribute__((unused)))
-			/*!< in: a dummy parameter required by
-			os_thread_create */
-{
-	buf_pool_stat_t buf_stat;
-	srv_slot_t*	slot;
-	ulint		old_activity_count;
-	ulint		n_pages_purged	= 0;
-	ulint		n_bytes_merged;
-	ulint		n_pages_flushed;
-	ulint		n_bytes_archived;
-	ulint		n_tables_to_drop;
-	ulint		n_ios;
-	ulint		n_ios_old;
-	ulint		n_ios_very_old;
-	ulint		n_pend_ios;
-	ulint		next_itr_time;
-	ulint		i;
-	ib_time_t	last_print_time;
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "Master thread starts, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_master_thread_key);
-#endif
-
-	srv_main_thread_process_no = os_proc_get_number();
-	srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
-
-	mutex_enter(&kernel_mutex);
-
-	slot = srv_table_reserve_slot(SRV_MASTER);
-
-	srv_n_threads_active[SRV_MASTER]++;
-
-	mutex_exit(&kernel_mutex);
-
-	last_print_time = ut_time();
-loop:
-	/*****************************************************************/
-	/* ---- When there is database activity by users, we cycle in this
-	loop */
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	buf_get_total_stat(&buf_stat);
-	n_ios_very_old = log_sys->n_log_ios + buf_stat.n_pages_read
-		+ buf_stat.n_pages_written;
-	mutex_enter(&kernel_mutex);
-
-	/* Store the user activity counter at the start of this loop */
-	old_activity_count = srv_activity_count;
-
-	mutex_exit(&kernel_mutex);
-
-	if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
-
-		goto suspend_thread;
-	}
-
-	/* ---- We run the following loop approximately once per second
-	when there is database activity */
-
-	srv_last_log_flush_time = time(NULL);
-
-	/* Sleep for 1 second on entrying the for loop below the first time. */
-	next_itr_time = ut_time_ms() + 1000;
-
-	for (i = 0; i < 10; i++) {
-		ulint	cur_time = ut_time_ms();
-
-		/* ALTER TABLE in MySQL requires on Unix that the table handler
-		can drop tables lazily after there no longer are SELECT
-		queries to them. */
-
-		srv_main_thread_op_info = "doing background drop tables";
-
-		row_drop_tables_for_mysql_in_background();
-
-		srv_main_thread_op_info = "";
-
-		if (srv_fast_shutdown && srv_shutdown_state > 0) {
-
-			goto background_loop;
-		}
-
-		buf_get_total_stat(&buf_stat);
-
-		n_ios_old = log_sys->n_log_ios + buf_stat.n_pages_read
-			+ buf_stat.n_pages_written;
-
-		srv_main_thread_op_info = "sleeping";
-		srv_main_1_second_loops++;
-
-		if (next_itr_time > cur_time
-		    && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-
-			/* Get sleep interval in micro seconds. We use
-			ut_min() to avoid long sleep in case of
-			wrap around. */
-			os_thread_sleep(ut_min(1000000,
-					(next_itr_time - cur_time)
-					 * 1000));
-			srv_main_sleeps++;
-		}
-
-		/* Each iteration should happen at 1 second interval. */
-		next_itr_time = ut_time_ms() + 1000;
-
-		/* Flush logs if needed */
-		srv_sync_log_buffer_in_background();
-
-		srv_main_thread_op_info = "making checkpoint";
-		log_free_check();
-
-		/* If i/os during one second sleep were less than 5% of
-		capacity, we assume that there is free disk i/o capacity
-		available, and it makes sense to do an insert buffer merge. */
-
-		buf_get_total_stat(&buf_stat);
-		n_pend_ios = buf_get_n_pending_ios()
-			+ log_sys->n_pending_writes;
-		n_ios = log_sys->n_log_ios + buf_stat.n_pages_read
-			+ buf_stat.n_pages_written;
-		if (n_pend_ios < SRV_PEND_IO_THRESHOLD
-		    && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) {
-			srv_main_thread_op_info = "doing insert buffer merge";
-			ibuf_contract_for_n_pages(FALSE, PCT_IO(5));
-
-			/* Flush logs if needed */
-			srv_sync_log_buffer_in_background();
-		}
-
-		if (UNIV_UNLIKELY(buf_get_modified_ratio_pct()
-				  > srv_max_buf_pool_modified_pct)) {
-
-			/* Try to keep the number of modified pages in the
-			buffer pool under the limit wished by the user */
-
-			srv_main_thread_op_info =
-				"flushing buffer pool pages";
-			n_pages_flushed = buf_flush_list(
-				PCT_IO(100), IB_ULONGLONG_MAX);
-
-		} else if (srv_adaptive_flushing) {
-
-			/* Try to keep the rate of flushing of dirty
-			pages such that redo log generation does not
-			produce bursts of IO at checkpoint time. */
-			ulint n_flush = buf_flush_get_desired_flush_rate();
-
-			if (n_flush) {
-				srv_main_thread_op_info =
-					"flushing buffer pool pages";
-				n_flush = ut_min(PCT_IO(100), n_flush);
-				n_pages_flushed =
-					buf_flush_list(
-						n_flush,
-						IB_ULONGLONG_MAX);
-			}
-		}
-
-		if (srv_activity_count == old_activity_count) {
-
-			/* There is no user activity at the moment, go to
-			the background loop */
-
-			goto background_loop;
-		}
-	}
-
-	/* ---- We perform the following code approximately once per
-	10 seconds when there is database activity */
-
-#ifdef MEM_PERIODIC_CHECK
-	/* Check magic numbers of every allocated mem block once in 10
-	seconds */
-	mem_validate_all_blocks();
-#endif
-	/* If i/os during the 10 second period were less than 200% of
-	capacity, we assume that there is free disk i/o capacity
-	available, and it makes sense to flush srv_io_capacity pages.
-
-	Note that this is done regardless of the fraction of dirty
-	pages relative to the max requested by the user. The one second
-	loop above requests writes for that case. The writes done here
-	are not required, and may be disabled. */
-
-	buf_get_total_stat(&buf_stat);
-	n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
-	n_ios = log_sys->n_log_ios + buf_stat.n_pages_read
-		+ buf_stat.n_pages_written;
-
-	srv_main_10_second_loops++;
-	if (n_pend_ios < SRV_PEND_IO_THRESHOLD
-	    && (n_ios - n_ios_very_old < SRV_PAST_IO_ACTIVITY)) {
-
-		srv_main_thread_op_info = "flushing buffer pool pages";
-		buf_flush_list(PCT_IO(100), IB_ULONGLONG_MAX);
-
-		/* Flush logs if needed */
-		srv_sync_log_buffer_in_background();
-	}
-
-	/* We run a batch of insert buffer merge every 10 seconds,
-	even if the server were active */
-
-	srv_main_thread_op_info = "doing insert buffer merge";
-	ibuf_contract_for_n_pages(FALSE, PCT_IO(5));
-
-	/* Flush logs if needed */
-	srv_sync_log_buffer_in_background();
-
-	if (srv_n_purge_threads == 0) {
-		srv_main_thread_op_info = "master purging";
-
-		srv_master_do_purge();
-
-		if (srv_fast_shutdown && srv_shutdown_state > 0) {
-
-			goto background_loop;
-		}
-	}
-
-	srv_main_thread_op_info = "flushing buffer pool pages";
-
-	/* Flush a few oldest pages to make a new checkpoint younger */
-
-	if (buf_get_modified_ratio_pct() > 70) {
-
-		/* If there are lots of modified pages in the buffer pool
-		(> 70 %), we assume we can afford reserving the disk(s) for
-		the time it requires to flush 100 pages */
-
-		n_pages_flushed = buf_flush_list(
-			PCT_IO(100), IB_ULONGLONG_MAX);
-	} else {
-		/* Otherwise, we only flush a small number of pages so that
-		we do not unnecessarily use much disk i/o capacity from
-		other work */
-
-		n_pages_flushed = buf_flush_list(
-			  PCT_IO(10), IB_ULONGLONG_MAX);
-	}
-
-	srv_main_thread_op_info = "making checkpoint";
-
-	/* Make a new checkpoint about once in 10 seconds */
-
-	log_checkpoint(TRUE, FALSE);
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	mutex_enter(&kernel_mutex);
-
-	/* ---- When there is database activity, we jump from here back to
-	the start of loop */
-
-	if (srv_activity_count != old_activity_count) {
-		mutex_exit(&kernel_mutex);
-		goto loop;
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	/* If the database is quiet, we enter the background loop */
-
-	/*****************************************************************/
-background_loop:
-	/* ---- In this loop we run background operations when the server
-	is quiet from user activity. Also in the case of a shutdown, we
-	loop here, flushing the buffer pool to the data files. */
-
-	/* The server has been quiet for a while: start running background
-	operations */
-	srv_main_background_loops++;
-	srv_main_thread_op_info = "doing background drop tables";
-
-	n_tables_to_drop = row_drop_tables_for_mysql_in_background();
-
-	if (n_tables_to_drop > 0) {
-		/* Do not monopolize the CPU even if there are tables waiting
-		in the background drop queue. (It is essentially a bug if
-		MySQL tries to drop a table while there are still open handles
-		to it and we had to put it to the background drop queue.) */
-
-		if (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-			os_thread_sleep(100000);
-		}
-	}
-
-	if (srv_n_purge_threads == 0) {
-		srv_main_thread_op_info = "master purging";
-
-		srv_master_do_purge();
-	}
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	mutex_enter(&kernel_mutex);
-	if (srv_activity_count != old_activity_count) {
-		mutex_exit(&kernel_mutex);
-		goto loop;
-	}
-	mutex_exit(&kernel_mutex);
-
-	srv_main_thread_op_info = "doing insert buffer merge";
-
-	if (srv_fast_shutdown && srv_shutdown_state > 0) {
-		n_bytes_merged = 0;
-	} else {
-		/* This should do an amount of IO similar to the number of
-		dirty pages that will be flushed in the call to
-		buf_flush_list below. Otherwise, the system favors
-		clean pages over cleanup throughput. */
-		n_bytes_merged = ibuf_contract_for_n_pages(FALSE,
-							   PCT_IO(100));
-	}
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	mutex_enter(&kernel_mutex);
-	if (srv_activity_count != old_activity_count) {
-		mutex_exit(&kernel_mutex);
-		goto loop;
-	}
-	mutex_exit(&kernel_mutex);
-
-flush_loop:
-	srv_main_thread_op_info = "flushing buffer pool pages";
-	srv_main_flush_loops++;
-	if (srv_fast_shutdown < 2) {
-		n_pages_flushed = buf_flush_list(
-			  PCT_IO(100), IB_ULONGLONG_MAX);
-	} else {
-		/* In the fastest shutdown we do not flush the buffer pool
-		to data files: we set n_pages_flushed to 0 artificially. */
-
-		n_pages_flushed = 0;
-	}
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	mutex_enter(&kernel_mutex);
-	if (srv_activity_count != old_activity_count) {
-		mutex_exit(&kernel_mutex);
-		goto loop;
-	}
-	mutex_exit(&kernel_mutex);
-
-	srv_main_thread_op_info = "waiting for buffer pool flush to end";
-	buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
-
-	/* Flush logs if needed */
-	srv_sync_log_buffer_in_background();
-
-	srv_main_thread_op_info = "making checkpoint";
-
-	log_checkpoint(TRUE, FALSE);
-
-	if (buf_get_modified_ratio_pct() > srv_max_buf_pool_modified_pct) {
-
-		/* Try to keep the number of modified pages in the
-		buffer pool under the limit wished by the user */
-
-		goto flush_loop;
-	}
-
-	srv_main_thread_op_info = "reserving kernel mutex";
-
-	mutex_enter(&kernel_mutex);
-	if (srv_activity_count != old_activity_count) {
-		mutex_exit(&kernel_mutex);
-		goto loop;
-	}
-	mutex_exit(&kernel_mutex);
-	/*
-	srv_main_thread_op_info = "archiving log (if log archive is on)";
-
-	log_archive_do(FALSE, &n_bytes_archived);
-	*/
-	n_bytes_archived = 0;
-
-	/* Print progress message every 60 seconds during shutdown */
-	if (srv_shutdown_state > 0 && srv_print_verbose_log) {
-		srv_shutdown_print_master_pending(&last_print_time,
-						  n_tables_to_drop,
-						  n_bytes_merged,
-						  n_pages_flushed);
-	}
-
-	/* Keep looping in the background loop if still work to do */
-
-	if (srv_fast_shutdown && srv_shutdown_state > 0) {
-		if (n_tables_to_drop + n_pages_flushed
-		    + n_bytes_archived != 0) {
-
-			/* If we are doing a fast shutdown (= the default)
-			we do not do purge or insert buffer merge. But we
-			flush the buffer pool completely to disk.
-			In a 'very fast' shutdown we do not flush the buffer
-			pool to data files: we have set n_pages_flushed to
-			0 artificially. */
-
-			goto background_loop;
-		}
-	} else if (n_tables_to_drop
-		   + n_pages_purged + n_bytes_merged + n_pages_flushed
-		   + n_bytes_archived != 0) {
-
-		/* In a 'slow' shutdown we run purge and the insert buffer
-		merge to completion */
-
-		goto background_loop;
-	}
-
-	/* There is no work for background operations either: suspend
-	master thread to wait for more server activity */
-
-suspend_thread:
-	srv_main_thread_op_info = "suspending";
-
-	mutex_enter(&kernel_mutex);
-
-	if (row_get_background_drop_list_len_low() > 0) {
-		mutex_exit(&kernel_mutex);
-
-		goto loop;
-	}
-
-	srv_suspend_thread(slot);
-
-	mutex_exit(&kernel_mutex);
-
-	/* DO NOT CHANGE THIS STRING. innobase_start_or_create_for_mysql()
-	waits for database activity to die down when converting < 4.1.x
-	databases, and relies on this string being exactly as it is. InnoDB
-	manual also mentions this string in several places. */
-	srv_main_thread_op_info = "waiting for server activity";
-
-	os_event_wait(slot->event);
-
-	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
-		os_thread_exit(NULL);
-	}
-
-	/* When there is user activity, InnoDB will set the event and the
-	main thread goes back to loop. */
-
-	goto loop;
-}
-
-/*********************************************************************//**
-Asynchronous purge thread.
-@return	a dummy parameter */
-UNIV_INTERN
-os_thread_ret_t
-srv_purge_thread(
-/*=============*/
-	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
-						required by os_thread_create */
-{
-	srv_slot_t*	slot;
-	ulint		retries = 0;
-	ulint		n_total_purged = ULINT_UNDEFINED;
-
-	ut_a(srv_n_purge_threads == 1);
-
-#ifdef UNIV_PFS_THREAD
-	pfs_register_thread(srv_purge_thread_key);
-#endif /* UNIV_PFS_THREAD */
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "InnoDB: Purge thread running, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-
-	mutex_enter(&kernel_mutex);
-
-	slot = srv_table_reserve_slot(SRV_WORKER);
-
-	++srv_n_threads_active[SRV_WORKER];
-
-	mutex_exit(&kernel_mutex);
-
-	while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
-
-		ulint	n_pages_purged = 0;
-
-		/* If there are very few records to purge or the last
-		purge didn't purge any records then wait for activity.
-	        We peek at the history len without holding any mutex
-		because in the worst case we will end up waiting for
-		the next purge event. */
-		if (trx_sys->rseg_history_len < srv_purge_batch_size
-		    || (n_total_purged == 0
-			&& retries >= TRX_SYS_N_RSEGS)) {
-
-			mutex_enter(&kernel_mutex);
-
-			srv_suspend_thread(slot);
-
-			mutex_exit(&kernel_mutex);
-
-			os_event_wait(slot->event);
-
-			retries = 0;
-		}
-
-		/* Check for shutdown and whether we should do purge at all. */
-		if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND
-		    || srv_shutdown_state != 0
-		    || srv_fast_shutdown) {
-
-			break;
-		}
-
-		if (n_total_purged == 0 && retries <= TRX_SYS_N_RSEGS) {
-			++retries;
-		} else if (n_total_purged > 0) {
-			retries = 0;
-			n_total_purged = 0;
-		}
-
-		/* Purge until there are no more records to purge and there is
-		no change in configuration or server state. */
-		do {
-			n_pages_purged = trx_purge(srv_purge_batch_size);
-
-			n_total_purged += n_pages_purged;
-
-		} while (n_pages_purged > 0 && !srv_fast_shutdown);
-
-		srv_sync_log_buffer_in_background();
-	}
-
-	mutex_enter(&kernel_mutex);
-
-	/* Decrement the active count. */
-	srv_suspend_thread(slot);
-
-	slot->in_use = FALSE;
-
-	mutex_exit(&kernel_mutex);
-
-#ifdef UNIV_DEBUG_THREAD_CREATION
-	fprintf(stderr, "InnoDB: Purge thread exiting, id %lu\n",
-		os_thread_pf(os_thread_get_curr_id()));
-#endif /* UNIV_DEBUG_THREAD_CREATION */
-
-	/* We count the number of threads in os_thread_exit(). A created
-	thread should always use that to exit and not use return() to exit. */
-	os_thread_exit(NULL);
-
-	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
-}
-
-/**********************************************************************//**
-Enqueues a task to server task queue and releases a worker thread, if there
-is a suspended one. */
-UNIV_INTERN
-void
-srv_que_task_enqueue_low(
-/*=====================*/
-	que_thr_t*	thr)	/*!< in: query thread */
-{
-	ut_ad(thr);
-
-	mutex_enter(&kernel_mutex);
-
-	UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr);
-
-	srv_release_threads(SRV_WORKER, 1);
-
-	mutex_exit(&kernel_mutex);
-}
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
new file mode 100644
index 00000000000..e64cc006f02
--- /dev/null
+++ b/storage/innobase/srv/srv0srv.cc
@@ -0,0 +1,2738 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0srv.cc
+The database server main program
+
+Created 10/8/1995 Heikki Tuuri
+*******************************************************/
+
+/* Dummy comment */
+#include "srv0srv.h"
+
+#include "ut0mem.h"
+#include "ut0ut.h"
+#include "os0proc.h"
+#include "mem0mem.h"
+#include "mem0pool.h"
+#include "sync0sync.h"
+#include "que0que.h"
+#include "log0recv.h"
+#include "pars0pars.h"
+#include "usr0sess.h"
+#include "lock0lock.h"
+#include "trx0purge.h"
+#include "ibuf0ibuf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "btr0sea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "srv0start.h"
+#include "row0mysql.h"
+#include "ha_prototypes.h"
+#include "trx0i_s.h"
+#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+#include "srv0mon.h"
+#include "ut0crc32.h"
+
+#include "mysql/plugin.h"
+#include "mysql/service_thd_wait.h"
+
+/* The following counter is incremented whenever there is some user activity
+in the server */
+UNIV_INTERN ulint	srv_activity_count	= 0;
+
+/* The following is the maximum allowed duration of a lock wait. */
+UNIV_INTERN ulint	srv_fatal_semaphore_wait_threshold = 600;
+
+/* How much data manipulation language (DML) statements need to be delayed,
+in microseconds, in order to reduce the lagging of the purge thread. */
+UNIV_INTERN ulint	srv_dml_needed_delay = 0;
+
+UNIV_INTERN ibool	srv_monitor_active = FALSE;
+UNIV_INTERN ibool	srv_error_monitor_active = FALSE;
+
+UNIV_INTERN ibool	srv_buf_dump_thread_active = FALSE;
+
+UNIV_INTERN const char*	srv_main_thread_op_info = "";
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+const char		srv_mysql50_table_name_prefix[10] = "#mysql50#";
+
+/* Server parameters which are read from the initfile */
+
+/* The following three are dir paths which are catenated before file
+names, where the file name itself may also contain a path */
+
+UNIV_INTERN char*	srv_data_home	= NULL;
+
+/** Rollback files directory, can be absolute. */
+UNIV_INTERN char*	srv_undo_dir = NULL;
+
+/** The number of tablespaces to use for rollback segments. */
+UNIV_INTERN ulong	srv_undo_tablespaces = 8;
+
+/* The number of rollback segments to use */
+UNIV_INTERN ulong	srv_undo_logs = 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN char*	srv_arch_dir	= NULL;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** store to its own file each table created by an user; data
+dictionary tables are in the system tablespace 0 */
+UNIV_INTERN my_bool	srv_file_per_table;
+/** The file format to use on new *.ibd files. */
+UNIV_INTERN ulint	srv_file_format = 0;
+/** Whether to check file format during startup.  A value of
+UNIV_FORMAT_MAX + 1 means no checking ie. FALSE.  The default is to
+set it to the highest format we support. */
+UNIV_INTERN ulint	srv_max_file_format_at_startup = UNIV_FORMAT_MAX;
+
+#if UNIV_FORMAT_A
+# error "UNIV_FORMAT_A must be 0!"
+#endif
+
+/** Place locks to records only i.e. do not use next-key locking except
+on duplicate key checking and foreign key checking */
+UNIV_INTERN ibool	srv_locks_unsafe_for_binlog = FALSE;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio we build below with threads.
+Currently we support native aio on windows and linux */
+UNIV_INTERN my_bool	srv_use_native_aio = TRUE;
+
+#ifdef __WIN__
+/* Windows native condition variables. We use runtime loading / function
+pointers, because they are not available on Windows Server 2003 and
+Windows XP/2000.
+
+We use condition for events on Windows if possible, even if os_event
+resembles Windows kernel event object well API-wise. The reason is
+performance, kernel objects are heavyweights and WaitForSingleObject() is a
+performance killer causing calling thread to context switch. Besides, Innodb
+is preallocating large number (often millions) of os_events. With kernel event
+objects it takes a big chunk out of non-paged pool, which is better suited
+for tasks like IO than for storing idle event objects. */
+UNIV_INTERN ibool	srv_use_native_conditions = FALSE;
+#endif /* __WIN__ */
+
+UNIV_INTERN ulint	srv_n_data_files = 0;
+UNIV_INTERN char**	srv_data_file_names = NULL;
+/* size in database pages */
+UNIV_INTERN ulint*	srv_data_file_sizes = NULL;
+
+/* if TRUE, then we auto-extend the last data file */
+UNIV_INTERN ibool	srv_auto_extend_last_data_file	= FALSE;
+/* if != 0, this tells the max size auto-extending may increase the
+last data file size */
+UNIV_INTERN ulint	srv_last_file_size_max	= 0;
+/* If the last data file is auto-extended, we add this
+many pages to it at a time */
+UNIV_INTERN ulong	srv_auto_extend_increment = 8;
+UNIV_INTERN ulint*	srv_data_file_is_raw_partition = NULL;
+
+/* If the following is TRUE we do not allow inserts etc. This protects
+the user from forgetting the 'newraw' keyword to my.cnf */
+
+UNIV_INTERN ibool	srv_created_new_raw	= FALSE;
+
+UNIV_INTERN char**	srv_log_group_home_dirs = NULL;
+
+UNIV_INTERN ulint	srv_n_log_groups	= ULINT_MAX;
+UNIV_INTERN ulint	srv_n_log_files		= ULINT_MAX;
+/* size in database pages */
+UNIV_INTERN ib_uint64_t	srv_log_file_size	= IB_UINT64_MAX;
+/* size in database pages */
+UNIV_INTERN ulint	srv_log_buffer_size	= ULINT_MAX;
+UNIV_INTERN ulong	srv_flush_log_at_trx_commit = 1;
+UNIV_INTERN ulong	srv_page_size		= UNIV_PAGE_SIZE_DEF;
+UNIV_INTERN ulong	srv_page_size_shift	= UNIV_PAGE_SIZE_SHIFT_DEF;
+
+/* Try to flush dirty pages so as to avoid IO bursts at
+the checkpoints. */
+UNIV_INTERN char	srv_adaptive_flushing	= TRUE;
+
+/** Maximum number of times allowed to conditionally acquire
+mutex before switching to blocking wait on the mutex */
+#define MAX_MUTEX_NOWAIT	20
+
+/** Check whether the number of failed nonblocking mutex
+acquisition attempts exceeds maximum allowed value. If so,
+srv_printf_innodb_monitor() will request mutex acquisition
+with mutex_enter(), which will wait until it gets the mutex. */
+#define MUTEX_NOWAIT(mutex_skipped)	((mutex_skipped) < MAX_MUTEX_NOWAIT)
+
+/** The sort order table of the MySQL latin1_swedish_ci character set
+collation */
+UNIV_INTERN const byte*	srv_latin1_ordering;
+
+/* use os/external memory allocator */
+UNIV_INTERN my_bool	srv_use_sys_malloc	= TRUE;
+/* requested size in kilobytes */
+UNIV_INTERN ulint	srv_buf_pool_size	= ULINT_MAX;
+/* requested number of buffer pool instances */
+UNIV_INTERN ulint       srv_buf_pool_instances  = 1;
+/* number of locks to protect buf_pool->page_hash */
+UNIV_INTERN ulong	srv_n_page_hash_locks = 16;
+/** Scan depth for LRU flush batch i.e.: number of blocks scanned*/
+UNIV_INTERN ulong	srv_LRU_scan_depth	= 1024;
+/** whether or not to flush neighbors of a block */
+UNIV_INTERN my_bool	srv_flush_neighbors	= TRUE;
+/* previously requested size */
+UNIV_INTERN ulint	srv_buf_pool_old_size;
+/* current size in kilobytes */
+UNIV_INTERN ulint	srv_buf_pool_curr_size	= 0;
+/* size in bytes */
+UNIV_INTERN ulint	srv_mem_pool_size	= ULINT_MAX;
+UNIV_INTERN ulint	srv_lock_table_size	= ULINT_MAX;
+
+/* This parameter is deprecated. Use srv_n_io_[read|write]_threads
+instead. */
+UNIV_INTERN ulint	srv_n_file_io_threads	= ULINT_MAX;
+UNIV_INTERN ulint	srv_n_read_io_threads	= ULINT_MAX;
+UNIV_INTERN ulint	srv_n_write_io_threads	= ULINT_MAX;
+
+/* Switch to enable random read ahead. */
+UNIV_INTERN my_bool	srv_random_read_ahead	= FALSE;
+/* User settable value of the number of pages that must be present
+in the buffer cache and accessed sequentially for InnoDB to trigger a
+readahead request. */
+UNIV_INTERN ulong	srv_read_ahead_threshold	= 56;
+
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN ibool		srv_log_archive_on	= FALSE;
+UNIV_INTERN ibool		srv_archive_recovery	= 0;
+UNIV_INTERN ib_uint64_t	srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/* This parameter is used to throttle the number of insert buffers that are
+merged in a batch. By increasing this parameter on a faster disk you can
+possibly reduce the number of I/O operations performed to complete the
+merge operation. The value of this parameter is used as is by the
+background loop when the system is idle (low load), on a busy system
+the parameter is scaled down by a factor of 4, this is to avoid putting
+a heavier load on the I/O sub system. */
+
+UNIV_INTERN ulong	srv_insert_buffer_batch_size = 20;
+
+UNIV_INTERN char*	srv_file_flush_method_str = NULL;
+UNIV_INTERN ulint	srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+UNIV_INTERN ulint	srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+
+UNIV_INTERN ulint	srv_max_n_open_files	  = 300;
+
+/* Number of IO operations per second the server can do */
+UNIV_INTERN ulong	srv_io_capacity         = 400;
+
+/* The InnoDB main thread tries to keep the ratio of modified pages
+in the buffer pool to all database pages in the buffer pool smaller than
+the following number. But it is not guaranteed that the value stays below
+that during a time of heavy update/insert activity. */
+
+UNIV_INTERN ulong	srv_max_buf_pool_modified_pct	= 75;
+
+/* The number of purge threads to use.*/
+UNIV_INTERN ulong srv_n_purge_threads = 1;
+
+/* the number of pages to purge in one batch */
+UNIV_INTERN ulong srv_purge_batch_size = 20;
+
+/* variable counts amount of data read in total (in bytes) */
+UNIV_INTERN ulint srv_data_read = 0;
+
+/* Internal setting for "innodb_stats_method". Decides how InnoDB treats
+NULL value when collecting statistics. By default, it is set to
+SRV_STATS_NULLS_EQUAL(0), ie. all NULL value are treated equal */
+ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL;
+
+/* here we count the amount of data written in total (in bytes) */
+UNIV_INTERN ulint srv_data_written = 0;
+
+/* the number of the log write requests done */
+UNIV_INTERN ulint srv_log_write_requests = 0;
+
+/* the number of physical writes to the log performed */
+UNIV_INTERN ulint srv_log_writes = 0;
+
+/* amount of data written to the log files in bytes */
+UNIV_INTERN lsn_t srv_os_log_written = 0;
+
+/* amount of writes being done to the log files */
+UNIV_INTERN ulint srv_os_log_pending_writes = 0;
+
+/* we increase this counter, when there we don't have enough space in the
+log buffer and have to flush it */
+UNIV_INTERN ulint srv_log_waits = 0;
+
+/* this variable counts the amount of times, when the doublewrite buffer
+was flushed */
+UNIV_INTERN ulint srv_dblwr_writes = 0;
+
+/* here we store the number of pages that have been flushed to the
+doublewrite buffer */
+UNIV_INTERN ulint srv_dblwr_pages_written = 0;
+
+/* in this variable we store the number of write requests issued */
+UNIV_INTERN ulint srv_buf_pool_write_requests = 0;
+
+/* here we store the number of times when we had to wait for a free page
+in the buffer pool. It happens when the buffer pool is full and we need
+to make a flush, in order to be able to read or create a page. */
+UNIV_INTERN ulint srv_buf_pool_wait_free = 0;
+
+/* variable to count the number of pages that were written from buffer
+pool to the disk */
+UNIV_INTERN ulint srv_buf_pool_flushed = 0;
+
+/** Number of buffer pool reads that led to the
+reading of a disk page */
+UNIV_INTERN ulint srv_buf_pool_reads = 0;
+
+/* structure to pass status variables to MySQL */
+UNIV_INTERN export_struc export_vars;
+
+/* If the following is != 0 we do not allow inserts etc. This protects
+the user from forgetting the innodb_force_recovery keyword to my.cnf */
+
+UNIV_INTERN ulint	srv_force_recovery	= 0;
+
+/** Print all user-level transactions deadlocks to mysqld stderr */
+
+UNIV_INTERN my_bool	srv_print_all_deadlocks = FALSE;
+
+/* If the following is set to 1 then we do not run purge and insert buffer
+merge to completion before shutdown. If it is set to 2, do not even flush the
+buffer pool to data files at the shutdown: we effectively 'crash'
+InnoDB (but lose no committed transactions). */
+UNIV_INTERN ulint	srv_fast_shutdown	= 0;
+
+/* Generate a innodb_status.<pid> file */
+UNIV_INTERN ibool	srv_innodb_status	= FALSE;
+
+/* When estimating number of different key values in an index, sample
+this many index pages, there are 2 ways to calculate statistics:
+* persistent stats that are calculated by ANALYZE TABLE and saved
+  in the innodb database.
+* quick transient stats, that are used if persistent stats for the given
+  table/index are not found in the innodb database */
+UNIV_INTERN unsigned long long	srv_stats_transient_sample_pages = 8;
+UNIV_INTERN unsigned long long	srv_stats_persistent_sample_pages = 20;
+
+UNIV_INTERN ibool	srv_use_doublewrite_buf	= TRUE;
+
+/** doublewrite buffer is 1MB is size i.e.: it can hold 128 16K pages.
+The following parameter is the size of the buffer that is used for
+batch flushing i.e.: LRU flushing and flush_list flushing. The rest
+of the pages are used for single page flushing. */
+UNIV_INTERN ulong	srv_doublewrite_batch_size	= 120;
+
+UNIV_INTERN ulong	srv_replication_delay		= 0;
+
+/*-------------------------------------------*/
+UNIV_INTERN ulong	srv_n_spin_wait_rounds	= 30;
+UNIV_INTERN ulong	srv_spin_wait_delay	= 6;
+UNIV_INTERN ibool	srv_priority_boost	= TRUE;
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool	srv_print_thread_releases	= FALSE;
+UNIV_INTERN ibool	srv_print_lock_waits		= FALSE;
+UNIV_INTERN ibool	srv_print_buf_io		= FALSE;
+UNIV_INTERN ibool	srv_print_log_io		= FALSE;
+UNIV_INTERN ibool	srv_print_latch_waits		= FALSE;
+#endif /* UNIV_DEBUG */
+
+UNIV_INTERN ulint	srv_n_rows_inserted		= 0;
+UNIV_INTERN ulint	srv_n_rows_updated		= 0;
+UNIV_INTERN ulint	srv_n_rows_deleted		= 0;
+UNIV_INTERN ulint	srv_n_rows_read			= 0;
+
+static ulint		srv_n_rows_inserted_old		= 0;
+static ulint		srv_n_rows_updated_old		= 0;
+static ulint		srv_n_rows_deleted_old		= 0;
+static ulint		srv_n_rows_read_old		= 0;
+
+UNIV_INTERN ulint	srv_truncated_status_writes	= 0;
+UNIV_INTERN ulint	srv_available_undo_logs         = 0;
+
+/* Set the following to 0 if you want InnoDB to write messages on
+stderr on startup/shutdown. */
+UNIV_INTERN ibool	srv_print_verbose_log		= TRUE;
+UNIV_INTERN ibool	srv_print_innodb_monitor	= FALSE;
+UNIV_INTERN ibool	srv_print_innodb_lock_monitor	= FALSE;
+UNIV_INTERN ibool	srv_print_innodb_tablespace_monitor = FALSE;
+UNIV_INTERN ibool	srv_print_innodb_table_monitor = FALSE;
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+
+UNIV_INTERN const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS];
+UNIV_INTERN const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS];
+
+UNIV_INTERN time_t	srv_last_monitor_time;
+
+UNIV_INTERN mutex_t	srv_innodb_monitor_mutex;
+
+/* Mutex for locking srv_monitor_file */
+UNIV_INTERN mutex_t	srv_monitor_file_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+# ifndef HAVE_ATOMIC_BUILTINS
+/* Key to register server_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	server_mutex_key;
+# endif /* !HAVE_ATOMIC_BUILTINS */
+/* Key to register srv_innodb_monitor_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_innodb_monitor_mutex_key;
+/* Key to register srv_monitor_file_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_monitor_file_mutex_key;
+/* Key to register srv_dict_tmpfile_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_dict_tmpfile_mutex_key;
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_misc_tmpfile_mutex_key;
+/* Key to register srv_sys_t::mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_sys_mutex_key;
+/* Key to register srv_sys_t::tasks_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	srv_sys_tasks_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/* Temporary file for innodb monitor output */
+UNIV_INTERN FILE*	srv_monitor_file;
+/* Mutex for locking srv_dict_tmpfile.
+This mutex has a very high rank; threads reserving it should not
+be holding any InnoDB latches. */
+UNIV_INTERN mutex_t	srv_dict_tmpfile_mutex;
+/* Temporary file for output from the data dictionary */
+UNIV_INTERN FILE*	srv_dict_tmpfile;
+/* Mutex for locking srv_misc_tmpfile.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+UNIV_INTERN mutex_t	srv_misc_tmpfile_mutex;
+/* Temporary file for miscellanous diagnostic output */
+UNIV_INTERN FILE*	srv_misc_tmpfile;
+
+UNIV_INTERN ulint	srv_main_thread_process_no	= 0;
+UNIV_INTERN ulint	srv_main_thread_id		= 0;
+
+/* The following count work done by srv_master_thread. */
+
+/* Iterations of the loop bounded by 'srv_active' label. */
+static ulint   srv_main_active_loops		= 0;
+/* Iterations of the loop bounded by the 'srv_idle' label. */
+static ulint   srv_main_idle_loops		= 0;
+/* Iterations of the loop bounded by the 'srv_shutdown' label. */
+static ulint   srv_main_shutdown_loops		= 0;
+/* Log writes involving flush. */
+static ulint   srv_log_writes_and_flush		= 0;
+
+/* This is only ever touched by the master thread. It records the
+time when the last flush of log file has happened. The master
+thread ensures that we flush the log files at least once per
+second. */
+static time_t	srv_last_log_flush_time;
+
+/* Interval in seconds at which various tasks are performed by the
+master thread when server is active. In order to balance the workload,
+we should try to keep intervals such that they are not multiple of
+each other. For example, if we have intervals for various tasks
+defined as 5, 10, 15, 60 then all tasks will be performed when
+current_time % 60 == 0 and no tasks will be performed when
+current_time % 5 != 0. */
+
+# define	SRV_MASTER_CHECKPOINT_INTERVAL		(7)
+# define	SRV_MASTER_PURGE_INTERVAL		(10)
+#ifdef MEM_PERIODIC_CHECK
+# define	SRV_MASTER_MEM_VALIDATE_INTERVAL	(13)
+#endif /* MEM_PERIODIC_CHECK */
+# define	SRV_MASTER_DICT_LRU_INTERVAL		(47)
+
+/** Acquire the system_mutex. */
+#define srv_sys_mutex_enter() do {			\
+	mutex_enter(&srv_sys->mutex);			\
+} while (0)
+
+/** Test if the system mutex is owned. */
+#define srv_sys_mutex_own() mutex_own(&srv_sys->mutex)
+
+/** Release the system mutex. */
+#define srv_sys_mutex_exit() do {			\
+	mutex_exit(&srv_sys->mutex);			\
+} while (0)
+
+#define fetch_lock_wait_timeout(trx)			\
+	((trx)->lock.allowed_to_wait				\
+	 ? thd_lock_wait_timeout((trx)->mysql_thd)	\
+	 : 0)
+
+/*
+	IMPLEMENTATION OF THE SERVER MAIN PROGRAM
+	=========================================
+
+There is the following analogue between this database
+server and an operating system kernel:
+
+DB concept			equivalent OS concept
+----------			---------------------
+transaction		--	process;
+
+query thread		--	thread;
+
+lock			--	semaphore;
+
+kernel			--	kernel;
+
+query thread execution:
+(a) without lock mutex
+reserved		--	process executing in user mode;
+(b) with lock mutex reserved
+			--	process executing in kernel mode;
+
+The server has several backgroind threads all running at the same
+priority as user threads. It periodically checks if here is anything
+happening in the server which requires intervention of the master
+thread. Such situations may be, for example, when flushing of dirty
+blocks is needed in the buffer pool or old version of database rows
+have to be cleaned away (purged). The user can configure a separate
+dedicated purge thread(s) too, in which case the master thread does not
+do any purging.
+
+The threads which we call user threads serve the queries of the MySQL
+server. They run at normal priority.
+
+When there is no activity in the system, also the master thread
+suspends itself to wait for an event making the server totally silent.
+
+There is still one complication in our server design. If a
+background utility thread obtains a resource (e.g., mutex) needed by a user
+thread, and there is also some other user activity in the system,
+the user thread may have to wait indefinitely long for the
+resource, as the OS does not schedule a background thread if
+there is some other runnable user thread. This problem is called
+priority inversion in real-time programming.
+
+One solution to the priority inversion problem would be to keep record
+of which thread owns which resource and in the above case boost the
+priority of the background thread so that it will be scheduled and it
+can release the resource.  This solution is called priority inheritance
+in real-time programming.  A drawback of this solution is that the overhead
+of acquiring a mutex increases slightly, maybe 0.2 microseconds on a 100
+MHz Pentium, because the thread has to call os_thread_get_curr_id.  This may
+be compared to 0.5 microsecond overhead for a mutex lock-unlock pair. Note
+that the thread cannot store the information in the resource , say mutex,
+itself, because competing threads could wipe out the information if it is
+stored before acquiring the mutex, and if it stored afterwards, the
+information is outdated for the time of one machine instruction, at least.
+(To be precise, the information could be stored to lock_word in mutex if
+the machine supports atomic swap.)
+
+The above solution with priority inheritance may become actual in the
+future, currently we do not implement any priority twiddling solution.
+Our general aim is to reduce the contention of all mutexes by making
+them more fine grained.
+
+The thread table contains information of the current status of each
+thread existing in the system, and also the event semaphores used in
+suspending the master thread and utility threads when they have nothing
+to do.  The thread table can be seen as an analogue to the process table
+in a traditional Unix implementation. */
+
+/** The server system */
+typedef struct srv_sys_struct	srv_sys_t;
+
+/** The server system struct */
+struct srv_sys_struct{
+	mutex_t		tasks_mutex;		/*!< variable protecting the
+						tasks queue */
+	UT_LIST_BASE_NODE_T(que_thr_t)
+			tasks;			/*!< task queue */
+
+	mutex_t		mutex;			/*!< variable protecting the
+
+						fields below. */
+	ulint		n_sys_threads;		/*!< size of the sys_threads
+						array */
+
+	srv_table_t*	sys_threads;		/*!< server thread table */
+
+	ulint		n_threads_active[SRV_MASTER + 1];
+						/*!< number of threads active
+						in a thread class */
+
+	ulint		activity_count;		/*!< For tracking server
+						activity */
+};
+
+#ifndef HAVE_ATOMIC_BUILTINS
+/** Mutex protecting some server global variables. */
+UNIV_INTERN mutex_t	server_mutex;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+static srv_sys_t*	srv_sys	= NULL;
+
+/** Event to signal the monitor thread. */
+UNIV_INTERN os_event_t	srv_monitor_event;
+
+/** Event to signal the error thread */
+UNIV_INTERN os_event_t	srv_error_event;
+
+/** Event to signal the buffer pool dump/load thread */
+UNIV_INTERN os_event_t	srv_buf_dump_event;
+
+/** The buffer pool dump/load file name */
+UNIV_INTERN char*	srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+UNIV_INTERN char	srv_buffer_pool_dump_at_shutdown = FALSE;
+UNIV_INTERN char	srv_buffer_pool_load_at_startup = FALSE;
+
+/** Slot index in the srv_sys->sys_threads array for the purge thread. */
+static const ulint	SRV_PURGE_SLOT	= 1;
+
+/** Slot index in the srv_sys->sys_threads array for the master thread. */
+static const ulint	SRV_MASTER_SLOT = 0;
+
+/*********************************************************************//**
+Prints counters for work done by srv_master_thread. */
+static
+void
+srv_print_master_thread_info(
+/*=========================*/
+	FILE  *file)    /* in: output stream */
+{
+	fprintf(file, "srv_master_thread loops: %lu srv_active, "
+		"%lu srv_shutdown, %lu srv_idle\n",
+		srv_main_active_loops,
+		srv_main_shutdown_loops,
+		srv_main_idle_loops);
+	fprintf(file, "srv_master_thread log flush and writes: %lu\n",
+		srv_log_writes_and_flush);
+}
+
+/*********************************************************************//**
+Sets the info describing an i/o thread current state. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+	ulint		i,	/*!< in: the 'segment' of the i/o thread */
+	const char*	str)	/*!< in: constant char string describing the
+				state */
+{
+	ut_a(i < SRV_MAX_N_IO_THREADS);
+
+	srv_io_thread_op_info[i] = str;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Validates the type of a thread table slot.
+@return TRUE if ok */
+static
+ibool
+srv_thread_type_validate(
+/*=====================*/
+	srv_thread_type	type)	/*!< in: thread type */
+{
+	switch (type) {
+	case SRV_NONE:
+		break;
+	case SRV_WORKER:
+	case SRV_PURGE:
+	case SRV_MASTER:
+		return(TRUE);
+	}
+	ut_error;
+	return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the type of a thread table slot.
+@return thread type */
+static
+srv_thread_type
+srv_slot_get_type(
+/*==============*/
+	const srv_slot_t*	slot)	/*!< in: thread slot */
+{
+	srv_thread_type	type = slot->type;
+	ut_ad(srv_thread_type_validate(type));
+	return(type);
+}
+
+/*********************************************************************//**
+Reserves a slot in the thread table for the current thread.
+@return	reserved slot */
+static
+srv_slot_t*
+srv_reserve_slot(
+/*=============*/
+	srv_thread_type	type)	/*!< in: type of the thread */
+{
+	srv_slot_t*	slot = 0;
+
+	srv_sys_mutex_enter();
+
+	ut_ad(srv_thread_type_validate(type));
+
+	switch (type) {
+	case SRV_MASTER:
+		slot = &srv_sys->sys_threads[SRV_MASTER_SLOT];
+		break;
+
+	case SRV_PURGE:
+		slot = &srv_sys->sys_threads[SRV_PURGE_SLOT];
+		break;
+
+	case SRV_WORKER:
+		/* Find an empty slot, skip the master and purge slots. */
+		for (slot = &srv_sys->sys_threads[2];
+		     slot->in_use;
+		     ++slot) {
+
+			ut_a(slot < &srv_sys->sys_threads[
+			     srv_sys->n_sys_threads]);
+		}
+		break;
+
+	case SRV_NONE:
+		ut_error;
+	}
+
+	ut_a(!slot->in_use);
+
+	slot->in_use = TRUE;
+	slot->suspended = FALSE;
+	slot->type = type;
+
+	ut_ad(srv_slot_get_type(slot) == type);
+
+	++srv_sys->n_threads_active[type];
+
+	srv_sys_mutex_exit();
+
+	return(slot);
+}
+
+/*********************************************************************//**
+Suspends the calling thread to wait for the event in its thread slot.
+@return the current signal count of the event. */
+static
+ib_int64_t
+srv_suspend_thread_low(
+/*===================*/
+	srv_slot_t*	slot)	/*!< in/out: thread slot */
+{
+	ut_ad(srv_sys_mutex_own());
+
+	ut_ad(slot->in_use);
+
+	srv_thread_type	type = srv_slot_get_type(slot);
+
+	switch (type) {
+	case SRV_NONE:
+		ut_error;
+
+	case SRV_MASTER:
+		/* We have only one master thread and it
+		should be the first entry always. */
+		ut_a(srv_sys->n_threads_active[type] == 1);
+		break;
+
+	case SRV_PURGE:
+		/* We have only one purge coordinator thread
+		and it should be the second entry always. */
+		ut_a(srv_sys->n_threads_active[type] == 1);
+		break;
+
+	case SRV_WORKER:
+		ut_a(srv_n_purge_threads > 1);
+		ut_a(srv_sys->n_threads_active[type] > 0);
+		break;
+	}
+
+	ut_a(!slot->suspended);
+	slot->suspended = TRUE;
+
+	ut_a(srv_sys->n_threads_active[type] > 0);
+
+	srv_sys->n_threads_active[type]--;
+
+	return(os_event_reset(slot->event));
+}
+
+/*********************************************************************//**
+Suspends the calling thread to wait for the event in its thread slot.
+@return the current signal count of the event. */
+static
+ib_int64_t
+srv_suspend_thread(
+/*===============*/
+	srv_slot_t*	slot)	/*!< in/out: thread slot */
+{
+	srv_sys_mutex_enter();
+
+	ib_int64_t	sig_count = srv_suspend_thread_low(slot);
+
+	srv_sys_mutex_exit();
+
+	return(sig_count);
+}
+
+/*********************************************************************//**
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller!
+@return number of threads released: this may be less than n if not
+        enough threads were suspended at the moment. */
+UNIV_INTERN
+ulint
+srv_release_threads(
+/*================*/
+	srv_thread_type	type,	/*!< in: thread type */
+	ulint		n)	/*!< in: number of threads to release */
+{
+	ulint		i;
+	ulint		count	= 0;
+
+	ut_ad(srv_thread_type_validate(type));
+	ut_ad(n > 0);
+
+	srv_sys_mutex_enter();
+
+	for (i = 0; i < srv_sys->n_sys_threads; i++) {
+		srv_slot_t*	slot;
+
+		slot = &srv_sys->sys_threads[i];
+
+		if (slot->in_use
+		    && srv_slot_get_type(slot) == type
+		    && slot->suspended) {
+
+			switch (type) {
+			case SRV_NONE:
+				ut_error;
+
+			case SRV_MASTER:
+				/* We have only one master thread and it
+				should be the first entry always. */
+				ut_a(n == 1);
+				ut_a(i == SRV_MASTER_SLOT);
+				ut_a(srv_sys->n_threads_active[type] == 0);
+				break;
+
+			case SRV_PURGE:
+				/* We have only one purge coordinator thread
+				and it should be the second entry always. */
+				ut_a(n == 1);
+				ut_a(i == SRV_PURGE_SLOT);
+				ut_a(srv_n_purge_threads > 0);
+				ut_a(srv_sys->n_threads_active[type] == 0);
+				break;
+
+			case SRV_WORKER:
+				ut_a(srv_n_purge_threads > 1);
+				ut_a(srv_sys->n_threads_active[type]
+				     < srv_n_purge_threads - 1);
+				break;
+			}
+
+			slot->suspended = FALSE;
+
+			++srv_sys->n_threads_active[type];
+
+			os_event_set(slot->event);
+
+			if (++count == n) {
+				break;
+			}
+		}
+	}
+
+	srv_sys_mutex_exit();
+
+	return(count);
+}
+
+/*********************************************************************//**
+Release a thread's slot. */
+static
+void
+srv_free_slot(
+/*==========*/
+	srv_slot_t*	slot)	/*!< in/out: thread slot */
+{
+	srv_sys_mutex_enter();
+
+	if (!slot->suspended) {
+		/* Mark the thread as inactive. */
+		srv_suspend_thread_low(slot);
+	}
+
+	/* Free the slot for reuse. */
+	ut_ad(slot->in_use);
+	slot->in_use = FALSE;
+
+	srv_sys_mutex_exit();
+}
+
+/*********************************************************************//**
+Initializes the server. */
+UNIV_INTERN
+void
+srv_init(void)
+/*==========*/
+{
+	ulint			i;
+	ulint			srv_sys_sz;
+	ulint			n_sys_threads;
+
+#ifndef HAVE_ATOMIC_BUILTINS
+	mutex_create(server_mutex_key, &server_mutex, SYNC_ANY_LATCH);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+	mutex_create(srv_innodb_monitor_mutex_key,
+		     &srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK);
+
+	/* Number of purge threads + master thread */
+	n_sys_threads = srv_n_purge_threads + 1;
+
+	srv_sys_sz = sizeof(*srv_sys) + (n_sys_threads * sizeof(srv_slot_t));
+
+	srv_sys = static_cast<srv_sys_t*>(mem_zalloc(srv_sys_sz));
+
+	mutex_create(srv_sys_mutex_key, &srv_sys->mutex, SYNC_THREADS);
+
+	mutex_create(srv_sys_tasks_mutex_key,
+		     &srv_sys->tasks_mutex, SYNC_ANY_LATCH);
+
+	srv_sys->n_sys_threads = n_sys_threads;
+	srv_sys->sys_threads = (srv_slot_t*) &srv_sys[1];
+
+	for (i = 0; i < srv_sys->n_sys_threads; i++) {
+		srv_slot_t*	slot;
+
+		slot = srv_sys->sys_threads + i;
+
+		slot->event = os_event_create(NULL);
+
+		ut_a(slot->event);
+	}
+
+	srv_error_event = os_event_create(NULL);
+
+	srv_monitor_event = os_event_create(NULL);
+
+	srv_buf_dump_event = os_event_create("buf_dump_event");
+
+	UT_LIST_INIT(srv_sys->tasks);
+
+	/* Create dummy indexes for infimum and supremum records */
+
+	dict_ind_init();
+
+	srv_conc_init();
+
+	/* Initialize some INFORMATION SCHEMA internal structures */
+	trx_i_s_cache_init(trx_i_s_cache);
+
+	ut_crc32_init();
+}
+
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+UNIV_INTERN
+void
+srv_free(void)
+/*==========*/
+{
+	srv_conc_free();
+
+	/* The mutexes srv_sys->mutex and srv_sys->tasks_mutex should have
+	been freed by sync_close() already. */
+	mem_free(srv_sys);
+	srv_sys = NULL;
+
+	trx_i_s_cache_free(trx_i_s_cache);
+
+	os_event_free(srv_buf_dump_event);
+	srv_buf_dump_event = NULL;
+}
+
+/*********************************************************************//**
+Initializes the synchronization primitives, memory system, and the thread
+local storage. */
+UNIV_INTERN
+void
+srv_general_init(void)
+/*==================*/
+{
+	ut_mem_init();
+	/* Reset the system variables in the recovery module. */
+	recv_sys_var_init();
+	os_sync_init();
+	sync_init();
+	mem_init(srv_mem_pool_size);
+	que_init();
+	row_mysql_init();
+}
+
+/*********************************************************************//**
+Normalizes init parameter values to use units we use inside InnoDB.
+@return	DB_SUCCESS or error code */
+static
+ulint
+srv_normalize_init_values(void)
+/*===========================*/
+{
+	ulint	n;
+	ulint	i;
+
+	n = srv_n_data_files;
+
+	for (i = 0; i < n; i++) {
+		srv_data_file_sizes[i] = srv_data_file_sizes[i]
+			* ((1024 * 1024) / UNIV_PAGE_SIZE);
+	}
+
+	srv_last_file_size_max = srv_last_file_size_max
+		* ((1024 * 1024) / UNIV_PAGE_SIZE);
+
+	srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE;
+
+	srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE;
+
+	srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE);
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Boots the InnoDB server.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+srv_boot(void)
+/*==========*/
+{
+	ulint	err;
+
+	/* Transform the init parameter values given by MySQL to
+	use units we use inside InnoDB: */
+
+	err = srv_normalize_init_values();
+
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	/* Initialize synchronization primitives, memory management, and thread
+	local storage */
+
+	srv_general_init();
+
+	/* Initialize this module */
+
+	srv_init();
+
+	return(DB_SUCCESS);
+}
+
+/******************************************************************//**
+Refreshes the values used to calculate per-second averages. */
+static
+void
+srv_refresh_innodb_monitor_stats(void)
+/*==================================*/
+{
+	mutex_enter(&srv_innodb_monitor_mutex);
+
+	srv_last_monitor_time = time(NULL);
+
+	os_aio_refresh_stats();
+
+	btr_cur_n_sea_old = btr_cur_n_sea;
+	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+	log_refresh_stats();
+
+	buf_refresh_io_stats_all();
+
+	srv_n_rows_inserted_old = srv_n_rows_inserted;
+	srv_n_rows_updated_old = srv_n_rows_updated;
+	srv_n_rows_deleted_old = srv_n_rows_deleted;
+	srv_n_rows_read_old = srv_n_rows_read;
+
+	mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
+UNIV_INTERN
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+	FILE*	file,		/*!< in: output stream */
+	ibool	nowait,		/*!< in: whether to wait for the
+				lock_sys_t:: mutex */
+	ulint*	trx_start_pos,	/*!< out: file position of the start of
+				the list of active transactions */
+	ulint*	trx_end)	/*!< out: file position of the end of
+				the list of active transactions */
+{
+	double	time_elapsed;
+	time_t	current_time;
+	ulint	n_reserved;
+	ibool	ret;
+
+	mutex_enter(&srv_innodb_monitor_mutex);
+
+	current_time = time(NULL);
+
+	/* We add 0.001 seconds to time_elapsed to prevent division
+	by zero if two users happen to call SHOW ENGINE INNODB STATUS at the
+	same time */
+
+	time_elapsed = difftime(current_time, srv_last_monitor_time)
+		+ 0.001;
+
+	srv_last_monitor_time = time(NULL);
+
+	fputs("\n=====================================\n", file);
+
+	ut_print_timestamp(file);
+	fprintf(file,
+		" INNODB MONITOR OUTPUT\n"
+		"=====================================\n"
+		"Per second averages calculated from the last %lu seconds\n",
+		(ulong) time_elapsed);
+
+	fputs("-----------------\n"
+	      "BACKGROUND THREAD\n"
+	      "-----------------\n", file);
+	srv_print_master_thread_info(file);
+
+	fputs("----------\n"
+	      "SEMAPHORES\n"
+	      "----------\n", file);
+	sync_print(file);
+
+	/* Conceptually, srv_innodb_monitor_mutex has a very high latching
+	order level in sync0sync.h, while dict_foreign_err_mutex has a very
+	low level 135. Therefore we can reserve the latter mutex here without
+	a danger of a deadlock of threads. */
+
+	mutex_enter(&dict_foreign_err_mutex);
+
+	if (ftell(dict_foreign_err_file) != 0L) {
+		fputs("------------------------\n"
+		      "LATEST FOREIGN KEY ERROR\n"
+		      "------------------------\n", file);
+		ut_copy_file(file, dict_foreign_err_file);
+	}
+
+	mutex_exit(&dict_foreign_err_mutex);
+
+	/* Only if lock_print_info_summary proceeds correctly,
+	before we call the lock_print_info_all_transactions
+	to print all the lock information. IMPORTANT NOTE: This
+	function acquires the lock mutex on success. */
+	ret = lock_print_info_summary(file, nowait);
+
+	if (ret) {
+		if (trx_start_pos) {
+			long	t = ftell(file);
+			if (t < 0) {
+				*trx_start_pos = ULINT_UNDEFINED;
+			} else {
+				*trx_start_pos = (ulint) t;
+			}
+		}
+
+		/* NOTE: If we get here then we have the lock mutex. This
+		function will release the lock mutex that we acquired when
+		we called the lock_print_info_summary() function earlier. */
+
+		lock_print_info_all_transactions(file);
+
+		if (trx_end) {
+			long	t = ftell(file);
+			if (t < 0) {
+				*trx_end = ULINT_UNDEFINED;
+			} else {
+				*trx_end = (ulint) t;
+			}
+		}
+	}
+
+	fputs("--------\n"
+	      "FILE I/O\n"
+	      "--------\n", file);
+	os_aio_print(file);
+
+	fputs("-------------------------------------\n"
+	      "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
+	      "-------------------------------------\n", file);
+	ibuf_print(file);
+
+	ha_print_info(file, btr_search_sys->hash_index);
+
+	fprintf(file,
+		"%.2f hash searches/s, %.2f non-hash searches/s\n",
+		(btr_cur_n_sea - btr_cur_n_sea_old)
+		/ time_elapsed,
+		(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+		/ time_elapsed);
+	btr_cur_n_sea_old = btr_cur_n_sea;
+	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+	fputs("---\n"
+	      "LOG\n"
+	      "---\n", file);
+	log_print(file);
+
+	fputs("----------------------\n"
+	      "BUFFER POOL AND MEMORY\n"
+	      "----------------------\n", file);
+	fprintf(file,
+		"Total memory allocated " ULINTPF
+		"; in additional pool allocated " ULINTPF "\n",
+		ut_total_allocated_memory,
+		mem_pool_get_reserved(mem_comm_pool));
+	fprintf(file, "Dictionary memory allocated " ULINTPF "\n",
+		dict_sys->size);
+
+	buf_print_io(file);
+
+	fputs("--------------\n"
+	      "ROW OPERATIONS\n"
+	      "--------------\n", file);
+	fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n",
+		(long) srv_conc_get_active_threads(),
+		srv_conc_get_waiting_threads());
+
+	/* This is a dirty read, without holding trx_sys->mutex. */
+	fprintf(file, "%lu read views open inside InnoDB\n",
+		UT_LIST_GET_LEN(trx_sys->view_list));
+
+	n_reserved = fil_space_get_n_reserved_extents(0);
+	if (n_reserved > 0) {
+		fprintf(file,
+			"%lu tablespace extents now reserved for"
+			" B-tree split operations\n",
+			(ulong) n_reserved);
+	}
+
+#ifdef UNIV_LINUX
+	fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n",
+		(ulong) srv_main_thread_process_no,
+		(ulong) srv_main_thread_id,
+		srv_main_thread_op_info);
+#else
+	fprintf(file, "Main thread id %lu, state: %s\n",
+		(ulong) srv_main_thread_id,
+		srv_main_thread_op_info);
+#endif
+	fprintf(file,
+		"Number of rows inserted " ULINTPF
+		", updated " ULINTPF ", deleted " ULINTPF
+		", read " ULINTPF "\n",
+		srv_n_rows_inserted,
+		srv_n_rows_updated,
+		srv_n_rows_deleted,
+		srv_n_rows_read);
+	fprintf(file,
+		"%.2f inserts/s, %.2f updates/s,"
+		" %.2f deletes/s, %.2f reads/s\n",
+		(srv_n_rows_inserted - srv_n_rows_inserted_old)
+		/ time_elapsed,
+		(srv_n_rows_updated - srv_n_rows_updated_old)
+		/ time_elapsed,
+		(srv_n_rows_deleted - srv_n_rows_deleted_old)
+		/ time_elapsed,
+		(srv_n_rows_read - srv_n_rows_read_old)
+		/ time_elapsed);
+
+	srv_n_rows_inserted_old = srv_n_rows_inserted;
+	srv_n_rows_updated_old = srv_n_rows_updated;
+	srv_n_rows_deleted_old = srv_n_rows_deleted;
+	srv_n_rows_read_old = srv_n_rows_read;
+
+	fputs("----------------------------\n"
+	      "END OF INNODB MONITOR OUTPUT\n"
+	      "============================\n", file);
+	mutex_exit(&srv_innodb_monitor_mutex);
+	fflush(file);
+
+	return(ret);
+}
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+UNIV_INTERN
+void
+srv_export_innodb_status(void)
+/*==========================*/
+{
+	buf_pool_stat_t	stat;
+	ulint		LRU_len;
+	ulint		free_len;
+	ulint		flush_list_len;
+
+	buf_get_total_stat(&stat);
+	buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+
+	mutex_enter(&srv_innodb_monitor_mutex);
+
+	export_vars.innodb_data_pending_reads
+		= os_n_pending_reads;
+	export_vars.innodb_data_pending_writes
+		= os_n_pending_writes;
+	export_vars.innodb_data_pending_fsyncs
+		= fil_n_pending_log_flushes
+		+ fil_n_pending_tablespace_flushes;
+	export_vars.innodb_data_fsyncs = os_n_fsyncs;
+	export_vars.innodb_data_read = srv_data_read;
+	export_vars.innodb_data_reads = os_n_file_reads;
+	export_vars.innodb_data_writes = os_n_file_writes;
+	export_vars.innodb_data_written = srv_data_written;
+	export_vars.innodb_buffer_pool_read_requests = stat.n_page_gets;
+	export_vars.innodb_buffer_pool_write_requests
+		= srv_buf_pool_write_requests;
+	export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free;
+	export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed;
+	export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads;
+	export_vars.innodb_buffer_pool_read_ahead_rnd
+		= stat.n_ra_pages_read_rnd;
+	export_vars.innodb_buffer_pool_read_ahead
+		= stat.n_ra_pages_read;
+	export_vars.innodb_buffer_pool_read_ahead_evicted
+		= stat.n_ra_pages_evicted;
+	export_vars.innodb_buffer_pool_pages_data = LRU_len;
+	export_vars.innodb_buffer_pool_pages_dirty = flush_list_len;
+	export_vars.innodb_buffer_pool_pages_free = free_len;
+#ifdef UNIV_DEBUG
+	export_vars.innodb_buffer_pool_pages_latched
+		= buf_get_latched_pages_number();
+#endif /* UNIV_DEBUG */
+	export_vars.innodb_buffer_pool_pages_total = buf_pool_get_n_pages();
+
+	export_vars.innodb_buffer_pool_pages_misc
+		= buf_pool_get_n_pages() - LRU_len - free_len;
+#ifdef HAVE_ATOMIC_BUILTINS
+	export_vars.innodb_have_atomic_builtins = 1;
+#else
+	export_vars.innodb_have_atomic_builtins = 0;
+#endif
+	export_vars.innodb_page_size = UNIV_PAGE_SIZE;
+	export_vars.innodb_log_waits = srv_log_waits;
+	export_vars.innodb_os_log_written = srv_os_log_written;
+	export_vars.innodb_os_log_fsyncs = fil_n_log_flushes;
+	export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes;
+	export_vars.innodb_os_log_pending_writes = srv_os_log_pending_writes;
+	export_vars.innodb_log_write_requests = srv_log_write_requests;
+	export_vars.innodb_log_writes = srv_log_writes;
+	export_vars.innodb_dblwr_pages_written = srv_dblwr_pages_written;
+	export_vars.innodb_dblwr_writes = srv_dblwr_writes;
+	export_vars.innodb_pages_created = stat.n_pages_created;
+	export_vars.innodb_pages_read = stat.n_pages_read;
+	export_vars.innodb_pages_written = stat.n_pages_written;
+	export_vars.innodb_row_lock_waits = srv_n_lock_wait_count;
+	export_vars.innodb_row_lock_current_waits
+		= srv_n_lock_wait_current_count;
+	export_vars.innodb_row_lock_time = srv_n_lock_wait_time / 1000;
+	if (srv_n_lock_wait_count > 0) {
+		export_vars.innodb_row_lock_time_avg = (ulint)
+			(srv_n_lock_wait_time / 1000 / srv_n_lock_wait_count);
+	} else {
+		export_vars.innodb_row_lock_time_avg = 0;
+	}
+	export_vars.innodb_row_lock_time_max
+		= srv_n_lock_max_wait_time / 1000;
+	export_vars.innodb_rows_read = srv_n_rows_read;
+	export_vars.innodb_rows_inserted = srv_n_rows_inserted;
+	export_vars.innodb_rows_updated = srv_n_rows_updated;
+	export_vars.innodb_rows_deleted = srv_n_rows_deleted;
+	export_vars.innodb_num_open_files = fil_n_file_opened;
+	export_vars.innodb_truncated_status_writes = srv_truncated_status_writes;
+	export_vars.innodb_available_undo_logs = srv_available_undo_logs;
+
+	mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_monitor_thread)(
+/*===============================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	ib_int64_t	sig_count;
+	double		time_elapsed;
+	time_t		current_time;
+	time_t		last_table_monitor_time;
+	time_t		last_tablespace_monitor_time;
+	time_t		last_monitor_time;
+	ulint		mutex_skipped;
+	ibool		last_srv_print_monitor;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Lock timeout thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_monitor_thread_key);
+#endif
+	srv_monitor_active = TRUE;
+
+	UT_NOT_USED(arg);
+	srv_last_monitor_time = ut_time();
+	last_table_monitor_time = ut_time();
+	last_tablespace_monitor_time = ut_time();
+	last_monitor_time = ut_time();
+	mutex_skipped = 0;
+	last_srv_print_monitor = srv_print_innodb_monitor;
+loop:
+	/* Wake up every 5 seconds to see if we need to print
+	monitor information or if signalled at shutdown. */
+
+	sig_count = os_event_reset(srv_monitor_event);
+
+	os_event_wait_time_low(srv_monitor_event, 5000000, sig_count);
+
+	current_time = ut_time();
+
+	time_elapsed = difftime(current_time, last_monitor_time);
+
+	if (time_elapsed > 15) {
+		last_monitor_time = ut_time();
+
+		if (srv_print_innodb_monitor) {
+			/* Reset mutex_skipped counter everytime
+			srv_print_innodb_monitor changes. This is to
+			ensure we will not be blocked by lock_sys->mutex
+			for short duration information printing,
+			such as requested by sync_array_print_long_waits() */
+			if (!last_srv_print_monitor) {
+				mutex_skipped = 0;
+				last_srv_print_monitor = TRUE;
+			}
+
+			if (!srv_printf_innodb_monitor(stderr,
+						MUTEX_NOWAIT(mutex_skipped),
+						NULL, NULL)) {
+				mutex_skipped++;
+			} else {
+				/* Reset the counter */
+				mutex_skipped = 0;
+			}
+		} else {
+			last_srv_print_monitor = FALSE;
+		}
+
+
+		if (srv_innodb_status) {
+			mutex_enter(&srv_monitor_file_mutex);
+			rewind(srv_monitor_file);
+			if (!srv_printf_innodb_monitor(srv_monitor_file,
+						MUTEX_NOWAIT(mutex_skipped),
+						NULL, NULL)) {
+				mutex_skipped++;
+			} else {
+				mutex_skipped = 0;
+			}
+
+			os_file_set_eof(srv_monitor_file);
+			mutex_exit(&srv_monitor_file_mutex);
+		}
+
+		if (srv_print_innodb_tablespace_monitor
+		    && difftime(current_time,
+				last_tablespace_monitor_time) > 60) {
+			last_tablespace_monitor_time = ut_time();
+
+			fputs("========================"
+			      "========================\n",
+			      stderr);
+
+			ut_print_timestamp(stderr);
+
+			fputs(" INNODB TABLESPACE MONITOR OUTPUT\n"
+			      "========================"
+			      "========================\n",
+			      stderr);
+
+			fsp_print(0);
+			fputs("Validating tablespace\n", stderr);
+			fsp_validate(0);
+			fputs("Validation ok\n"
+			      "---------------------------------------\n"
+			      "END OF INNODB TABLESPACE MONITOR OUTPUT\n"
+			      "=======================================\n",
+			      stderr);
+		}
+
+		if (srv_print_innodb_table_monitor
+		    && difftime(current_time, last_table_monitor_time) > 60) {
+
+			last_table_monitor_time = ut_time();
+
+			fprintf(stderr, "Warning: %s\n",
+				DEPRECATED_MSG_INNODB_TABLE_MONITOR);
+
+			fputs("===========================================\n",
+			      stderr);
+
+			ut_print_timestamp(stderr);
+
+			fputs(" INNODB TABLE MONITOR OUTPUT\n"
+			      "===========================================\n",
+			      stderr);
+			dict_print();
+
+			fputs("-----------------------------------\n"
+			      "END OF INNODB TABLE MONITOR OUTPUT\n"
+			      "==================================\n",
+			      stderr);
+
+			fprintf(stderr, "Warning: %s\n",
+				DEPRECATED_MSG_INNODB_TABLE_MONITOR);
+		}
+	}
+
+	if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+		goto exit_func;
+	}
+
+	if (srv_print_innodb_monitor
+	    || srv_print_innodb_lock_monitor
+	    || srv_print_innodb_tablespace_monitor
+	    || srv_print_innodb_table_monitor) {
+		goto loop;
+	}
+
+	goto loop;
+
+exit_func:
+	srv_monitor_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_error_monitor_thread)(
+/*=====================================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	/* number of successive fatal timeouts observed */
+	ulint		fatal_cnt	= 0;
+	lsn_t		old_lsn;
+	lsn_t		new_lsn;
+	ib_int64_t	sig_count;
+	/* longest waiting thread for a semaphore */
+	os_thread_id_t	waiter		= os_thread_get_curr_id();
+	os_thread_id_t	old_waiter	= waiter;
+	/* the semaphore that is being waited for */
+	const void*	sema		= NULL;
+	const void*	old_sema	= NULL;
+
+	old_lsn = srv_start_lsn;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Error monitor thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_error_monitor_thread_key);
+#endif
+	srv_error_monitor_active = TRUE;
+
+loop:
+	/* Try to track a strange bug reported by Harald Fuchs and others,
+	where the lsn seems to decrease at times */
+
+	new_lsn = log_get_lsn();
+
+	if (new_lsn < old_lsn) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: old log sequence number " LSN_PF
+			" was greater\n"
+			"InnoDB: than the new log sequence number " LSN_PF "!\n"
+			"InnoDB: Please submit a bug report"
+			" to http://bugs.mysql.com\n",
+			old_lsn, new_lsn);
+		ut_ad(0);
+	}
+
+	old_lsn = new_lsn;
+
+	if (difftime(time(NULL), srv_last_monitor_time) > 60) {
+		/* We referesh InnoDB Monitor values so that averages are
+		printed from at most 60 last seconds */
+
+		srv_refresh_innodb_monitor_stats();
+	}
+
+	/* Update the statistics collected for deciding LRU
+	eviction policy. */
+	buf_LRU_stat_update();
+
+	/* Update the statistics collected for flush rate policy. */
+	buf_flush_stat_update();
+
+	/* In case mutex_exit is not a memory barrier, it is
+	theoretically possible some threads are left waiting though
+	the semaphore is already released. Wake up those threads: */
+
+	sync_arr_wake_threads_if_sema_free();
+
+	if (sync_array_print_long_waits(&waiter, &sema)
+	    && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
+		fatal_cnt++;
+		if (fatal_cnt > 10) {
+
+			fprintf(stderr,
+				"InnoDB: Error: semaphore wait has lasted"
+				" > %lu seconds\n"
+				"InnoDB: We intentionally crash the server,"
+				" because it appears to be hung.\n",
+				(ulong) srv_fatal_semaphore_wait_threshold);
+
+			ut_error;
+		}
+	} else {
+		fatal_cnt = 0;
+		old_waiter = waiter;
+		old_sema = sema;
+	}
+
+	/* Flush stderr so that a database user gets the output
+	to possible MySQL error file */
+
+	fflush(stderr);
+
+	sig_count = os_event_reset(srv_error_event);
+
+	os_event_wait_time_low(srv_error_event, 1000000, sig_count);
+
+	if (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP) {
+
+		goto loop;
+	}
+
+	srv_error_monitor_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/******************************************************************//**
+Increment the server activity count. */
+UNIV_INTERN
+void
+srv_inc_activity_count(void)
+/*========================*/
+{
+	++srv_sys->activity_count;
+}
+
+/**********************************************************************//**
+Check whether any background thread is active. If so return the thread
+type.
+@return SRV_NONE if all are suspended or have exited, thread
+type if any are still active. */
+UNIV_INTERN
+srv_thread_type
+srv_get_active_thread_type(void)
+/*============================*/
+{
+	ulint	i;
+	srv_thread_type ret = SRV_NONE;
+
+	srv_sys_mutex_enter();
+
+	for (i = SRV_WORKER; i <= SRV_MASTER; ++i) {
+		if (srv_sys->n_threads_active[i] != 0) {
+			ret = static_cast<srv_thread_type>(i);
+			break;
+		}
+	}
+
+	srv_sys_mutex_exit();
+
+	/* Check only on shutdown. */
+	if (ret == SRV_NONE
+	    && srv_shutdown_state != SRV_SHUTDOWN_NONE
+	    && trx_purge_state() != PURGE_STATE_EXIT) {
+
+		ret = SRV_PURGE;
+	}
+
+	return(ret);
+}
+
+/**********************************************************************//**
+Check whether any background thread are active. If so print which thread
+is active. Send the threads wakeup signal.
+@return name of thread that is active or NULL */
+UNIV_INTERN
+const char*
+srv_any_background_threads_are_active(void)
+/*=======================================*/
+{
+	const char*	thread_active = NULL;
+
+	if (srv_error_monitor_active) {
+		thread_active = "srv_error_monitor_thread";
+	} else if (srv_lock_timeout_active) {
+		thread_active = "srv_lock_timeout thread";
+	} else if (srv_monitor_active) {
+		thread_active = "srv_monitor_thread";
+	} else if (srv_buf_dump_thread_active) {
+		thread_active = "buf_dump_thread";
+	}
+
+	os_event_set(srv_error_event);
+	os_event_set(srv_monitor_event);
+	os_event_set(srv_timeout_event);
+	os_event_set(srv_buf_dump_event);
+
+	return(thread_active);
+}
+
+/*******************************************************************//**
+Tells the InnoDB server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the
+srv_sys_t->mutex, for performance reasons). */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void)
+/*===============================*/
+{
+	ut_ad(!srv_sys_mutex_own());
+
+	srv_inc_activity_count();
+
+	if (srv_sys->n_threads_active[SRV_MASTER] == 0) {
+		srv_slot_t*	slot;
+
+		srv_sys_mutex_enter();
+
+		slot = &srv_sys->sys_threads[SRV_MASTER_SLOT];
+
+		/* Only if the master thread has been started. */
+
+		if (slot->in_use) {
+			ut_a(srv_slot_get_type(slot) == SRV_MASTER);
+
+			if (slot->suspended) {
+
+				slot->suspended = FALSE;
+
+				++srv_sys->n_threads_active[SRV_MASTER];
+
+				os_event_set(slot->event);
+			}
+		}
+
+		srv_sys_mutex_exit();
+	}
+}
+
+/*******************************************************************//**
+Tells the purge thread that there has been activity in the database
+and wakes up the purge thread if it is suspended (not sleeping).  Note
+that there is a small chance that the purge thread stays suspended
+(we do not protect our check with the srv_sys_t:mutex and the
+purge_sys->latch, for performance reasons). */
+UNIV_INTERN
+void
+srv_wake_purge_thread_if_not_active(void)
+/*=====================================*/
+{
+	ut_ad(!srv_sys_mutex_own());
+
+	if (purge_sys->state == PURGE_STATE_RUN
+	    && srv_sys->n_threads_active[SRV_PURGE] == 0) {
+
+		srv_release_threads(SRV_PURGE, 1);
+	}
+}
+
+/*******************************************************************//**
+Wakes up the master thread if it is suspended or being suspended. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void)
+/*========================*/
+{
+	ut_ad(!srv_sys_mutex_own());
+
+	srv_inc_activity_count();
+
+	srv_release_threads(SRV_MASTER, 1);
+}
+
+/*******************************************************************//**
+Get current server activity count. We don't hold srv_sys::mutex while
+reading this value as it is only used in heuristics.
+@return activity count. */
+UNIV_INTERN
+ulint
+srv_get_activity_count(void)
+/*========================*/
+{
+	return(srv_sys->activity_count);
+}
+
+/*******************************************************************//**
+Check if there has been any activity.
+@return FALSE if no change in activity counter. */
+UNIV_INTERN
+ibool
+srv_check_activity(
+/*===============*/
+	ulint		old_activity_count)	/*!< in: old activity count */
+{
+	return(srv_sys->activity_count != old_activity_count);
+}
+
+/********************************************************************//**
+The master thread is tasked to ensure that flush of log file happens
+once every second in the background. This is to ensure that not more
+than one second of trxs are lost in case of crash when
+innodb_flush_logs_at_trx_commit != 1 */
+static
+void
+srv_sync_log_buffer_in_background(void)
+/*===================================*/
+{
+	time_t	current_time = time(NULL);
+
+	srv_main_thread_op_info = "flushing log";
+	if (difftime(current_time, srv_last_log_flush_time) >= 1) {
+		log_buffer_sync_in_background(TRUE);
+		srv_last_log_flush_time = current_time;
+		srv_log_writes_and_flush++;
+	}
+}
+
+/********************************************************************//**
+Make room in the table cache by evicting an unused table.
+@return number of tables evicted. */
+static
+ulint
+srv_master_evict_from_table_cache(
+/*==============================*/
+	ulint	pct_check)	/*!< in: max percent to check */
+{
+	ulint	n_tables_evicted = 0;
+
+	rw_lock_x_lock(&dict_operation_lock);
+
+	dict_mutex_enter_for_mysql();
+
+	n_tables_evicted = dict_make_room_in_cache(
+		innobase_get_table_cache_size(), pct_check);
+
+	dict_mutex_exit_for_mysql();
+
+	rw_lock_x_unlock(&dict_operation_lock);
+
+	return(n_tables_evicted);
+}
+
+/*********************************************************************//**
+This function prints progress message every 60 seconds during server
+shutdown, for any activities that master thread is pending on. */
+static
+void
+srv_shutdown_print_master_pending(
+/*==============================*/
+	ib_time_t*	last_print_time,	/*!< last time the function
+						print the message */
+	ulint		n_tables_to_drop,	/*!< number of tables to
+						be dropped */
+	ulint		n_bytes_merged)		/*!< number of change buffer
+						just merged */
+{
+	ib_time_t	current_time;
+	double		time_elapsed;
+
+	current_time = ut_time();
+	time_elapsed = ut_difftime(current_time, *last_print_time);
+
+	if (time_elapsed > 60) {
+		*last_print_time = ut_time();
+
+		if (n_tables_to_drop) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Waiting for "
+				"%lu table(s) to be dropped\n",
+				(ulong) n_tables_to_drop);
+		}
+
+		/* Check change buffer merge, we only wait for change buffer
+		merge if it is a slow shutdown */
+		if (!srv_fast_shutdown && n_bytes_merged) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: Waiting for change "
+				"buffer merge to complete\n"
+				"  InnoDB: number of bytes of change buffer "
+				"just merged:  %lu\n",
+				n_bytes_merged);
+		}
+	}
+}
+
+/*********************************************************************//**
+Perform the tasks that the master thread is supposed to do when the
+server is active. There are two types of tasks. The first category is
+of such tasks which are performed at each inovcation of this function.
+We assume that this function is called roughly every second when the
+server is active. The second category is of such tasks which are
+performed at some interval e.g.: purge, dict_LRU cleanup etc. */
+static
+void
+srv_master_do_active_tasks(void)
+/*============================*/
+{
+	ib_time_t	cur_time = ut_time();
+	ullint		counter_time = ut_time_us(NULL);
+
+	/* First do the tasks that we are suppose to do at each
+	invocation of this function. */
+
+	++srv_main_active_loops;
+
+	MONITOR_INC(MONITOR_MASTER_ACTIVE_LOOPS);
+
+	/* ALTER TABLE in MySQL requires on Unix that the table handler
+	can drop tables lazily after there no longer are SELECT
+	queries to them. */
+	srv_main_thread_op_info = "doing background drop tables";
+	row_drop_tables_for_mysql_in_background();
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND, counter_time);
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	/* make sure that there is enough reusable space in the redo
+	log files */
+	srv_main_thread_op_info = "checking free log space";
+	log_free_check();
+
+	/* Do an ibuf merge */
+	srv_main_thread_op_info = "doing insert buffer merge";
+	counter_time = ut_time_us(NULL);
+	ibuf_contract_in_background(FALSE);
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time);
+
+	/* Flush logs if needed */
+	srv_main_thread_op_info = "flushing log";
+	srv_sync_log_buffer_in_background();
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
+
+	/* Now see if various tasks that are performed at defined
+	intervals need to be performed. */
+
+#ifdef MEM_PERIODIC_CHECK
+	/* Check magic numbers of every allocated mem block once in
+	SRV_MASTER_MEM_VALIDATE_INTERVAL seconds */
+	if (cur_time % SRV_MASTER_MEM_VALIDATE_INTERVAL == 0) {
+		mem_validate_all_blocks();
+		MONITOR_INC_TIME_IN_MICRO_SECS(
+			MONITOR_SRV_MEM_VALIDATE_MICROSECOND, counter_time);
+	}
+#endif
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	if (cur_time % SRV_MASTER_DICT_LRU_INTERVAL == 0) {
+		srv_main_thread_op_info = "enforcing dict cache limit";
+		srv_master_evict_from_table_cache(50);
+		MONITOR_INC_TIME_IN_MICRO_SECS(
+			MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
+	}
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	/* Make a new checkpoint */
+	if (cur_time % SRV_MASTER_CHECKPOINT_INTERVAL == 0) {
+		srv_main_thread_op_info = "making checkpoint";
+		log_checkpoint(TRUE, FALSE);
+		MONITOR_INC_TIME_IN_MICRO_SECS(
+			MONITOR_SRV_CHECKPOINT_MICROSECOND, counter_time);
+	}
+}
+
+/*********************************************************************//**
+Perform the tasks that the master thread is supposed to do whenever the
+server is idle. We do check for the server state during this function
+and if the server has entered the shutdown phase we may return from
+the function without completing the required tasks.
+Note that the server can move to active state when we are executing this
+function but we don't check for that as we are suppose to perform more
+or less same tasks when server is active. */
+static
+void
+srv_master_do_idle_tasks(void)
+/*==========================*/
+{
+	ullint	counter_time;
+
+	++srv_main_idle_loops;
+
+	MONITOR_INC(MONITOR_MASTER_IDLE_LOOPS);
+
+
+	/* ALTER TABLE in MySQL requires on Unix that the table handler
+	can drop tables lazily after there no longer are SELECT
+	queries to them. */
+	counter_time = ut_time_us(NULL);
+	srv_main_thread_op_info = "doing background drop tables";
+	row_drop_tables_for_mysql_in_background();
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
+			 counter_time);
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	/* make sure that there is enough reusable space in the redo
+	log files */
+	srv_main_thread_op_info = "checking free log space";
+	log_free_check();
+
+	/* Do an ibuf merge */
+	counter_time = ut_time_us(NULL);
+	srv_main_thread_op_info = "doing insert buffer merge";
+	ibuf_contract_in_background(TRUE);
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time);
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	srv_main_thread_op_info = "enforcing dict cache limit";
+	srv_master_evict_from_table_cache(100);
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
+
+	/* Flush logs if needed */
+	srv_sync_log_buffer_in_background();
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
+
+	if (srv_shutdown_state > 0) {
+		return;
+	}
+
+	/* Make a new checkpoint */
+	srv_main_thread_op_info = "making checkpoint";
+	log_checkpoint(TRUE, FALSE);
+	MONITOR_INC_TIME_IN_MICRO_SECS(MONITOR_SRV_CHECKPOINT_MICROSECOND,
+				       counter_time);
+}
+
+/*********************************************************************//**
+Perform the tasks during shutdown. The tasks that we do at shutdown
+depend on srv_fast_shutdown:
+2 => very fast shutdown => do no book keeping
+1 => normal shutdown => clear drop table queue and make checkpoint
+0 => slow shutdown => in addition to above do complete purge and ibuf
+merge
+@return TRUE if some work was done. FALSE otherwise */
+static
+ibool
+srv_master_do_shutdown_tasks(
+/*=========================*/
+	ib_time_t*	last_print_time)/*!< last time the function
+					print the message */
+{
+	ulint		n_bytes_merged = 0;
+	ulint		n_tables_to_drop = 0;
+
+	++srv_main_shutdown_loops;
+
+	ut_a(srv_shutdown_state > 0);
+
+	/* In very fast shutdown none of the following is necessary */
+	if (srv_fast_shutdown == 2) {
+		return(FALSE);
+	}
+
+	/* ALTER TABLE in MySQL requires on Unix that the table handler
+	can drop tables lazily after there no longer are SELECT
+	queries to them. */
+	srv_main_thread_op_info = "doing background drop tables";
+	n_tables_to_drop = row_drop_tables_for_mysql_in_background();
+
+	/* make sure that there is enough reusable space in the redo
+	log files */
+	srv_main_thread_op_info = "checking free log space";
+	log_free_check();
+
+	/* In case of normal shutdown we don't do ibuf merge or purge */
+	if (srv_fast_shutdown == 1) {
+		goto func_exit;
+	}
+
+	/* Do an ibuf merge */
+	srv_main_thread_op_info = "doing insert buffer merge";
+	n_bytes_merged = ibuf_contract_in_background(TRUE);
+
+	/* Flush logs if needed */
+	srv_sync_log_buffer_in_background();
+
+func_exit:
+	/* Make a new checkpoint about once in 10 seconds */
+	srv_main_thread_op_info = "making checkpoint";
+	log_checkpoint(TRUE, FALSE);
+
+	/* Print progress message every 60 seconds during shutdown */
+	if (srv_shutdown_state > 0 && srv_print_verbose_log) {
+		srv_shutdown_print_master_pending(
+			last_print_time, n_tables_to_drop, n_bytes_merged);
+	}
+
+	return(n_bytes_merged || n_tables_to_drop);
+}
+
+/*********************************************************************//**
+Puts master thread to sleep. At this point we are using polling to
+service various activities. Master thread sleeps for one second before
+checking the state of the server again */
+static
+void
+srv_master_sleep(void)
+/*==================*/
+{
+	srv_main_thread_op_info = "sleeping";
+	os_thread_sleep(1000000);
+	srv_main_thread_op_info = "";
+}
+
+/*********************************************************************//**
+The master thread controlling the server.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_master_thread)(
+/*==============================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	srv_slot_t*	slot;
+	ulint		old_activity_count = srv_get_activity_count();
+	ib_time_t	last_print_time;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Master thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_master_thread_key);
+#endif
+
+	srv_main_thread_process_no = os_proc_get_number();
+	srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
+
+	slot = srv_reserve_slot(SRV_MASTER);
+	ut_a(slot == srv_sys->sys_threads);
+
+	last_print_time = ut_time();
+loop:
+	if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
+		goto suspend_thread;
+	}
+
+	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+		srv_master_sleep();
+
+		MONITOR_INC(MONITOR_MASTER_THREAD_SLEEP);
+
+		if (srv_check_activity(old_activity_count)) {
+			old_activity_count = srv_get_activity_count();
+			srv_master_do_active_tasks();
+		} else {
+			srv_master_do_idle_tasks();
+		}
+	}
+
+	while (srv_master_do_shutdown_tasks(&last_print_time)) {
+
+		/* Shouldn't loop here in case of very fast shutdown */
+		ut_ad(srv_fast_shutdown < 2);
+	}
+
+suspend_thread:
+	srv_main_thread_op_info = "suspending";
+
+	srv_suspend_thread(slot);
+
+	/* DO NOT CHANGE THIS STRING. innobase_start_or_create_for_mysql()
+	waits for database activity to die down when converting < 4.1.x
+	databases, and relies on this string being exactly as it is. InnoDB
+	manual also mentions this string in several places. */
+	srv_main_thread_op_info = "waiting for server activity";
+
+	os_event_wait(slot->event);
+
+	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+		os_thread_exit(NULL);
+	}
+
+	goto loop;
+
+	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
+}
+
+/*********************************************************************//**
+Check if purge should stop.
+@return true if it should shutdown. */
+static
+bool
+srv_purge_should_exit(
+/*==============*/
+	ulint		n_purged)	/*!< in: pages purged in last batch */
+{
+	switch (srv_shutdown_state) {
+	case SRV_SHUTDOWN_NONE:
+		/* Normal operation. */
+		break;
+
+	case SRV_SHUTDOWN_CLEANUP:
+	case SRV_SHUTDOWN_EXIT_THREADS:
+		/* Exit unless slow shutdown requested or all done. */
+		return(srv_fast_shutdown != 0 || n_purged == 0);
+
+	case SRV_SHUTDOWN_LAST_PHASE:
+	case SRV_SHUTDOWN_FLUSH_PHASE:
+		ut_error;
+	}
+
+	return(false);
+}
+
+/*********************************************************************//**
+Fetch and execute a task from the work queue.
+@return	true if a task was executed */
+static
+bool
+srv_task_execute(void)
+/*==================*/
+{
+	que_thr_t*	thr = NULL;
+
+	ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+
+	mutex_enter(&srv_sys->tasks_mutex);
+
+	if (UT_LIST_GET_LEN(srv_sys->tasks) > 0) {
+
+		thr = UT_LIST_GET_FIRST(srv_sys->tasks);
+
+		ut_a(que_node_get_type(thr->child) == QUE_NODE_PURGE);
+
+		UT_LIST_REMOVE(queue, srv_sys->tasks, thr);
+	}
+
+	mutex_exit(&srv_sys->tasks_mutex);
+
+	if (thr != NULL) {
+
+		que_run_threads(thr);
+
+		os_atomic_inc_ulint(
+			&purge_sys->bh_mutex, &purge_sys->n_completed, 1);
+	}
+
+	return(thr != NULL);
+}
+
+/*********************************************************************//**
+Worker thread that reads tasks from the work queue and executes them.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_worker_thread)(
+/*==============================*/
+	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
+						required by os_thread_create */
+{
+	srv_slot_t*	slot;
+
+	ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: worker thread starting, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+	slot = srv_reserve_slot(SRV_WORKER);
+
+	ut_a(srv_n_purge_threads > 1);
+
+	srv_sys_mutex_enter();
+
+	ut_a(srv_sys->n_threads_active[SRV_WORKER] < srv_n_purge_threads);
+
+	srv_sys_mutex_exit();
+
+	/* We need to ensure that the worker threads exit after the
+	purge coordinator thread. Otherwise the purge coordinaor can
+	end up waiting forever in trx_purge_wait_for_workers_to_complete() */
+
+	do {
+		srv_suspend_thread(slot);
+
+		os_event_wait(slot->event);
+
+		if (srv_task_execute()) {
+
+			/* If there are tasks in the queue, wakeup
+			the purge coordinator thread. */
+
+			srv_wake_purge_thread_if_not_active();
+		}
+
+		/* Note: we are checking the state without holding the
+		purge_sys->latch here. */
+	} while (purge_sys->state != PURGE_STATE_EXIT);
+
+	srv_free_slot(slot);
+
+	rw_lock_x_lock(&purge_sys->latch);
+
+	ut_a(!purge_sys->running);
+	ut_a(purge_sys->state == PURGE_STATE_EXIT);
+	ut_a(srv_shutdown_state > SRV_SHUTDOWN_NONE);
+
+	rw_lock_x_unlock(&purge_sys->latch);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: Purge worker thread exiting, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
+}
+
+/*********************************************************************//**
+Do the actual purge operation.
+@return length of history list before the last purge batch. */
+static
+ulint
+srv_do_purge(
+/*=========*/
+	ulint		n_threads,	/*!< in: number of threads to use */
+	ulint*		n_total_purged)	/*!< in/out: total pages purged */
+{
+	ulint		n_pages_purged;
+
+	static ulint	count = 0;
+	static ulint	n_use_threads = 0;
+	static ulint	rseg_history_len = 0;
+	ulint		old_activity_count = srv_get_activity_count();
+
+	ut_a(n_threads > 0);
+
+	/* Purge until there are no more records to purge and there is
+	no change in configuration or server state. If the user has
+	configured more than one purge thread then we treat that as a
+	pool of threads and only use the extra threads if purge can't
+	keep up with updates. */
+
+	if (n_use_threads == 0) {
+		n_use_threads = n_threads;
+	}
+
+	do {
+		if (trx_sys->rseg_history_len > rseg_history_len) {
+
+			/* History length is now longer than what it was
+			when we took the last snapshot. Use more threads. */
+
+			if (n_use_threads < n_threads) {
+				++n_use_threads;
+			}
+
+		} else if (srv_check_activity(old_activity_count)
+			   && n_use_threads > 1) {
+
+			/* History length same or smaller since last snapshot,
+			use fewer threads. */
+
+			--n_use_threads;
+
+			old_activity_count = srv_get_activity_count();
+		}
+
+		/* Ensure that the purge threads are less than what
+		was configured. */
+
+		ut_a(n_use_threads > 0);
+		ut_a(n_use_threads <= n_threads);
+
+		/* Take a snapshot of the history list before purge. */
+		if ((rseg_history_len = trx_sys->rseg_history_len) == 0) {
+			break;
+		}
+
+		n_pages_purged = trx_purge(
+			n_use_threads, srv_purge_batch_size, false);
+
+		if (!(count++ % TRX_SYS_N_RSEGS) || n_pages_purged == 0) {
+			/* Force a truncate of the history list. */
+			trx_purge(1, srv_purge_batch_size, true);
+		}
+
+		*n_total_purged += n_pages_purged;
+
+	} while (!srv_purge_should_exit(n_pages_purged) && n_pages_purged > 0);
+
+	return(rseg_history_len);
+}
+
+/*********************************************************************//**
+Suspend the purge coordinator thread. */
+static
+void
+srv_purge_coordinator_suspend(
+/*==========================*/
+	srv_slot_t*	slot,			/*!< in/out: Purge coordinator
+						thread slot */
+	ulint		rseg_history_len)	/*!< in: history list length
+						before last purge */
+{
+	ut_a(slot->type == SRV_PURGE);
+
+	rw_lock_x_lock(&purge_sys->latch);
+
+	purge_sys->running = false;
+
+	rw_lock_x_unlock(&purge_sys->latch);
+
+	bool		stop = false;
+
+	/** Maximum wait time on the purge event, in micro-seconds. */
+	static const ulint SRV_PURGE_MAX_TIMEOUT = 10000;
+
+	do {
+		ulint		ret;
+		ib_int64_t	sig_count = srv_suspend_thread(slot);
+
+		/* We don't wait right away on the the non-timed wait because
+		we want to signal the thread that wants to suspend purge. */
+
+		if (stop) {
+			os_event_wait_low(slot->event, sig_count);
+			ret = 0;
+		} else if (rseg_history_len <= trx_sys->rseg_history_len) {
+			ret = os_event_wait_time_low(
+				slot->event, SRV_PURGE_MAX_TIMEOUT, sig_count);
+		} else {
+			/* We don't want to waste time waiting if the
+			history list has increased by the time we get here
+			unless purge has been stopped. */
+			ret = 0;
+		}
+
+		srv_sys_mutex_enter();
+
+		/* The thread can be in state !suspended after the timeout
+		but before this check if another thread sent a wakeup signal. */
+
+		if (slot->suspended) {
+			slot->suspended = FALSE;
+			++srv_sys->n_threads_active[slot->type];
+			ut_a(srv_sys->n_threads_active[slot->type] == 1);
+		}
+
+		srv_sys_mutex_exit();
+
+		rw_lock_x_lock(&purge_sys->latch);
+
+		stop = (purge_sys->state == PURGE_STATE_STOP);
+
+		if (!stop) {
+			ut_a(purge_sys->n_stop == 0);
+			purge_sys->running = true;
+		} else {
+			ut_a(purge_sys->n_stop > 0);
+
+			/* Signal that we are suspended. */
+			os_event_set(purge_sys->event);
+		}
+
+		rw_lock_x_unlock(&purge_sys->latch);
+
+		if (ret == OS_SYNC_TIME_EXCEEDED) {
+
+			/* No new records added since wait started then simply
+			wait for new records. The magic number 5000 is an
+			approximation for the case where we have cached UNDO
+			log records which prevent truncate of the UNDO
+			segments. */
+
+			if (rseg_history_len == trx_sys->rseg_history_len
+			    && trx_sys->rseg_history_len < 5000) {
+
+				stop = true;
+			}
+		}
+
+	} while (stop);
+
+	ut_a(!slot->suspended);
+}
+
+/*********************************************************************//**
+Purge coordinator thread that schedules the purge tasks.
+@return	a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_purge_coordinator_thread)(
+/*=========================================*/
+	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
+						required by os_thread_create */
+{
+	srv_slot_t*	slot;
+	ulint           n_total_purged = ULINT_UNDEFINED;
+
+	ut_a(srv_n_purge_threads >= 1);
+	ut_a(trx_purge_state() == PURGE_STATE_INIT);
+	ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+
+	rw_lock_x_lock(&purge_sys->latch);
+
+	purge_sys->running = true;
+	purge_sys->state = PURGE_STATE_RUN;
+
+	rw_lock_x_unlock(&purge_sys->latch);
+
+#ifdef UNIV_PFS_THREAD
+	pfs_register_thread(srv_purge_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: Purge coordinator thread created, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+	slot = srv_reserve_slot(SRV_PURGE);
+
+	ulint	rseg_history_len = trx_sys->rseg_history_len;
+
+	do {
+		/* If there are no records to purge or the last
+		purge didn't purge any records then wait for activity. */
+
+		if (purge_sys->state == PURGE_STATE_STOP
+		    || n_total_purged == 0) {
+
+			srv_purge_coordinator_suspend(slot, rseg_history_len);
+		}
+
+		if (srv_purge_should_exit(n_total_purged)) {
+			ut_a(!slot->suspended);
+			break;
+		}
+
+		n_total_purged = 0;
+
+		rseg_history_len = srv_do_purge(
+			srv_n_purge_threads, &n_total_purged);
+
+	} while (!srv_purge_should_exit(n_total_purged));
+
+	/* Ensure that we don't jump out of the loop unless the
+	exit condition is satisfied. */
+
+	ut_a(srv_purge_should_exit(n_total_purged));
+
+	ulint	n_pages_purged = ULINT_MAX;
+
+	/* Ensure that all records are purged if it is not a fast shutdown.
+	This covers the case where a record can be added after we exit the
+	loop above. */
+	while (srv_fast_shutdown == 0 && n_pages_purged > 0) {
+		n_pages_purged = trx_purge(1, srv_purge_batch_size, false);
+	}
+
+	/* Force a truncate of the history list. */
+	n_pages_purged = trx_purge(1, srv_purge_batch_size, true);
+	ut_a(n_pages_purged == 0 || srv_fast_shutdown != 0);
+
+	/* The task queue should always be empty, independent of fast
+	shutdown state. */
+	ut_a(srv_get_task_queue_length() == 0);
+
+	srv_free_slot(slot);
+
+	/* Note that we are shutting down. */
+	rw_lock_x_lock(&purge_sys->latch);
+
+	purge_sys->state = PURGE_STATE_EXIT;
+
+	purge_sys->running = false;
+
+	rw_lock_x_unlock(&purge_sys->latch);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: Purge coordinator exiting, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+	/* Ensure that all the worker threads quit. */
+	if (srv_n_purge_threads > 1) {
+		srv_release_threads(SRV_WORKER, srv_n_purge_threads - 1);
+	}
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
+}
+
+/**********************************************************************//**
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
+UNIV_INTERN
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	mutex_enter(&srv_sys->tasks_mutex);
+
+	UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr);
+
+	mutex_exit(&srv_sys->tasks_mutex);
+
+	srv_release_threads(SRV_WORKER, 1);
+}
+
+/**********************************************************************//**
+Get count of tasks in the queue.
+@return number of tasks in queue  */
+UNIV_INTERN
+ulint
+srv_get_task_queue_length(void)
+/*===========================*/
+{
+	ulint	n_tasks;
+
+	mutex_enter(&srv_sys->tasks_mutex);
+
+	n_tasks = UT_LIST_GET_LEN(srv_sys->tasks);
+
+	mutex_exit(&srv_sys->tasks_mutex);
+
+	return(n_tasks);
+}
+
+/**********************************************************************//**
+Wakeup the purge threads. */
+UNIV_INTERN
+void
+srv_purge_wakeup(void)
+/*==================*/
+{
+	if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
+
+		srv_release_threads(SRV_PURGE, 1);
+
+		if (srv_n_purge_threads > 1) {
+			ulint	n_workers = srv_n_purge_threads - 1;
+
+			srv_release_threads(SRV_WORKER, n_workers);
+		}
+	}
+}
+
diff --git a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.cc
index 86669a50895..9d1600cff23 100644
--- a/storage/innobase/srv/srv0start.c
+++ b/storage/innobase/srv/srv0start.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All rights reserved.
 Copyright (c) 2008, Google Inc.
 Copyright (c) 2009, Percona Inc.
 
@@ -26,13 +26,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file srv/srv0start.c
+@file srv/srv0start.cc
 Starts the InnoDB database server
 
 Created 2/16/1996 Heikki Tuuri
@@ -44,6 +44,7 @@ Created 2/16/1996 Heikki Tuuri
 #include "data0type.h"
 #include "dict0dict.h"
 #include "buf0buf.h"
+#include "buf0dump.h"
 #include "os0file.h"
 #include "os0thread.h"
 #include "fil0fil.h"
@@ -63,6 +64,7 @@ Created 2/16/1996 Heikki Tuuri
 #include "srv0start.h"
 #include "srv0srv.h"
 #ifndef UNIV_HOTBACKUP
+# include "trx0rseg.h"
 # include "os0proc.h"
 # include "sync0sync.h"
 # include "buf0flu.h"
@@ -87,11 +89,12 @@ Created 2/16/1996 Heikki Tuuri
 # include "btr0pcur.h"
 # include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */
 # include "zlib.h" /* for ZLIB_VERSION */
+# include "buf0dblwr.h"
 
 /** Log sequence number immediately after startup */
-UNIV_INTERN ib_uint64_t	srv_start_lsn;
+UNIV_INTERN lsn_t	srv_start_lsn;
 /** Log sequence number at shutdown */
-UNIV_INTERN ib_uint64_t	srv_shutdown_lsn;
+UNIV_INTERN lsn_t	srv_shutdown_lsn;
 
 #ifdef HAVE_DARWIN_THREADS
 # include <sys/utsname.h>
@@ -110,7 +113,7 @@ UNIV_INTERN ibool	srv_is_being_started = FALSE;
 /** TRUE if the server was successfully started */
 UNIV_INTERN ibool	srv_was_started = FALSE;
 /** TRUE if innobase_start_or_create_for_mysql() has been called */
-static ibool	srv_start_has_been_called = FALSE;
+static ibool		srv_start_has_been_called = FALSE;
 
 /** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
 SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
@@ -121,8 +124,8 @@ static os_file_t	files[1000];
 
 /** io_handler_thread parameters for thread identification */
 static ulint		n[SRV_MAX_N_IO_THREADS + 6];
-/** io_handler_thread identifiers */
-static os_thread_id_t	thread_ids[SRV_MAX_N_IO_THREADS + 6];
+/** io_handler_thread identifiers, 32 is the maximum number of purge threads  */
+static os_thread_id_t	thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32];
 
 /** We use this mutex to test the return value of pthread_mutex_trylock
    on successful locking. HP-UX does NOT return 0, though Linux et al do. */
@@ -132,6 +135,10 @@ static os_fast_mutex_t	srv_os_test_mutex;
 static char*	srv_monitor_file_name;
 #endif /* !UNIV_HOTBACKUP */
 
+/** Default undo tablespace size in UNIV_PAGEs count (10MB). */
+static const ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES =
+	((1024 * 1024) * 10) / UNIV_PAGE_SIZE_DEF;
+
 /** */
 #define SRV_N_PENDING_IOS_PER_THREAD	OS_AIO_N_PENDING_IOS_PER_THREAD
 #define SRV_MAX_N_PENDING_SYNC_IOS	100
@@ -276,10 +283,14 @@ srv_parse_data_file_paths_and_sizes(
 		return(FALSE);
 	}
 
-	srv_data_file_names = malloc(i * sizeof *srv_data_file_names);
-	srv_data_file_sizes = malloc(i * sizeof *srv_data_file_sizes);
-	srv_data_file_is_raw_partition = malloc(
-		i * sizeof *srv_data_file_is_raw_partition);
+	srv_data_file_names = static_cast<char**>(
+		malloc(i * sizeof *srv_data_file_names));
+
+	srv_data_file_sizes = static_cast<ulint*>(
+		malloc(i * sizeof *srv_data_file_sizes));
+
+	srv_data_file_is_raw_partition = static_cast<ulint*>(
+		malloc(i * sizeof *srv_data_file_is_raw_partition));
 
 	srv_n_data_files = i;
 
@@ -409,7 +420,8 @@ srv_parse_log_group_home_dirs(
 		return(FALSE);
 	}
 
-	srv_log_group_home_dirs = malloc(i * sizeof *srv_log_group_home_dirs);
+	srv_log_group_home_dirs = static_cast<char**>(
+		malloc(i * sizeof *srv_log_group_home_dirs));
 
 	/* Then store the actual values to our array */
 
@@ -458,16 +470,16 @@ srv_free_paths_and_sizes(void)
 /********************************************************************//**
 I/o-handler thread function.
 @return	OS_THREAD_DUMMY_RETURN */
-static
+extern "C" UNIV_INTERN
 os_thread_ret_t
-io_handler_thread(
-/*==============*/
+DECLARE_THREAD(io_handler_thread)(
+/*==============================*/
 	void*	arg)	/*!< in: pointer to the number of the segment in
 			the aio array */
 {
 	ulint	segment;
 
-	segment = *((ulint*)arg);
+	segment = *((ulint*) arg);
 
 #ifdef UNIV_DEBUG_THREAD_CREATION
 	fprintf(stderr, "Io handler thread %lu starts, id %lu\n", segment,
@@ -493,12 +505,6 @@ io_handler_thread(
 }
 #endif /* !UNIV_HOTBACKUP */
 
-#ifdef __WIN__
-#define SRV_PATH_SEPARATOR	'\\'
-#else
-#define SRV_PATH_SEPARATOR	'/'
-#endif
-
 /*********************************************************************//**
 Normalizes a directory path for Windows: converts slashes to backslashes. */
 UNIV_INTERN
@@ -520,32 +526,6 @@ srv_normalize_path_for_win(
 
 #ifndef UNIV_HOTBACKUP
 /*********************************************************************//**
-Calculates the low 32 bits when a file size which is given as a number
-database pages is converted to the number of bytes.
-@return	low 32 bytes of file size when expressed in bytes */
-static
-ulint
-srv_calc_low32(
-/*===========*/
-	ulint	file_size)	/*!< in: file size in database pages */
-{
-	return(0xFFFFFFFFUL & (file_size << UNIV_PAGE_SIZE_SHIFT));
-}
-
-/*********************************************************************//**
-Calculates the high 32 bits when a file size which is given as a number
-database pages is converted to the number of bytes.
-@return	high 32 bytes of file size when expressed in bytes */
-static
-ulint
-srv_calc_high32(
-/*============*/
-	ulint	file_size)	/*!< in: file size in database pages */
-{
-	return(file_size >> (32 - UNIV_PAGE_SIZE_SHIFT));
-}
-
-/*********************************************************************//**
 Creates or opens the log files and closes them.
 @return	DB_SUCCESS or error code */
 static
@@ -562,11 +542,10 @@ open_or_create_log_file(
 	ulint	k,			/*!< in: log group number */
 	ulint	i)			/*!< in: log file number in group */
 {
-	ibool	ret;
-	ulint	size;
-	ulint	size_high;
-	char	name[10000];
-	ulint	dirnamelen;
+	ibool		ret;
+	os_offset_t	size;
+	char		name[10000];
+	ulint		dirnamelen;
 
 	UT_NOT_USED(create_new_db);
 
@@ -614,20 +593,20 @@ open_or_create_log_file(
 			return(DB_ERROR);
 		}
 
-		ret = os_file_get_size(files[i], &size, &size_high);
-		ut_a(ret);
+		size = os_file_get_size(files[i]);
+		ut_a(size != (os_offset_t) -1);
 
-		if (size != srv_calc_low32(srv_log_file_size)
-		    || size_high != srv_calc_high32(srv_log_file_size)) {
+		if (UNIV_UNLIKELY(size != (os_offset_t) srv_log_file_size
+				  << UNIV_PAGE_SIZE_SHIFT)) {
 
 			fprintf(stderr,
 				"InnoDB: Error: log file %s is"
-				" of different size %lu %lu bytes\n"
+				" of different size "UINT64PF" bytes\n"
 				"InnoDB: than specified in the .cnf"
-				" file %lu %lu bytes!\n",
-				name, (ulong) size_high, (ulong) size,
-				(ulong) srv_calc_high32(srv_log_file_size),
-				(ulong) srv_calc_low32(srv_log_file_size));
+				" file "UINT64PF" bytes!\n",
+				name, size,
+				(os_offset_t) srv_log_file_size
+				<< UNIV_PAGE_SIZE_SHIFT);
 
 			return(DB_ERROR);
 		}
@@ -637,7 +616,7 @@ open_or_create_log_file(
 		ut_print_timestamp(stderr);
 
 		fprintf(stderr,
-			"  InnoDB: Log file %s did not exist:"
+			" InnoDB: Log file %s did not exist:"
 			" new to be created\n",
 			name);
 		if (log_file_has_been_opened) {
@@ -654,8 +633,8 @@ open_or_create_log_file(
 			" full: wait...\n");
 
 		ret = os_file_set_size(name, files[i],
-				       srv_calc_low32(srv_log_file_size),
-				       srv_calc_high32(srv_log_file_size));
+				       (os_offset_t) srv_log_file_size
+				       << UNIV_PAGE_SIZE_SHIFT);
 		if (!ret) {
 			fprintf(stderr,
 				"InnoDB: Error in creating %s:"
@@ -674,12 +653,18 @@ open_or_create_log_file(
 		which is for this log group */
 
 		fil_space_create(name,
-				 2 * k + SRV_LOG_SPACE_FIRST_ID, 0, FIL_LOG);
+				 2 * k + SRV_LOG_SPACE_FIRST_ID,
+				 fsp_flags_set_page_size(0, UNIV_PAGE_SIZE),
+				 FIL_LOG);
 	}
 
 	ut_a(fil_validate());
 
-	fil_node_create(name, srv_log_file_size,
+	/* srv_log_file_size is measured in pages; if page size is 16KB,
+	then we have a limit of 64TB on 32 bit systems */
+	ut_a(srv_log_file_size <= ULINT_MAX);
+
+	fil_node_create(name, (ulint) srv_log_file_size,
 			2 * k + SRV_LOG_SPACE_FIRST_ID, FALSE);
 #ifdef UNIV_LOG_ARCHIVE
 	/* If this is the first log group, create the file space object
@@ -720,22 +705,21 @@ open_or_create_data_files(
 	ulint*		max_arch_log_no,/*!< out: max of archived log
 					numbers in data files */
 #endif /* UNIV_LOG_ARCHIVE */
-	ib_uint64_t*	min_flushed_lsn,/*!< out: min of flushed lsn
+	lsn_t*		min_flushed_lsn,/*!< out: min of flushed lsn
 					values in data files */
-	ib_uint64_t*	max_flushed_lsn,/*!< out: max of flushed lsn
+	lsn_t*		max_flushed_lsn,/*!< out: max of flushed lsn
 					values in data files */
 	ulint*		sum_of_new_sizes)/*!< out: sum of sizes of the
 					new files added */
 {
-	ibool	ret;
-	ulint	i;
-	ibool	one_opened	= FALSE;
-	ibool	one_created	= FALSE;
-	ulint	size;
-	ulint	size_high;
-	ulint	flags;
-	ulint	rounded_size_pages;
-	char	name[10000];
+	ibool		ret;
+	ulint		i;
+	ibool		one_opened	= FALSE;
+	ibool		one_created	= FALSE;
+	os_offset_t	size;
+	ulint		flags;
+	ulint		rounded_size_pages;
+	char		name[10000];
 
 	if (srv_n_data_files >= 1000) {
 		fprintf(stderr, "InnoDB: can only have < 1000 data files\n"
@@ -861,13 +845,12 @@ open_or_create_data_files(
 				goto skip_size_check;
 			}
 
-			ret = os_file_get_size(files[i], &size, &size_high);
-			ut_a(ret);
+			size = os_file_get_size(files[i]);
+			ut_a(size != (os_offset_t) -1);
 			/* Round size downward to megabytes */
 
-			rounded_size_pages
-				= (size / (1024 * 1024) + 4096 * size_high)
-					<< (20 - UNIV_PAGE_SIZE_SHIFT);
+			rounded_size_pages = (ulint)
+				(size >> UNIV_PAGE_SIZE_SHIFT);
 
 			if (i == srv_n_data_files - 1
 			    && srv_auto_extend_last_data_file) {
@@ -934,9 +917,9 @@ skip_size_check:
 					fsp_flags_get_page_size(flags));
 				ut_print_timestamp(stderr);
 				fprintf(stderr,
-					" InnoDB: but the only supported"
-					" page size in this release is=%lu\n",
-					(ulong) UNIV_PAGE_SIZE);
+					" InnoDB: but the start-up parameter"
+					" is innodb-page-size=%lu\n",
+					UNIV_PAGE_SIZE);
 
 				return(DB_ERROR);
 			}
@@ -951,7 +934,7 @@ skip_size_check:
 			if (i > 0) {
 				ut_print_timestamp(stderr);
 				fprintf(stderr,
-					"  InnoDB: Data file %s did not"
+					" InnoDB: Data file %s did not"
 					" exist: new to be created\n",
 					name);
 			} else {
@@ -965,7 +948,7 @@ skip_size_check:
 
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
-				"  InnoDB: Setting file %s size to %lu MB\n",
+				" InnoDB: Setting file %s size to %lu MB\n",
 				name,
 				(ulong) (srv_data_file_sizes[i]
 					 >> (20 - UNIV_PAGE_SIZE_SHIFT)));
@@ -976,8 +959,8 @@ skip_size_check:
 
 			ret = os_file_set_size(
 				name, files[i],
-				srv_calc_low32(srv_data_file_sizes[i]),
-				srv_calc_high32(srv_data_file_sizes[i]));
+				(os_offset_t) srv_data_file_sizes[i]
+				<< UNIV_PAGE_SIZE_SHIFT);
 
 			if (!ret) {
 				fprintf(stderr,
@@ -987,15 +970,15 @@ skip_size_check:
 				return(DB_ERROR);
 			}
 
-			*sum_of_new_sizes = *sum_of_new_sizes
-				+ srv_data_file_sizes[i];
+			*sum_of_new_sizes += srv_data_file_sizes[i];
 		}
 
 		ret = os_file_close(files[i]);
 		ut_a(ret);
 
 		if (i == 0) {
-			fil_space_create(name, 0, 0, FIL_TABLESPACE);
+			flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE);
+			fil_space_create(name, 0, flags, FIL_TABLESPACE);
 		}
 
 		ut_a(fil_validate());
@@ -1007,6 +990,318 @@ skip_size_check:
 	return(DB_SUCCESS);
 }
 
+/*********************************************************************//**
+Create undo tablespace.
+@return	DB_SUCCESS or error code */
+static
+enum db_err
+srv_undo_tablespace_create(
+/*=======================*/
+	const char*	name,		/*!< in: tablespace name */
+	ulint		size)		/*!< in: tablespace size in pages */
+{
+	os_file_t	fh;
+	ibool		ret;
+	enum db_err	err = DB_SUCCESS;
+
+	os_file_create_subdirs_if_needed(name);
+
+	fh = os_file_create(
+		innodb_file_data_key, name, OS_FILE_CREATE,
+		OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+
+	if (ret == FALSE
+	    && os_file_get_last_error(FALSE) != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+	    /* AIX 5.1 after security patch ML7 may have
+	    errno set to 0 here, which causes our function
+	    to return 100; work around that AIX problem */
+	    && os_file_get_last_error(FALSE) != 100
+#endif
+		) {
+
+		fprintf(stderr, "InnoDB: Error in creating %s\n", name);
+
+		err = DB_ERROR;
+	} else {
+		/* We created the data file and now write it full of zeros */
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Data file %s did not"
+				" exist: new to be created\n", name);
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Setting file %s size to %lu MB\n",
+				name, size >> (20 - UNIV_PAGE_SIZE_SHIFT));
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Database physically writes the"
+				" file full: wait...\n");
+
+		ret = os_file_set_size(name, fh, size << UNIV_PAGE_SIZE_SHIFT);
+
+		if (!ret) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: Error in creating %s:"
+					" probably out of disk space\n", name);
+
+			err = DB_ERROR;
+		}
+
+		os_file_close(fh);
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Open an undo tablespace.
+@return	DB_SUCCESS or error code */
+static
+enum db_err
+srv_undo_tablespace_open(
+/*=====================*/
+	const char*	name,		/*!< in: tablespace name */
+	ulint		space)		/*!< in: tablespace id */
+{
+	os_file_t	fh;
+	enum db_err	err;
+	ibool		ret;
+	ulint		flags;
+
+	fh = os_file_create(
+		innodb_file_data_key, name,
+		OS_FILE_OPEN_RETRY
+		| OS_FILE_ON_ERROR_NO_EXIT
+		| OS_FILE_ON_ERROR_SILENT,
+		OS_FILE_NORMAL,
+		OS_DATA_FILE,
+		&ret);
+
+	/* If the file open was successful then load the tablespace. */
+
+	if (ret) {
+		os_offset_t	size;
+		os_offset_t	n_pages;
+
+		size = os_file_get_size(fh);
+		ut_a(size != (os_offset_t) -1);
+
+		ret = os_file_close(fh);
+		ut_a(ret);
+
+		/* Load the tablespace into InnoDB's internal
+		data structures. */
+
+		/* We set the biggest space id to the undo tablespace
+		because InnoDB hasn't opened any other tablespace apart
+		from the system tablespace. */
+
+		fil_set_max_space_id_if_bigger(space);
+
+		/* Set the compressed page size to 0 (non-compressed) */
+		flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE);
+		fil_space_create(name, space, flags, FIL_TABLESPACE);
+
+		ut_a(fil_validate());
+
+		n_pages = size / UNIV_PAGE_SIZE;
+
+		/* On 64 bit Windows ulint can be 32 bit and os_offset_t
+		is 64 bit. It is OK to cast the n_pages to ulint because
+		the unit has been scaled to pages and they are always
+		32 bit. */
+		fil_node_create(name, (ulint) n_pages, space, FALSE);
+
+		err = DB_SUCCESS;
+	} else {
+		err = DB_ERROR;
+	}
+
+	return(err);
+}
+
+/********************************************************************
+Opens the configured number of undo tablespaces.
+@return	DB_SUCCESS or error code */
+static
+enum db_err
+srv_undo_tablespaces_init(
+/*======================*/
+	ibool		create_new_db,		/*!< in: TRUE if new db being
+						created */
+	const ulint	n_conf_tablespaces)	/*!< in: configured undo
+						tablespaces */
+{
+	ulint		i;
+	enum db_err	err = DB_SUCCESS;
+	ulint		prev_space_id = 0;
+	ulint		n_undo_tablespaces;
+	ulint		undo_tablespace_ids[TRX_SYS_N_RSEGS + 1];
+
+	ut_a(n_conf_tablespaces <= TRX_SYS_N_RSEGS);
+
+	memset(undo_tablespace_ids, 0x0, sizeof(undo_tablespace_ids));
+
+	/* Create the undo spaces only if we are creating a new
+	instance. We don't allow creating of new undo tablespaces
+	in an existing instance (yet).  This restriction exists because
+	we check in several places for SYSTEM tablespaces to be less than
+	the min of user defined tablespace ids. Once we implement saving
+	the location of the undo tablespaces and their space ids this
+	restriction will/should be lifted. */
+
+	for (i = 0; create_new_db && i < n_conf_tablespaces; ++i) {
+		char	name[OS_FILE_MAX_PATH];
+
+		ut_snprintf(
+			name, sizeof(name),
+			"%s%cundo%03lu",
+			srv_undo_dir, SRV_PATH_SEPARATOR, i + 1);
+
+		/* Undo space ids start from 1. */
+		err = srv_undo_tablespace_create(
+			name, SRV_UNDO_TABLESPACE_SIZE_IN_PAGES);
+
+		if (err != DB_SUCCESS) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Could not create "
+				"undo tablespace '%s'.\n", name);
+
+			return(err);
+		}
+	}
+
+	/* Get the tablespace ids of all the undo segments excluding
+	the system tablespace (0). If we are creating a new instance then
+	we build the undo_tablespace_ids ourselves since they don't
+	already exist. */
+
+	if (!create_new_db) {
+		n_undo_tablespaces = trx_rseg_get_n_undo_tablespaces(
+			undo_tablespace_ids);
+	} else {
+		n_undo_tablespaces = n_conf_tablespaces;
+
+		for (i = 1; i <= n_undo_tablespaces; ++i) {
+			undo_tablespace_ids[i - 1] = i;
+		}
+
+		undo_tablespace_ids[i] = ULINT_UNDEFINED;
+	}
+
+	/* Open all the undo tablespaces that are currently in use. If we
+	fail to open any of these it is a fatal error. The tablespace ids
+	should be contiguous. It is a fatal error because they are required
+	for recovery and are referenced by the UNDO logs (a.k.a RBS). */
+
+	for (i = 0; i < n_undo_tablespaces; ++i) {
+		char	name[OS_FILE_MAX_PATH];
+
+		ut_snprintf(
+			name, sizeof(name),
+			"%s%cundo%03lu",
+			srv_undo_dir, SRV_PATH_SEPARATOR,
+			undo_tablespace_ids[i]);
+
+		/* Should be no gaps in undo tablespace ids. */
+		ut_a(prev_space_id + 1 == undo_tablespace_ids[i]);
+
+		/* The system space id should not be in this array. */
+		ut_a(undo_tablespace_ids[i] != 0);
+		ut_a(undo_tablespace_ids[i] != ULINT_UNDEFINED);
+
+		/* Undo space ids start from 1. */
+
+		err = srv_undo_tablespace_open(name, undo_tablespace_ids[i]);
+
+		if (err != DB_SUCCESS) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Error opening undo "
+				"tablespace %s.\n", name);
+
+			return(err);
+		}
+
+		prev_space_id = undo_tablespace_ids[i];
+	}
+
+	/* Open any extra unused undo tablespaces. These must be contiguous.
+	We stop at the first failure. These are undo tablespaces that are
+	not in use and therefore not required by recovery. We only check
+	that there are no gaps. */
+
+	for (i = prev_space_id + 1; i < TRX_SYS_N_RSEGS; ++i) {
+		char	name[OS_FILE_MAX_PATH];
+
+		ut_snprintf(
+			name, sizeof(name),
+			"%s%cundo%03lu", srv_undo_dir, SRV_PATH_SEPARATOR, i);
+
+		/* Undo space ids start from 1. */
+		err = srv_undo_tablespace_open(name, i);
+
+		if (err != DB_SUCCESS) {
+			break;
+		}
+
+		++n_undo_tablespaces;
+	}
+
+	/* If the user says that there are fewer than what we find we
+	tolerate that discrepancy but not the inverse. Because there could
+	be unused undo tablespaces for future use. */
+
+	if (n_conf_tablespaces > n_undo_tablespaces) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Expected to open %lu undo "
+			"tablespaces but was able\n",
+			n_conf_tablespaces);
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: to find only %lu undo "
+			"tablespaces.\n", n_undo_tablespaces);
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Set the "
+			"innodb_undo_tablespaces parameter to "
+			"the\n");
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: correct value and retry. Suggested "
+			"value is %lu\n", n_undo_tablespaces);
+
+		return(err != DB_SUCCESS ? err : DB_ERROR);
+	}
+
+	if (n_undo_tablespaces > 0) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Opened %lu undo tablespaces\n",
+			n_conf_tablespaces);
+	}
+
+	if (create_new_db) {
+		mtr_t	mtr;
+
+		mtr_start(&mtr);
+
+		/* The undo log tablespace */
+		for (i = 1; i <= n_undo_tablespaces; ++i) {
+
+			fsp_header_init(
+				i, SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr);
+		}
+
+		mtr_commit(&mtr);
+	}
+
+	return(DB_SUCCESS);
+}
+
 /********************************************************************
 Starts InnoDB and creates a new database if database files
 are not found and the user wants.
@@ -1020,8 +1315,8 @@ innobase_start_or_create_for_mysql(void)
 	ibool		log_file_created;
 	ibool		log_created	= FALSE;
 	ibool		log_opened	= FALSE;
-	ib_uint64_t	min_flushed_lsn;
-	ib_uint64_t	max_flushed_lsn;
+	lsn_t		min_flushed_lsn;
+	lsn_t		max_flushed_lsn;
 #ifdef UNIV_LOG_ARCHIVE
 	ulint		min_arch_log_no;
 	ulint		max_arch_log_no;
@@ -1032,9 +1327,9 @@ innobase_start_or_create_for_mysql(void)
 	ulint		err;
 	ulint		i;
 	ulint		io_limit;
-	my_bool		srv_file_per_table_original_value
-		= srv_file_per_table;
 	mtr_t		mtr;
+	ib_bh_t*	ib_bh;
+
 #ifdef HAVE_DARWIN_THREADS
 # ifdef F_FULLFSYNC
 	/* This executable has been compiled on Mac OS X 10.3 or later.
@@ -1077,11 +1372,6 @@ innobase_start_or_create_for_mysql(void)
 			"of memory.\n");
 	}
 
-	/* System tables are created in tablespace 0.  Thus, we must
-	temporarily clear srv_file_per_table.  This is ok, because the
-	server will not accept connections (which could modify
-	innodb_file_per_table) until this function has returned. */
-	srv_file_per_table = FALSE;
 #ifdef UNIV_DEBUG
 	ut_print_timestamp(stderr);
 	fprintf(stderr,
@@ -1138,6 +1428,11 @@ innobase_start_or_create_for_mysql(void)
 			" InnoDB: The InnoDB memory heap is disabled\n");
 	}
 
+#if defined(COMPILER_HINTS_ENABLED)
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: Compiler hints enabled.\n");
+#endif /* defined(COMPILER_HINTS_ENABLED) */
+
 	ut_print_timestamp(stderr);
 	fputs(" InnoDB: " IB_ATOMICS_STARTUP_MSG "\n", stderr);
 
@@ -1301,9 +1596,12 @@ innobase_start_or_create_for_mysql(void)
 		     &srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK);
 
 	if (srv_innodb_status) {
-		srv_monitor_file_name = mem_alloc(
-			strlen(fil_path_to_mysql_datadir)
-			+ 20 + sizeof "/innodb_status.");
+
+		srv_monitor_file_name = static_cast<char*>(
+			mem_alloc(
+				strlen(fil_path_to_mysql_datadir)
+				+ 20 + sizeof "/innodb_status."));
+
 		sprintf(srv_monitor_file_name, "%s/innodb_status.%lu",
 			fil_path_to_mysql_datadir, os_proc_get_number());
 		srv_monitor_file = fopen(srv_monitor_file_name, "w+");
@@ -1364,21 +1662,12 @@ innobase_start_or_create_for_mysql(void)
 	}
 # endif /* __WIN__ */
 
-	if (!os_aio_init(io_limit,
-			 srv_n_read_io_threads,
-			 srv_n_write_io_threads,
-			 SRV_MAX_N_PENDING_SYNC_IOS)) {
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Fatal error: cannot initialize AIO"
-			" sub-system\n");
-
-		return(DB_ERROR);
-	}
+	os_aio_init(io_limit,
+		    srv_n_read_io_threads,
+		    srv_n_write_io_threads,
+		    SRV_MAX_N_PENDING_SYNC_IOS);
 
-	fil_init(srv_file_per_table ? 50000 : 5000,
-		 srv_max_n_open_files);
+	fil_init(srv_file_per_table ? 50000 : 5000, srv_max_n_open_files);
 
 	/* Print time to initialize the buffer pool */
 	ut_print_timestamp(stderr);
@@ -1448,11 +1737,33 @@ innobase_start_or_create_for_mysql(void)
 	}
 #endif /* UNIV_LOG_ARCHIVE */
 
-	if (srv_n_log_files * srv_log_file_size >= 262144) {
+	if (srv_n_log_files * srv_log_file_size * UNIV_PAGE_SIZE
+	    >= 549755813888ULL /* 512G */) {
+		/* log_block_convert_lsn_to_no() limits the returned block
+		number to 1G and given that OS_FILE_LOG_BLOCK_SIZE is 512
+		bytes, then we have a limit of 512 GB. If that limit is to
+		be raised, then log_block_convert_lsn_to_no() must be
+		modified. */
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
 			" InnoDB: Error: combined size of log files"
-			" must be < 4 GB\n");
+			" must be < 512 GB\n");
+
+		return(DB_ERROR);
+	}
+
+	if (srv_n_log_files * srv_log_file_size >= ULINT_MAX) {
+		/* fil_io() takes ulint as an argument and we are passing
+		(next_offset / UNIV_PAGE_SIZE) to it in log_group_write_buf().
+		So (next_offset / UNIV_PAGE_SIZE) must be less than ULINT_MAX.
+		So next_offset must be < ULINT_MAX * UNIV_PAGE_SIZE. This
+		means that we are limited to ULINT_MAX * UNIV_PAGE_SIZE which
+		is 64 TB on 32 bit systems. */
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: combined size of log files"
+			" must be < %lu GB\n",
+			ULINT_MAX / 1073741824 * UNIV_PAGE_SIZE);
 
 		return(DB_ERROR);
 	}
@@ -1461,7 +1772,9 @@ innobase_start_or_create_for_mysql(void)
 
 	for (i = 0; i < srv_n_data_files; i++) {
 #ifndef __WIN__
-		if (sizeof(off_t) < 5 && srv_data_file_sizes[i] >= 262144) {
+		if (sizeof(off_t) < 5
+		    && srv_data_file_sizes[i]
+		    >= (ulint) (1 << (32 - UNIV_PAGE_SIZE_SHIFT))) {
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
 				" InnoDB: Error: file size must be < 4 GB"
@@ -1577,6 +1890,17 @@ innobase_start_or_create_for_mysql(void)
 
 	fil_open_log_and_system_tablespace_files();
 
+	err = srv_undo_tablespaces_init(create_new_db, srv_undo_tablespaces);
+
+	/* If the force recovery is set very high then we carry on regardless
+	of all errors. Basically this is fingers crossed mode. */
+
+	if (err != DB_SUCCESS
+	    && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
+
+		return((int) err);
+	}
+
 	if (log_created && !create_new_db
 #ifdef UNIV_LOG_ARCHIVE
 	    && !srv_archive_recovery
@@ -1602,7 +1926,7 @@ innobase_start_or_create_for_mysql(void)
 			return(DB_ERROR);
 		}
 
-		if (max_flushed_lsn < (ib_uint64_t) 1000) {
+		if (max_flushed_lsn < (lsn_t) 1000) {
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
 				" InnoDB: Cannot initialize created"
@@ -1641,6 +1965,8 @@ innobase_start_or_create_for_mysql(void)
 
 	trx_sys_file_format_init();
 
+	trx_sys_create();
+
 	if (create_new_db) {
 		mtr_start(&mtr);
 
@@ -1652,7 +1978,14 @@ innobase_start_or_create_for_mysql(void)
 		the first rollback segment before the double write buffer.
 		All the remaining rollback segments will be created later,
 		after the double write buffer has been created. */
-		trx_sys_create();
+		trx_sys_create_sys_pages();
+
+		ib_bh = trx_sys_init_at_db_start();
+
+		/* The purge system needs to create the purge view and
+		therefore requires that the trx_sys is inited. */
+
+		trx_purge_sys_create(srv_n_purge_threads, ib_bh);
 
 		dict_create();
 
@@ -1676,20 +2009,21 @@ innobase_start_or_create_for_mysql(void)
 
 		dict_boot();
 
-		trx_sys_init_at_db_start();
+		ib_bh = trx_sys_init_at_db_start();
 
-		srv_startup_is_before_trx_rollback_phase = FALSE;
+		/* The purge system needs to create the purge view and
+		therefore requires that the trx_sys is inited. */
 
-		/* Initialize the fsp free limit global variable in the log
-		system */
-		fsp_header_get_free_limit();
+		trx_purge_sys_create(srv_n_purge_threads, ib_bh);
+
+		srv_startup_is_before_trx_rollback_phase = FALSE;
 
 		recv_recovery_from_archive_finish();
 #endif /* UNIV_LOG_ARCHIVE */
 	} else {
 
 		/* Check if we support the max format that is stamped
-		on the system tablespace. 
+		on the system tablespace.
 		Note:  We are NOT allowed to make any modifications to
 		the TRX_SYS_PAGE_NO page before recovery  because this
 		page also contains the max_trx_id etc. important system
@@ -1733,11 +2067,13 @@ innobase_start_or_create_for_mysql(void)
 		works for space 0. */
 
 		dict_boot();
-		trx_sys_init_at_db_start();
 
-		/* Initialize the fsp free limit global variable in the log
-		system */
-		fsp_header_get_free_limit();
+		ib_bh = trx_sys_init_at_db_start();
+
+		/* The purge system needs to create the purge view and
+		therefore requires that the trx_sys is inited. */
+
+		trx_purge_sys_create(srv_n_purge_threads, ib_bh);
 
 		/* recv_recovery_from_checkpoint_finish needs trx lists which
 		are initialized in trx_sys_init_at_db_start(). */
@@ -1814,10 +2150,10 @@ innobase_start_or_create_for_mysql(void)
 	/* fprintf(stderr, "Max allowed record size %lu\n",
 	page_get_free_space_of_empty() / 2); */
 
-	if (trx_doublewrite == NULL) {
+	if (buf_dblwr == NULL) {
 		/* Create the doublewrite buffer to a new tablespace */
 
-		trx_sys_create_doublewrite_buf();
+		buf_dblwr_create();
 	}
 
 	/* Here the double write buffer has already been created and so
@@ -1828,63 +2164,104 @@ innobase_start_or_create_for_mysql(void)
 
 	/* Note: When creating the extra rollback segments during an upgrade
 	we violate the latching order, even if the change buffer is empty.
-	We make an exception in sync0sync.c and check srv_is_being_started
+	We make an exception in sync0sync.cc and check srv_is_being_started
 	for that violation. It cannot create a deadlock because we are still
 	running in single threaded mode essentially. Only the IO threads
 	should be running at this stage. */
 
-	trx_sys_create_rsegs(TRX_SYS_N_RSEGS - 1);
+	ut_a(srv_undo_logs > 0);
+	ut_a(srv_undo_logs <= TRX_SYS_N_RSEGS);
+
+	/* The number of rsegs that exist in InnoDB is given by status
+	variable srv_available_undo_logs. The number of rsegs to use can
+	be set using the dynamic global variable srv_undo_logs. */
+
+	srv_available_undo_logs = trx_sys_create_rsegs(
+		srv_undo_tablespaces, srv_undo_logs);
+
+	if (srv_available_undo_logs == ULINT_UNDEFINED) {
+		/* Can only happen if force recovery is set. */
+		ut_a(srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
+		srv_undo_logs = ULONG_UNDEFINED;
+	}
 
 	/* Create the thread which watches the timeouts for lock waits */
-	os_thread_create(&srv_lock_timeout_thread, NULL,
-			 thread_ids + 2 + SRV_MAX_N_IO_THREADS);
+	os_thread_create(
+		lock_wait_timeout_thread,
+		NULL, thread_ids + 2 + SRV_MAX_N_IO_THREADS);
 
 	/* Create the thread which warns of long semaphore waits */
-	os_thread_create(&srv_error_monitor_thread, NULL,
-			 thread_ids + 3 + SRV_MAX_N_IO_THREADS);
+	os_thread_create(
+		srv_error_monitor_thread,
+		NULL, thread_ids + 3 + SRV_MAX_N_IO_THREADS);
 
 	/* Create the thread which prints InnoDB monitor info */
-	os_thread_create(&srv_monitor_thread, NULL,
-			 thread_ids + 4 + SRV_MAX_N_IO_THREADS);
+	os_thread_create(
+		srv_monitor_thread,
+		NULL, thread_ids + 4 + SRV_MAX_N_IO_THREADS);
 
 	srv_is_being_started = FALSE;
 
+	/* Create the SYS_FOREIGN and SYS_FOREIGN_COLS system tables */
 	err = dict_create_or_check_foreign_constraint_tables();
-
 	if (err != DB_SUCCESS) {
 		return((int)DB_ERROR);
 	}
 
+	srv_is_being_started = FALSE;
+
+	ut_a(trx_purge_state() == PURGE_STATE_INIT);
+
 	/* Create the master thread which does purge and other utility
 	operations */
 
-	os_thread_create(&srv_master_thread, NULL, thread_ids
-			 + (1 + SRV_MAX_N_IO_THREADS));
+	os_thread_create(
+		srv_master_thread,
+		NULL, thread_ids + (1 + SRV_MAX_N_IO_THREADS));
+
+	if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
+
+		os_thread_create(
+			srv_purge_coordinator_thread,
+			NULL, thread_ids + 5 + SRV_MAX_N_IO_THREADS);
 
-	/* Currently we allow only a single purge thread. */
-	ut_a(srv_n_purge_threads == 0 || srv_n_purge_threads == 1);
+		ut_a(UT_ARR_SIZE(thread_ids)
+		     > 5 + srv_n_purge_threads + SRV_MAX_N_IO_THREADS);
 
-	/* If the user has requested a separate purge thread then
-	start the purge thread. */
-	if (srv_n_purge_threads == 1) {
-		os_thread_create(&srv_purge_thread, NULL, NULL);
+		/* We've already created the purge coordinator thread above. */
+		for (i = 1; i < srv_n_purge_threads; ++i) {
+			os_thread_create(
+				srv_worker_thread, NULL,
+				thread_ids + 5 + i + SRV_MAX_N_IO_THREADS);
+		}
 	}
 
-	/* Wait for the purge and master thread to startup. */
+	os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL);
+
+	/* Wait for the purge coordinator and master thread to startup. */
+
+	purge_state_t	state = trx_purge_state();
+
+	while (srv_shutdown_state == SRV_SHUTDOWN_NONE
+	       && srv_force_recovery < SRV_FORCE_NO_BACKGROUND
+	       && state == PURGE_STATE_INIT) {
 
-	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
-		if (srv_thread_has_reserved_slot(SRV_MASTER) == ULINT_UNDEFINED
-		    || (srv_n_purge_threads == 1
-			&& srv_thread_has_reserved_slot(SRV_WORKER)
-			== ULINT_UNDEFINED)) {
+		switch (state = trx_purge_state()) {
+		case PURGE_STATE_RUN:
+		case PURGE_STATE_STOP:
+			break;
 
+		case PURGE_STATE_INIT:
 			ut_print_timestamp(stderr);
-			fprintf(stderr, "  InnoDB: "
+			fprintf(stderr, " InnoDB: "
 				"Waiting for the background threads to "
 				"start\n");
-			os_thread_sleep(1000000);
-		} else {
+
+			os_thread_sleep(50000);
 			break;
+
+		case PURGE_STATE_EXIT:
+			ut_error;
 		}
 	}
 
@@ -1984,7 +2361,7 @@ innobase_start_or_create_for_mysql(void)
 	}
 
 	/* Check that os_fast_mutexes work as expected */
-	os_fast_mutex_init(&srv_os_test_mutex);
+	os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &srv_os_test_mutex);
 
 	if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) {
 		ut_print_timestamp(stderr);
@@ -2009,7 +2386,7 @@ innobase_start_or_create_for_mysql(void)
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
 			" InnoDB: %s started; "
-			"log sequence number %llu\n",
+			"log sequence number " LSN_PF "\n",
 			INNODB_VERSION_STR, srv_start_lsn);
 	}
 
@@ -2023,67 +2400,6 @@ innobase_start_or_create_for_mysql(void)
 
 	fflush(stderr);
 
-	if (trx_doublewrite_must_reset_space_ids) {
-		/* Actually, we did not change the undo log format between
-		4.0 and 4.1.1, and we would not need to run purge to
-		completion. Note also that the purge algorithm in 4.1.1
-		can process the history list again even after a full
-		purge, because our algorithm does not cut the end of the
-		history list in all cases so that it would become empty
-		after a full purge. That mean that we may purge 4.0 type
-		undo log even after this phase.
-
-		The insert buffer record format changed between 4.0 and
-		4.1.1. It is essential that the insert buffer is emptied
-		here! */
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: You are upgrading to an"
-			" InnoDB version which allows multiple\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: tablespaces. Wait that purge"
-			" and insert buffer merge run to\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: completion...\n");
-		for (;;) {
-			os_thread_sleep(1000000);
-
-			if (0 == strcmp(srv_main_thread_op_info,
-					"waiting for server activity")) {
-
-				ut_a(ibuf_is_empty());
-
-				break;
-			}
-		}
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: Full purge and insert buffer merge"
-			" completed.\n");
-
-		trx_sys_mark_upgraded_to_multiple_tablespaces();
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: You have now successfully upgraded"
-			" to the multiple tablespaces\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: format. You should NOT DOWNGRADE"
-			" to an earlier version of\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: InnoDB! But if you absolutely need to"
-			" downgrade, see\n");
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			" InnoDB: " REFMAN "multiple-tablespaces.html\n"
-			" InnoDB: for instructions.\n");
-	}
-
 	if (srv_force_recovery == 0) {
 		/* In the insert buffer we may have even bigger tablespace
 		id's, because we may have dropped those tablespaces, but
@@ -2093,13 +2409,48 @@ innobase_start_or_create_for_mysql(void)
 		ibuf_update_max_tablespace_id();
 	}
 
-	srv_file_per_table = srv_file_per_table_original_value;
+	/* Create the buffer pool dump/load thread */
+	os_thread_create(buf_dump_thread, NULL, NULL);
 
 	srv_was_started = TRUE;
 
+	/* Create the thread that will optimize the FTS sub-system
+	in a separate background thread. */
+	fts_optimize_init();
+
 	return((int) DB_SUCCESS);
 }
 
+#if 0
+/********************************************************************
+Sync all FTS cache before shutdown */
+static
+void
+srv_fts_close(void)
+/*===============*/
+{
+	dict_table_t*	table;
+
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	     table; table = UT_LIST_GET_NEXT(table_LRU, table)) {
+		fts_t*          fts = table->fts;
+
+		if (fts != NULL) {
+			fts_sync_table(table);
+		}
+	}
+
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+	     table; table = UT_LIST_GET_NEXT(table_LRU, table)) {
+		fts_t*          fts = table->fts;
+
+		if (fts != NULL) {
+			fts_sync_table(table);
+		}
+	}
+}
+#endif
+
 /****************************************************************//**
 Shuts down the InnoDB database.
 @return	DB_SUCCESS or error code */
@@ -2113,7 +2464,7 @@ innobase_shutdown_for_mysql(void)
 		if (srv_is_being_started) {
 			ut_print_timestamp(stderr);
 			fprintf(stderr,
-				"  InnoDB: Warning: shutting down"
+				" InnoDB: Warning: shutting down"
 				" a not properly started\n"
 				"InnoDB: or created database!\n");
 		}
@@ -2121,6 +2472,11 @@ innobase_shutdown_for_mysql(void)
 		return(DB_SUCCESS);
 	}
 
+	/* Shutdown the FTS optimize sub system. */
+	fts_optimize_start_shutdown();
+
+	fts_optimize_end();
+
 	/* 1. Flush the buffer pool to disk, write the current lsn to
 	the tablespace header(s), and copy all log data to archive.
 	The step 1 is the real InnoDB shutdown. The remaining steps 2 - ...
@@ -2128,14 +2484,19 @@ innobase_shutdown_for_mysql(void)
 
 	logs_empty_and_mark_files_at_shutdown();
 
-	if (srv_conc_n_threads != 0) {
+	if (srv_conc_get_active_threads() != 0) {
 		fprintf(stderr,
 			"InnoDB: Warning: query counter shows %ld queries"
 			" still\n"
 			"InnoDB: inside InnoDB at shutdown\n",
-			srv_conc_n_threads);
+			srv_conc_get_active_threads());
 	}
 
+	/* This functionality will be used by WL#5522. */
+	ut_a(trx_purge_state() == PURGE_STATE_RUN
+	     || trx_purge_state() == PURGE_STATE_EXIT
+	     || srv_force_recovery >= SRV_FORCE_NO_BACKGROUND);
+
 	/* 2. Make all threads created by InnoDB to exit */
 
 	srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS;
@@ -2149,7 +2510,7 @@ innobase_shutdown_for_mysql(void)
 		HERE OR EARLIER */
 
 		/* a. Let the lock timeout thread exit */
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(srv_timeout_event);
 
 		/* b. srv error monitor thread exits automatically, no need
 		to do anything here */
@@ -2157,8 +2518,8 @@ innobase_shutdown_for_mysql(void)
 		/* c. We wake the master thread so that it exits */
 		srv_wake_master_thread();
 
-		/* d. We wake the purge thread so that it exits */
-		srv_wake_purge_thread();
+		/* d. Wakeup purge threads. */
+		srv_purge_wakeup();
 
 		/* e. Exit the i/o threads */
 
@@ -2231,6 +2592,8 @@ innobase_shutdown_for_mysql(void)
 	/* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside
 	them */
 	os_aio_free();
+	que_close();
+	row_mysql_close();
 	sync_close();
 	srv_free();
 	fil_close();
@@ -2267,15 +2630,12 @@ innobase_shutdown_for_mysql(void)
 	if (dict_foreign_err_file) {
 		fclose(dict_foreign_err_file);
 	}
-	if (lock_latest_err_file) {
-		fclose(lock_latest_err_file);
-	}
 
 	if (srv_print_verbose_log) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
-			"  InnoDB: Shutdown completed;"
-			" log sequence number %llu\n",
+			" InnoDB: Shutdown completed;"
+			" log sequence number " LSN_PF "\n",
 			srv_shutdown_lsn);
 	}
 
@@ -2285,3 +2645,81 @@ innobase_shutdown_for_mysql(void)
 	return((int) DB_SUCCESS);
 }
 #endif /* !UNIV_HOTBACKUP */
+
+
+/********************************************************************
+Signal all per-table background threads to shutdown, and wait for them to do
+so. */
+
+void
+srv_shutdown_table_bg_threads(void)
+/*===============================*/
+{
+	dict_table_t*	table;
+	dict_table_t*	first;
+	dict_table_t*	last = NULL;
+
+	mutex_enter(&dict_sys->mutex);
+
+	/* Signal all threads that they should stop. */
+	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	first = table;
+	while (table) {
+		dict_table_t*	next;
+		fts_t*		fts = table->fts;
+
+		if (fts != NULL) {
+			fts_start_shutdown(table, fts);
+		}
+
+		next = UT_LIST_GET_NEXT(table_LRU, table);
+
+		if (!next) {
+			last = table;
+		}
+
+		table = next;
+	}
+
+	/* We must release dict_sys->mutex here; if we hold on to it in the
+	loop below, we will deadlock if any of the background threads try to
+	acquire it (for example, the FTS thread by calling que_eval_sql).
+
+	Releasing it here and going through dict_sys->table_LRU without
+	holding it is safe because:
+
+	 a) MySQL only starts the shutdown procedure after all client
+	 threads have been disconnected and no new ones are accepted, so no
+	 new tables are added or old ones dropped.
+
+	 b) Despite its name, the list is not LRU, and the order stays
+	 fixed.
+
+	To safeguard against the above assumptions ever changing, we store
+	the first and last items in the list above, and then check that
+	they've stayed the same below. */
+
+	mutex_exit(&dict_sys->mutex);
+
+	/* Wait for the threads of each table to stop. This is not inside
+	the above loop, because by signaling all the threads first we can
+	overlap their shutting down delays. */
+	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	ut_a(first == table);
+	while (table) {
+		dict_table_t*	next;
+		fts_t*		fts = table->fts;
+
+		if (fts != NULL) {
+			fts_shutdown(table, fts);
+		}
+
+		next = UT_LIST_GET_NEXT(table_LRU, table);
+
+		if (table == last) {
+			ut_a(!next);
+		}
+
+		table = next;
+	}
+}
diff --git a/storage/innobase/sync/sync0arr.c b/storage/innobase/sync/sync0arr.cc
index c6cbfc94dca..b90a5f29589 100644
--- a/storage/innobase/sync/sync0arr.c
+++ b/storage/innobase/sync/sync0arr.cc
@@ -18,13 +18,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file sync/sync0arr.c
+@file sync/sync0arr.cc
 The wait array used in synchronization primitives
 
 Created 9/5/1995 Heikki Tuuri
@@ -122,8 +122,6 @@ struct sync_array_struct {
 	ulint		n_cells;	/*!< number of cells in the
 					wait array */
 	sync_cell_t*	array;		/*!< pointer to wait array */
-	ulint		protection;	/*!< this flag tells which
-					mutex protects the data */
 	mutex_t		mutex;		/*!< possible database mutex
 					protecting this data structure */
 	os_mutex_t	os_mutex;	/*!< Possible operating system mutex
@@ -133,16 +131,22 @@ struct sync_array_struct {
 					to prevent infinite recursion
 					in implementation, we fall back to
 					an OS mutex. */
-	ulint		sg_count;	/*!< count of how many times an
-					object has been signalled */
 	ulint		res_count;	/*!< count of cell reservations
 					since creation of the array */
 };
 
-#ifdef UNIV_PFS_MUTEX
-/* Key to register the mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	syn_arr_mutex_key;
-#endif
+/** User configured sync array size */
+UNIV_INTERN ulong	srv_sync_array_size = 32;
+
+/** Locally stored copy of srv_sync_array_size */
+static	ulint		sync_array_size;
+
+/** The global array of wait cells for implementation of the database's own
+mutexes and read-write locks */
+static	sync_array_t**	sync_wait_array;
+
+/** count of how many times an object has been signalled */
+static ulint		sg_count;
 
 #ifdef UNIV_SYNC_DEBUG
 /******************************************************************//**
@@ -184,17 +188,7 @@ sync_array_enter(
 /*=============*/
 	sync_array_t*	arr)	/*!< in: sync wait array */
 {
-	ulint	protection;
-
-	protection = arr->protection;
-
-	if (protection == SYNC_ARRAY_OS_MUTEX) {
-		os_mutex_enter(arr->os_mutex);
-	} else if (protection == SYNC_ARRAY_MUTEX) {
-		mutex_enter(&(arr->mutex));
-	} else {
-		ut_error;
-	}
+	os_mutex_enter(arr->os_mutex);
 }
 
 /******************************************************************//**
@@ -205,17 +199,7 @@ sync_array_exit(
 /*============*/
 	sync_array_t*	arr)	/*!< in: sync wait array */
 {
-	ulint	protection;
-
-	protection = arr->protection;
-
-	if (protection == SYNC_ARRAY_OS_MUTEX) {
-		os_mutex_exit(arr->os_mutex);
-	} else if (protection == SYNC_ARRAY_MUTEX) {
-		mutex_exit(&(arr->mutex));
-	} else {
-		ut_error;
-	}
+	os_mutex_exit(arr->os_mutex);
 }
 
 /*******************************************************************//**
@@ -223,15 +207,12 @@ Creates a synchronization wait array. It is protected by a mutex
 which is automatically reserved when the functions operating on it
 are called.
 @return	own: created wait array */
-UNIV_INTERN
+static
 sync_array_t*
 sync_array_create(
 /*==============*/
-	ulint	n_cells,	/*!< in: number of cells in the array
+	ulint	n_cells)	/*!< in: number of cells in the array
 				to create */
-	ulint	protection)	/*!< in: either SYNC_ARRAY_OS_MUTEX or
-				SYNC_ARRAY_MUTEX: determines the type
-				of mutex protecting the data structure */
 {
 	ulint		sz;
 	sync_array_t*	arr;
@@ -239,54 +220,36 @@ sync_array_create(
 	ut_a(n_cells > 0);
 
 	/* Allocate memory for the data structures */
-	arr = ut_malloc(sizeof(sync_array_t));
+	arr = static_cast<sync_array_t*>(ut_malloc(sizeof(*arr)));
 	memset(arr, 0x0, sizeof(*arr));
 
 	sz = sizeof(sync_cell_t) * n_cells;
-	arr->array = ut_malloc(sz);
+	arr->array = static_cast<sync_cell_t*>(ut_malloc(sz));
 	memset(arr->array, 0x0, sz);
 
 	arr->n_cells = n_cells;
-	arr->protection = protection;
 
 	/* Then create the mutex to protect the wait array complex */
-	if (protection == SYNC_ARRAY_OS_MUTEX) {
-		arr->os_mutex = os_mutex_create();
-	} else if (protection == SYNC_ARRAY_MUTEX) {
-		mutex_create(syn_arr_mutex_key,
-			     &arr->mutex, SYNC_NO_ORDER_CHECK);
-	} else {
-		ut_error;
-	}
+	arr->os_mutex = os_mutex_create();
 
 	return(arr);
 }
 
 /******************************************************************//**
 Frees the resources in a wait array. */
-UNIV_INTERN
+static
 void
 sync_array_free(
 /*============*/
 	sync_array_t*	arr)	/*!< in, own: sync wait array */
 {
-	ulint		protection;
-
 	ut_a(arr->n_reserved == 0);
 
 	sync_array_validate(arr);
 
-	protection = arr->protection;
-
 	/* Release the mutex protecting the wait array complex */
 
-	if (protection == SYNC_ARRAY_OS_MUTEX) {
-		os_mutex_free(arr->os_mutex);
-	} else if (protection == SYNC_ARRAY_MUTEX) {
-		mutex_free(&(arr->mutex));
-	} else {
-		ut_error;
-	}
+	os_mutex_free(arr->os_mutex);
 
 	ut_free(arr->array);
 	ut_free(arr);
@@ -330,11 +293,11 @@ sync_cell_get_event(
 	ulint type = cell->request_type;
 
 	if (type == SYNC_MUTEX) {
-		return(((mutex_t *) cell->wait_object)->event);
+		return(((mutex_t*) cell->wait_object)->event);
 	} else if (type == RW_LOCK_WAIT_EX) {
-		return(((rw_lock_t *) cell->wait_object)->wait_ex_event);
+		return(((rw_lock_t*) cell->wait_object)->wait_ex_event);
 	} else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */
-		return(((rw_lock_t *) cell->wait_object)->event);
+		return(((rw_lock_t*) cell->wait_object)->event);
 	}
 }
 
@@ -373,9 +336,11 @@ sync_array_reserve_cell(
 			cell->wait_object = object;
 
 			if (type == SYNC_MUTEX) {
-				cell->old_wait_mutex = object;
+				cell->old_wait_mutex =
+					static_cast<ib_mutex_t*>(object);
 			} else {
-				cell->old_wait_rw_lock = object;
+				cell->old_wait_rw_lock =
+					static_cast<rw_lock_t*>(object);
 			}
 
 			cell->request_type = type;
@@ -395,7 +360,7 @@ sync_array_reserve_cell(
                         event = sync_cell_get_event(cell);
 			cell->signal_count = os_event_reset(event);
 
-			cell->reservation_time = time(NULL);
+			cell->reservation_time = ut_time();
 
 			cell->thread = os_thread_get_curr_id();
 
@@ -434,7 +399,7 @@ sync_array_wait_event(
 	ut_ad(os_thread_get_curr_id() == cell->thread);
 
 	event = sync_cell_get_event(cell);
-		cell->waiting = TRUE;
+	cell->waiting = TRUE;
 
 #ifdef UNIV_SYNC_DEBUG
 
@@ -591,7 +556,7 @@ sync_array_deadlock_step(
 	ulint		pass,	/*!< in: pass value */
 	ulint		depth)	/*!< in: recursion depth */
 {
-	sync_cell_t*	new;
+	sync_cell_t*	new_cell;
 
 	if (pass != 0) {
 		/* If pass != 0, then we do not know which threads are
@@ -601,9 +566,9 @@ sync_array_deadlock_step(
 		return(FALSE);
 	}
 
-	new = sync_array_find_thread(arr, thread);
+	new_cell = sync_array_find_thread(arr, thread);
 
-	if (UNIV_UNLIKELY(new == start)) {
+	if (new_cell == start) {
 		/* Stop running of other threads */
 
 		ut_dbg_stop_threads = TRUE;
@@ -614,8 +579,9 @@ sync_array_deadlock_step(
 
 		return(TRUE);
 
-	} else if (new) {
-		return(sync_array_detect_deadlock(arr, start, new, depth + 1));
+	} else if (new_cell) {
+		return(sync_array_detect_deadlock(
+			arr, start, new_cell, depth + 1));
 	}
 	return(FALSE);
 }
@@ -656,7 +622,7 @@ sync_array_detect_deadlock(
 
 	if (cell->request_type == SYNC_MUTEX) {
 
-		mutex = cell->wait_object;
+		mutex = static_cast<mutex_t*>(cell->wait_object);
 
 		if (mutex_get_lock_word(mutex) != 0) {
 
@@ -688,11 +654,11 @@ sync_array_detect_deadlock(
 	} else if (cell->request_type == RW_LOCK_EX
 		   || cell->request_type == RW_LOCK_WAIT_EX) {
 
-		lock = cell->wait_object;
-
-		debug = UT_LIST_GET_FIRST(lock->debug_list);
+		lock = static_cast<rw_lock_t*>(cell->wait_object);
 
-		while (debug != NULL) {
+		for (debug = UT_LIST_GET_FIRST(lock->debug_list);
+		     debug != 0;
+		     debug = UT_LIST_GET_NEXT(list, debug)) {
 
 			thread = debug->thread_id;
 
@@ -720,18 +686,17 @@ print:
 					return(TRUE);
 				}
 			}
-
-			debug = UT_LIST_GET_NEXT(list, debug);
 		}
 
 		return(FALSE);
 
 	} else if (cell->request_type == RW_LOCK_SHARED) {
 
-		lock = cell->wait_object;
-		debug = UT_LIST_GET_FIRST(lock->debug_list);
+		lock = static_cast<rw_lock_t*>(cell->wait_object);
 
-		while (debug != NULL) {
+		for (debug = UT_LIST_GET_FIRST(lock->debug_list);
+		     debug != 0;
+		     debug = UT_LIST_GET_NEXT(list, debug)) {
 
 			thread = debug->thread_id;
 
@@ -750,8 +715,6 @@ print:
 					goto print;
 				}
 			}
-
-			debug = UT_LIST_GET_NEXT(list, debug);
 		}
 
 		return(FALSE);
@@ -778,7 +741,7 @@ sync_arr_cell_can_wake_up(
 
 	if (cell->request_type == SYNC_MUTEX) {
 
-		mutex = cell->wait_object;
+		mutex = static_cast<ib_mutex_t*>(cell->wait_object);
 
 		if (mutex_get_lock_word(mutex) == 0) {
 
@@ -787,7 +750,7 @@ sync_arr_cell_can_wake_up(
 
 	} else if (cell->request_type == RW_LOCK_EX) {
 
-		lock = cell->wait_object;
+		lock = static_cast<rw_lock_t*>(cell->wait_object);
 
 		if (lock->lock_word > 0) {
 		/* Either unlocked or only read locked. */
@@ -797,7 +760,7 @@ sync_arr_cell_can_wake_up(
 
         } else if (cell->request_type == RW_LOCK_WAIT_EX) {
 
-		lock = cell->wait_object;
+		lock = static_cast<rw_lock_t*>(cell->wait_object);
 
                 /* lock_word == 0 means all readers have left */
 		if (lock->lock_word == 0) {
@@ -805,7 +768,7 @@ sync_arr_cell_can_wake_up(
 			return(TRUE);
 		}
 	} else if (cell->request_type == RW_LOCK_SHARED) {
-		lock = cell->wait_object;
+		lock = static_cast<rw_lock_t*>(cell->wait_object);
 
                 /* lock_word > 0 means no writer or reserved writer */
 		if (lock->lock_word > 0) {
@@ -849,19 +812,14 @@ sync_array_free_cell(
 Increments the signalled count. */
 UNIV_INTERN
 void
-sync_array_object_signalled(
-/*========================*/
-	sync_array_t*	arr)	/*!< in: wait array */
+sync_array_object_signalled(void)
+/*=============================*/
 {
 #ifdef HAVE_ATOMIC_BUILTINS
-	(void) os_atomic_increment_ulint(&arr->sg_count, 1);
+	(void) os_atomic_increment_ulint(&sg_count, 1);
 #else
-	sync_array_enter(arr);
-
-	arr->sg_count++;
-
-	sync_array_exit(arr);
-#endif
+	++sg_count;
+#endif /* HAVE_ATOMIC_BUILTINS */
 }
 
 /**********************************************************************//**
@@ -872,67 +830,78 @@ function should be called about every 1 second in the server.
 Note that there's a race condition between this thread and mutex_exit
 changing the lock_word and calling signal_object, so sometimes this finds
 threads to wake up even when nothing has gone wrong. */
-UNIV_INTERN
+static
 void
-sync_arr_wake_threads_if_sema_free(void)
-/*====================================*/
+sync_array_wake_threads_if_sema_free_low(
+/*=====================================*/
+	sync_array_t*	arr)		/* in/out: wait array */
 {
-	sync_array_t*	arr	= sync_primary_wait_array;
-	sync_cell_t*	cell;
+	ulint		i = 0;
 	ulint		count;
-	ulint		i;
-	os_event_t      event;
 
 	sync_array_enter(arr);
 
-	i = 0;
-	count = 0;
-
-	while (count < arr->n_reserved) {
+	for (count = 0;  count < arr->n_reserved; ++i) {
+		sync_cell_t*	cell;
 
 		cell = sync_array_get_nth_cell(arr, i);
-		i++;
 
-		if (cell->wait_object == NULL) {
-			continue;
-		}
+		if (cell->wait_object != NULL) {
+
 			count++;
 
 			if (sync_arr_cell_can_wake_up(cell)) {
+				os_event_t      event;
 
-			event = sync_cell_get_event(cell);
+				event = sync_cell_get_event(cell);
 
-			os_event_set(event);
+				os_event_set(event);
+			}
 		}
-
 	}
 
 	sync_array_exit(arr);
 }
 
 /**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore relases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about every 1 second in the server.
+
+Note that there's a race condition between this thread and mutex_exit
+changing the lock_word and calling signal_object, so sometimes this finds
+threads to wake up even when nothing has gone wrong. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void)
+/*====================================*/
+{
+	ulint		i;
+
+	for (i = 0; i < sync_array_size; ++i) {
+
+		sync_array_wake_threads_if_sema_free_low(
+			sync_wait_array[i]);
+	}
+}
+
+/**********************************************************************//**
 Prints warnings of long semaphore waits to stderr.
 @return	TRUE if fatal semaphore wait threshold was exceeded */
-UNIV_INTERN
+static
 ibool
-sync_array_print_long_waits(
-/*========================*/
+sync_array_print_long_waits_low(
+/*============================*/
+	sync_array_t*	arr,	/*!< in: sync array instance */
 	os_thread_id_t*	waiter,	/*!< out: longest waiting thread */
-	const void**	sema)	/*!< out: longest-waited-for semaphore */
+	const void**	sema,	/*!< out: longest-waited-for semaphore */
+	ibool*		noticed)/*!< out: TRUE if long wait noticed */
 {
-	sync_cell_t*	cell;
-	ibool		old_val;
-	ibool		noticed = FALSE;
 	ulint		i;
 	ulint		fatal_timeout = srv_fatal_semaphore_wait_threshold;
 	ibool		fatal = FALSE;
 	double		longest_diff = 0;
 
-	/* For huge tables, skip the check during CHECK TABLE etc... */
-	if (fatal_timeout > SRV_SEMAPHORE_WAIT_EXTENSION) {
-		return(FALSE);
-	}
-
 #ifdef UNIV_DEBUG_VALGRIND
 	/* Increase the timeouts if running under valgrind because it executes
 	extremely slowly. UNIV_DEBUG_VALGRIND does not necessary mean that
@@ -945,12 +914,13 @@ sync_array_print_long_waits(
 # define SYNC_ARRAY_TIMEOUT	240
 #endif
 
-	for (i = 0; i < sync_primary_wait_array->n_cells; i++) {
+	for (i = 0; i < arr->n_cells; i++) {
 
-		double	diff;
-		void*	wait_object;
+		double		diff;
+		sync_cell_t*	cell;
+		void*		wait_object;
 
-		cell = sync_array_get_nth_cell(sync_primary_wait_array, i);
+		cell = sync_array_get_nth_cell(arr, i);
 
 		wait_object = cell->wait_object;
 
@@ -965,7 +935,7 @@ sync_array_print_long_waits(
 			fputs("InnoDB: Warning: a long semaphore wait:\n",
 			      stderr);
 			sync_array_cell_print(stderr, cell);
-			noticed = TRUE;
+			*noticed = TRUE;
 		}
 
 		if (diff > fatal_timeout) {
@@ -979,10 +949,43 @@ sync_array_print_long_waits(
 		}
 	}
 
+#undef SYNC_ARRAY_TIMEOUT
+
+	return(fatal);
+}
+
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return	TRUE if fatal semaphore wait threshold was exceeded */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(
+/*========================*/
+	os_thread_id_t*	waiter,	/*!< out: longest waiting thread */
+	const void**	sema)	/*!< out: longest-waited-for semaphore */
+{
+	ulint		i;
+	ibool		fatal = FALSE;
+	ibool		noticed = FALSE;
+
+	for (i = 0; i < sync_array_size; ++i) {
+
+		sync_array_t*	arr = sync_wait_array[i];
+
+		if (sync_array_print_long_waits_low(
+				arr, waiter, sema, &noticed)) {
+
+			fatal = TRUE;
+		}
+	}
+
 	if (noticed) {
+		ibool	old_val;
+
 		fprintf(stderr,
 			"InnoDB: ###### Starts InnoDB Monitor"
 			" for 30 secs to print diagnostic info:\n");
+
 		old_val = srv_print_innodb_monitor;
 
 		/* If some crucial semaphore is reserved, then also the InnoDB
@@ -993,11 +996,11 @@ sync_array_print_long_waits(
 
 		fprintf(stderr,
 			"InnoDB: Pending preads %lu, pwrites %lu\n",
-			(ulong)os_file_n_pending_preads,
-			(ulong)os_file_n_pending_pwrites);
+			(ulong) os_file_n_pending_preads,
+			(ulong) os_file_n_pending_pwrites);
 
 		srv_print_innodb_monitor = TRUE;
-		os_event_set(srv_lock_timeout_thread_event);
+		os_event_set(srv_timeout_event);
 
 		os_thread_sleep(30000000);
 
@@ -1007,8 +1010,6 @@ sync_array_print_long_waits(
 			" to the standard error stream\n");
 	}
 
-#undef SYNC_ARRAY_TIMEOUT
-
 	return(fatal);
 }
 
@@ -1016,38 +1017,33 @@ sync_array_print_long_waits(
 Prints info of the wait array. */
 static
 void
-sync_array_output_info(
-/*===================*/
+sync_array_print_info_low(
+/*======================*/
 	FILE*		file,	/*!< in: file where to print */
-	sync_array_t*	arr)	/*!< in: wait array; NOTE! caller must own the
-				mutex */
+	sync_array_t*	arr)	/*!< in: wait array */
 {
-	sync_cell_t*	cell;
-	ulint		count;
 	ulint		i;
+	ulint		count = 0;
 
 	fprintf(file,
-		"OS WAIT ARRAY INFO: reservation count %ld, signal count %ld\n",
-						(long) arr->res_count, (long) arr->sg_count);
-	i = 0;
-	count = 0;
+		"OS WAIT ARRAY INFO: reservation count %ld\n",
+		(long) arr->res_count);
 
-	while (count < arr->n_reserved) {
+	for (i = 0; count < arr->n_reserved; ++i) {
+		sync_cell_t*	cell;
 
 		cell = sync_array_get_nth_cell(arr, i);
 
-	if (cell->wait_object != NULL) {
-		count++;
+		if (cell->wait_object != NULL) {
+			count++;
 			sync_array_cell_print(file, cell);
 		}
-
-		i++;
 	}
 }
 
 /**********************************************************************//**
 Prints info of the wait array. */
-UNIV_INTERN
+static
 void
 sync_array_print_info(
 /*==================*/
@@ -1056,7 +1052,95 @@ sync_array_print_info(
 {
 	sync_array_enter(arr);
 
-	sync_array_output_info(file, arr);
+	sync_array_print_info_low(file, arr);
 
 	sync_array_exit(arr);
 }
+
+/**********************************************************************//**
+Create the primary system wait array(s), they are protected by an OS mutex */
+UNIV_INTERN
+void
+sync_array_init(
+/*============*/
+	ulint		n_threads)		/*!< in: Number of slots to
+						create in all arrays */
+{
+	ulint		i;
+	ulint		n_slots;
+
+	ut_a(sync_wait_array == NULL);
+	ut_a(srv_sync_array_size > 0);
+	ut_a(n_threads > srv_sync_array_size);
+
+	sync_array_size = srv_sync_array_size;
+
+	/* We have to use ut_malloc() because the mutex infrastructure
+	hasn't been initialised yet. It is required by mem_alloc() and
+	the heap functions. */
+
+	sync_wait_array = static_cast<sync_array_t**>(
+		ut_malloc(sizeof(*sync_wait_array) * sync_array_size));
+
+	n_slots = 1 + (n_threads - 1) / sync_array_size;
+
+	for (i = 0; i < sync_array_size; ++i) {
+
+		sync_wait_array[i] = sync_array_create(n_slots);
+	}
+}
+
+/**********************************************************************//**
+Close sync array wait sub-system. */
+UNIV_INTERN
+void
+sync_array_close(void)
+/*==================*/
+{
+	ulint		i;
+
+	for (i = 0; i < sync_array_size; ++i) {
+		sync_array_free(sync_wait_array[i]);
+	}
+
+	ut_free(sync_wait_array);
+	sync_wait_array = NULL;
+}
+
+/**********************************************************************//**
+Print info about the sync array(s). */
+UNIV_INTERN
+void
+sync_array_print(
+/*=============*/
+	FILE*		file)		/*!< in/out: Print to this stream */
+{
+	ulint		i;
+
+	for (i = 0; i < sync_array_size; ++i) {
+		sync_array_print_info(file, sync_wait_array[i]);
+	}
+
+	fprintf(file,
+		"OS WAIT ARRAY INFO: signal count %ld\n", (long) sg_count);
+
+}
+
+/**********************************************************************//**
+Get an instance of the sync wait array. */
+UNIV_INTERN
+sync_array_t*
+sync_array_get(void)
+/*================*/
+{
+	ulint		i;
+	static ulint	count;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	i = os_atomic_increment_ulint(&count, 1);
+#else
+	i = count++;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+	return(sync_wait_array[i % sync_array_size]);
+}
diff --git a/storage/innobase/sync/sync0rw.c b/storage/innobase/sync/sync0rw.cc
index 8de9b40ef67..dc6c510a3ed 100644
--- a/storage/innobase/sync/sync0rw.c
+++ b/storage/innobase/sync/sync0rw.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -24,7 +24,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file sync/sync0rw.c
+@file sync/sync0rw.cc
 The read-write lock (for thread synchronization)
 
 Created 9/11/1995 Heikki Tuuri
@@ -315,14 +315,19 @@ rw_lock_free_func(
 /*==============*/
 	rw_lock_t*	lock)	/*!< in: rw-lock */
 {
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+	mutex_t*	mutex;
+#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */
+
 	ut_ad(rw_lock_validate(lock));
 	ut_a(lock->lock_word == X_LOCK_DECR);
 
+	mutex_enter(&rw_lock_list_mutex);
+
 #ifndef INNODB_RW_LOCKS_USE_ATOMICS
-	mutex_free(rw_lock_get_mutex(lock));
-#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+	mutex = rw_lock_get_mutex(lock);
+#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */
 
-	mutex_enter(&rw_lock_list_mutex);
 	os_event_free(lock->event);
 
 	os_event_free(lock->wait_ex_event);
@@ -337,6 +342,12 @@ rw_lock_free_func(
 	mutex_exit(&rw_lock_list_mutex);
 
 	ut_d(lock->magic_n = 0);
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+	/* We have merely removed the rw_lock from the list, the memory
+	has not been freed. Therefore the pointer to mutex is valid. */
+	mutex_free(mutex);
+#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */
 }
 
 #ifdef UNIV_DEBUG
@@ -381,8 +392,9 @@ rw_lock_s_lock_spin(
 	const char*	file_name, /*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	ulint	 index;	/* index of the reserved wait cell */
-	ulint	 i = 0;	/* spin round count */
+	ulint		index;	/* index of the reserved wait cell */
+	ulint		i = 0;	/* spin round count */
+	sync_array_t*	sync_arr;
 
 	ut_ad(rw_lock_validate(lock));
 
@@ -425,17 +437,18 @@ lock_loop:
 
 		rw_s_spin_round_count += i;
 
-		sync_array_reserve_cell(sync_primary_wait_array,
-					lock, RW_LOCK_SHARED,
-					file_name, line,
-					&index);
+		sync_arr = sync_array_get();
+
+		sync_array_reserve_cell(
+			sync_arr, lock, RW_LOCK_SHARED,
+			file_name, line, &index);
 
 		/* Set waiters before checking lock_word to ensure wake-up
                 signal is sent. This may lead to some unnecessary signals. */
 		rw_lock_set_waiter_flag(lock);
 
 		if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
-			sync_array_free_cell(sync_primary_wait_array, index);
+			sync_array_free_cell(sync_arr, index);
 			return; /* Success */
 		}
 
@@ -453,7 +466,7 @@ lock_loop:
 		lock->count_os_wait++;
 		rw_s_os_wait_count++;
 
-		sync_array_wait_event(sync_primary_wait_array, index);
+		sync_array_wait_event(sync_arr, index);
 
 		i = 0;
 		goto lock_loop;
@@ -495,8 +508,9 @@ rw_lock_x_lock_wait(
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	ulint index;
-	ulint i = 0;
+	ulint		index;
+	ulint		i = 0;
+	sync_array_t*	sync_arr;
 
 	ut_ad(lock->lock_word <= 0);
 
@@ -511,14 +525,17 @@ rw_lock_x_lock_wait(
 
 		/* If there is still a reader, then go to sleep.*/
 		rw_x_spin_round_count += i;
+
+		sync_arr = sync_array_get();
+
+		sync_array_reserve_cell(
+			sync_arr, lock, RW_LOCK_WAIT_EX,
+			file_name, line, &index);
+
 		i = 0;
-		sync_array_reserve_cell(sync_primary_wait_array,
-					lock,
-					RW_LOCK_WAIT_EX,
-					file_name, line,
-					&index);
+
 		/* Check lock_word to ensure wake-up isn't missed.*/
-		if(lock->lock_word < 0) {
+		if (lock->lock_word < 0) {
 
 			/* these stats may not be accurate */
 			lock->count_os_wait++;
@@ -532,8 +549,7 @@ rw_lock_x_lock_wait(
 					       file_name, line);
 #endif
 
-			sync_array_wait_event(sync_primary_wait_array,
-					      index);
+			sync_array_wait_event(sync_arr, index);
 #ifdef UNIV_SYNC_DEBUG
 			rw_lock_remove_debug_info(lock, pass,
 					       RW_LOCK_WAIT_EX);
@@ -541,8 +557,7 @@ rw_lock_x_lock_wait(
                         /* It is possible to wake when lock_word < 0.
                         We must pass the while-loop check to proceed.*/
 		} else {
-			sync_array_free_cell(sync_primary_wait_array,
-					     index);
+			sync_array_free_cell(sync_arr, index);
 		}
 	}
 	rw_x_spin_round_count += i;
@@ -550,7 +565,7 @@ rw_lock_x_lock_wait(
 
 /******************************************************************//**
 Low-level function for acquiring an exclusive lock.
-@return	RW_LOCK_NOT_LOCKED if did not succeed, RW_LOCK_EX if success. */
+@return	FALSE if did not succeed, TRUE if success. */
 UNIV_INLINE
 ibool
 rw_lock_x_lock_low(
@@ -579,14 +594,14 @@ rw_lock_x_lock_low(
 #ifdef UNIV_SYNC_DEBUG
 				    pass,
 #endif
-                                    file_name, line);
+				    file_name, line);
 
 	} else {
 		/* Decrement failed: relock or failed lock */
 		if (!pass && lock->recursive
 		    && os_thread_eq(lock->writer_thread, curr_thread)) {
 			/* Relock */
-                        lock->lock_word -= X_LOCK_DECR;
+			lock->lock_word -= X_LOCK_DECR;
 		} else {
 			/* Another thread locked before us */
 			return(FALSE);
@@ -621,9 +636,10 @@ rw_lock_x_lock_func(
 	const char*	file_name,/*!< in: file name where lock requested */
 	ulint		line)	/*!< in: line where requested */
 {
-	ulint	index;	/*!< index of the reserved wait cell */
-	ulint	i;	/*!< spin round count */
-	ibool	spinning = FALSE;
+	ulint		i;	/*!< spin round count */
+	ulint		index;	/*!< index of the reserved wait cell */
+	sync_array_t*	sync_arr;
+	ibool		spinning = FALSE;
 
 	ut_ad(rw_lock_validate(lock));
 #ifdef UNIV_SYNC_DEBUG
@@ -674,18 +690,17 @@ lock_loop:
 			(ulong) lock->cline, (ulong) i);
 	}
 
-	sync_array_reserve_cell(sync_primary_wait_array,
-				lock,
-				RW_LOCK_EX,
-				file_name, line,
-				&index);
+	sync_arr = sync_array_get();
+
+	sync_array_reserve_cell(
+		sync_arr, lock, RW_LOCK_EX, file_name, line, &index);
 
 	/* Waiters must be set before checking lock_word, to ensure signal
 	is sent. This could lead to a few unnecessary wake-up signals. */
 	rw_lock_set_waiter_flag(lock);
 
 	if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
-		sync_array_free_cell(sync_primary_wait_array, index);
+		sync_array_free_cell(sync_arr, index);
 		return; /* Locking succeeded */
 	}
 
@@ -702,7 +717,7 @@ lock_loop:
 	lock->count_os_wait++;
 	rw_x_os_wait_count++;
 
-	sync_array_wait_event(sync_primary_wait_array, index);
+	sync_array_wait_event(sync_arr, index);
 
 	i = 0;
 	goto lock_loop;
diff --git a/storage/innobase/sync/sync0sync.c b/storage/innobase/sync/sync0sync.cc
index fb7101fdb8d..1bebd553c19 100644
--- a/storage/innobase/sync/sync0sync.c
+++ b/storage/innobase/sync/sync0sync.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.
 
 Portions of this file contain modifications contributed and copyrighted by
@@ -24,7 +24,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file sync/sync0sync.c
+@file sync/sync0sync.cc
 Mutex, the basic synchronization primitive
 
 Created 9/5/1995 Heikki Tuuri
@@ -185,10 +185,6 @@ static ib_int64_t	mutex_os_wait_count		= 0;
 monitoring. */
 UNIV_INTERN ib_int64_t	mutex_exit_count		= 0;
 
-/** The global array of wait cells for implementation of the database's own
-mutexes and read-write locks */
-UNIV_INTERN sync_array_t*	sync_primary_wait_array;
-
 /** This variable is set to TRUE when sync_init is called */
 UNIV_INTERN ibool	sync_initialized	= FALSE;
 
@@ -284,7 +280,7 @@ mutex_create_func(
 #if defined(HAVE_ATOMIC_BUILTINS)
 	mutex_reset_lock_word(mutex);
 #else
-	os_fast_mutex_init(&(mutex->os_fast_mutex));
+	os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mutex->os_fast_mutex);
 	mutex->lock_word = 0;
 #endif
 	mutex->event = os_event_create(NULL);
@@ -392,6 +388,7 @@ func_exit:
 #ifdef UNIV_DEBUG
 	mutex->magic_n = 0;
 #endif /* UNIV_DEBUG */
+	return;
 }
 
 /********************************************************************//**
@@ -490,15 +487,10 @@ mutex_spin_wait(
 					requested */
 	ulint		line)		/*!< in: line where requested */
 {
-	ulint	   index; /* index of the reserved wait cell */
-	ulint	   i;	  /* spin round count */
-#ifdef UNIV_DEBUG
-	ib_int64_t lstart_time = 0, lfinish_time; /* for timing os_wait */
-	ulint ltime_diff;
-	ulint sec;
-	ulint ms;
-	uint timer_started = 0;
-#endif /* UNIV_DEBUG */
+	ulint		i;		/* spin round count */
+	ulint		index;		/* index of the reserved wait cell */
+	sync_array_t*	sync_arr;
+
 	ut_ad(mutex);
 
 	/* This update is not thread safe, but we don't mind if the count
@@ -531,13 +523,6 @@ spin_loop:
 	if (i == SYNC_SPIN_ROUNDS) {
 #ifdef UNIV_DEBUG
 		mutex->count_os_yield++;
-#ifndef UNIV_HOTBACKUP
-		if (timed_mutexes && timer_started == 0) {
-			ut_usectime(&sec, &ms);
-			lstart_time= (ib_int64_t)sec * 1000000 + ms;
-			timer_started = 1;
-		}
-#endif /* UNIV_HOTBACKUP */
 #endif /* UNIV_DEBUG */
 		os_thread_yield();
 	}
@@ -562,8 +547,7 @@ spin_loop:
 #ifdef UNIV_SYNC_DEBUG
 		mutex_set_debug_info(mutex, file_name, line);
 #endif
-
-		goto finish_timing;
+		return;
 	}
 
 	/* We may end up with a situation where lock_word is 0 but the OS
@@ -579,8 +563,10 @@ spin_loop:
 		goto spin_loop;
 	}
 
-	sync_array_reserve_cell(sync_primary_wait_array, mutex,
-				SYNC_MUTEX, file_name, line, &index);
+	sync_arr = sync_array_get();
+
+	sync_array_reserve_cell(
+		sync_arr, mutex, SYNC_MUTEX, file_name, line, &index);
 
 	/* The memory order of the array reservation and the change in the
 	waiters field is important: when we suspend a thread, we first
@@ -595,7 +581,7 @@ spin_loop:
 		if (mutex_test_and_set(mutex) == 0) {
 			/* Succeeded! Free the reserved wait cell */
 
-			sync_array_free_cell(sync_primary_wait_array, index);
+			sync_array_free_cell(sync_arr, index);
 
 			ut_d(mutex->thread_id = os_thread_get_curr_id());
 #ifdef UNIV_SYNC_DEBUG
@@ -609,7 +595,7 @@ spin_loop:
 				(void*) mutex);
 #endif
 
-			goto finish_timing;
+			return;
 
 			/* Note that in this case we leave the waiters field
 			set to 1. We cannot reset it to zero, as we do not
@@ -632,35 +618,9 @@ spin_loop:
 	mutex_os_wait_count++;
 
 	mutex->count_os_wait++;
-#ifdef UNIV_DEBUG
-	/* !!!!! Sometimes os_wait can be called without os_thread_yield */
-#ifndef UNIV_HOTBACKUP
-	if (timed_mutexes == 1 && timer_started == 0) {
-		ut_usectime(&sec, &ms);
-		lstart_time= (ib_int64_t)sec * 1000000 + ms;
-		timer_started = 1;
-	}
-#endif /* UNIV_HOTBACKUP */
-#endif /* UNIV_DEBUG */
 
-	sync_array_wait_event(sync_primary_wait_array, index);
+	sync_array_wait_event(sync_arr, index);
 	goto mutex_loop;
-
-finish_timing:
-#ifdef UNIV_DEBUG
-	if (timed_mutexes == 1 && timer_started==1) {
-		ut_usectime(&sec, &ms);
-		lfinish_time= (ib_int64_t)sec * 1000000 + ms;
-
-		ltime_diff= (ulint) (lfinish_time - lstart_time);
-		mutex->lspent_time += ltime_diff;
-
-		if (mutex->lmax_spent_time < ltime_diff) {
-			mutex->lmax_spent_time= ltime_diff;
-		}
-	}
-#endif /* UNIV_DEBUG */
-	return;
 }
 
 /******************************************************************//**
@@ -676,7 +636,7 @@ mutex_signal_object(
 	/* The memory order of resetting the waiters field and
 	signaling the object is important. See LEMMA 1 above. */
 	os_event_set(mutex->event);
-	sync_array_object_signalled(sync_primary_wait_array);
+	sync_array_object_signalled();
 }
 
 #ifdef UNIV_SYNC_DEBUG
@@ -870,7 +830,7 @@ sync_print_warning(
 {
 	mutex_t*	mutex;
 
-	mutex = slot->latch;
+	mutex = static_cast<mutex_t*>(slot->latch);
 
 	if (mutex->magic_n == MUTEX_MAGIC_N) {
 		fprintf(stderr,
@@ -895,7 +855,9 @@ sync_print_warning(
 			fputs("Not locked\n", stderr);
 		}
 	} else {
-		rw_lock_t*	lock = slot->latch;
+		rw_lock_t*	lock;
+
+		lock = static_cast<rw_lock_t*>(slot->latch);
 
 		rw_lock_print(lock);
 	}
@@ -1055,7 +1017,8 @@ sync_thread_levels_nonempty_gen(
 		if (slot->latch != NULL
 		    && (!dict_mutex_allowed
 			|| (slot->level != SYNC_DICT
-			    && slot->level != SYNC_DICT_OPERATION))) {
+			    && slot->level != SYNC_DICT_OPERATION
+			    && slot->level != SYNC_FTS_CACHE))) {
 
 			mutex_exit(&sync_thread_mutex);
 			ut_error;
@@ -1150,10 +1113,10 @@ sync_thread_add_level(
 		return;
 	}
 
-	if ((latch == (void*)&sync_thread_mutex)
-	    || (latch == (void*)&mutex_list_mutex)
-	    || (latch == (void*)&rw_lock_debug_mutex)
-	    || (latch == (void*)&rw_lock_list_mutex)) {
+	if ((latch == (void*) &sync_thread_mutex)
+	    || (latch == (void*) &mutex_list_mutex)
+	    || (latch == (void*) &rw_lock_debug_mutex)
+	    || (latch == (void*) &rw_lock_list_mutex)) {
 
 		return;
 	}
@@ -1174,7 +1137,7 @@ sync_thread_add_level(
 		   + (sizeof(*array->elems) * SYNC_THREAD_N_LEVELS);
 
 		/* We have to allocate the level array for a new thread */
-		array = calloc(sz, sizeof(char));
+		array = static_cast<sync_arr_t*>(calloc(sz, sizeof(char)));
 		ut_a(array != NULL);
 
 		array->next_free = ULINT_UNDEFINED;
@@ -1214,15 +1177,21 @@ sync_thread_add_level(
 	case SYNC_MEM_POOL:
 	case SYNC_MEM_HASH:
 	case SYNC_RECV:
+	case SYNC_FTS_BG_THREADS:
 	case SYNC_WORK_QUEUE:
+	case SYNC_FTS_OPTIMIZE:
+	case SYNC_FTS_CACHE:
+	case SYNC_FTS_CACHE_INIT:
 	case SYNC_LOG:
 	case SYNC_LOG_FLUSH_ORDER:
 	case SYNC_ANY_LATCH:
 	case SYNC_FILE_FORMAT_TAG:
 	case SYNC_DOUBLEWRITE:
 	case SYNC_SEARCH_SYS:
-	case SYNC_TRX_LOCK_HEAP:
-	case SYNC_KERNEL:
+	case SYNC_THREADS:
+	case SYNC_LOCK_SYS:
+	case SYNC_LOCK_WAIT_SYS:
+	case SYNC_TRX_SYS:
 	case SYNC_IBUF_BITMAP_MUTEX:
 	case SYNC_RSEG:
 	case SYNC_TRX_UNDO:
@@ -1241,6 +1210,14 @@ sync_thread_add_level(
 			ut_error;
 		}
 		break;
+	case SYNC_TRX:
+		/* Either the thread must own the lock_sys->mutex, or
+		it is allowed to own only ONE trx->mutex. */
+		if (!sync_thread_levels_g(array, level, FALSE)) {
+			ut_a(sync_thread_levels_g(array, level - 1, TRUE));
+			ut_a(sync_thread_levels_contain(array, SYNC_LOCK_SYS));
+		}
+		break;
 	case SYNC_BUF_FLUSH_LIST:
 	case SYNC_BUF_POOL:
 		/* We can have multiple mutexes of this type therefore we
@@ -1253,6 +1230,13 @@ sync_thread_add_level(
 		}
 		break;
 
+
+	case SYNC_BUF_PAGE_HASH:
+		/* Multiple page_hash locks are only allowed during
+		buf_validate and that is where buf_pool mutex is already
+		held. */
+		/* Fall through */
+
 	case SYNC_BUF_BLOCK:
 		/* Either the thread must own the buffer pool mutex
 		(buf_pool->mutex), or it is allowed to latch only ONE
@@ -1263,7 +1247,7 @@ sync_thread_add_level(
 		}
 		break;
 	case SYNC_REC_LOCK:
-		if (sync_thread_levels_contain(array, SYNC_KERNEL)) {
+		if (sync_thread_levels_contain(array, SYNC_LOCK_SYS)) {
 			ut_a(sync_thread_levels_g(array, SYNC_REC_LOCK - 1,
 						  TRUE));
 		} else {
@@ -1310,8 +1294,7 @@ sync_thread_add_level(
 		ut_a(sync_thread_levels_contain(array, SYNC_RSEG));
 		break;
 	case SYNC_RSEG_HEADER_NEW:
-		ut_a(sync_thread_levels_contain(array, SYNC_KERNEL)
-		     && sync_thread_levels_contain(array, SYNC_FSP_PAGE));
+		ut_a(sync_thread_levels_contain(array, SYNC_FSP_PAGE));
 		break;
 	case SYNC_TREE_NODE:
 		ut_a(sync_thread_levels_contain(array, SYNC_INDEX_TREE)
@@ -1413,10 +1396,10 @@ sync_thread_reset_level(
 		return(FALSE);
 	}
 
-	if ((latch == (void*)&sync_thread_mutex)
-	    || (latch == (void*)&mutex_list_mutex)
-	    || (latch == (void*)&rw_lock_debug_mutex)
-	    || (latch == (void*)&rw_lock_list_mutex)) {
+	if ((latch == (void*) &sync_thread_mutex)
+	    || (latch == (void*) &mutex_list_mutex)
+	    || (latch == (void*) &rw_lock_debug_mutex)
+	    || (latch == (void*) &rw_lock_list_mutex)) {
 
 		return(FALSE);
 	}
@@ -1499,17 +1482,15 @@ sync_init(void)
 
 	sync_initialized = TRUE;
 
-	/* Create the primary system wait array which is protected by an OS
-	mutex */
+	sync_array_init(OS_THREAD_MAX_N);
 
-	sync_primary_wait_array = sync_array_create(OS_THREAD_MAX_N,
-						    SYNC_ARRAY_OS_MUTEX);
 #ifdef UNIV_SYNC_DEBUG
 	/* Create the thread latch level array where the latch levels
 	are stored for each OS thread */
 
-	sync_thread_level_arrays = calloc(
-		sizeof(sync_thread_t), OS_THREAD_MAX_N);
+	sync_thread_level_arrays = static_cast<sync_thread_t*>(
+		calloc(sizeof(sync_thread_t), OS_THREAD_MAX_N));
+
 	ut_a(sync_thread_level_arrays != NULL);
 
 #endif /* UNIV_SYNC_DEBUG */
@@ -1576,7 +1557,7 @@ sync_close(void)
 {
 	mutex_t*	mutex;
 
-	sync_array_free(sync_primary_wait_array);
+	sync_array_close();
 
 	for (mutex = UT_LIST_GET_FIRST(mutex_list);
 	     mutex != NULL;
@@ -1598,13 +1579,13 @@ sync_close(void)
 #ifdef UNIV_SYNC_DEBUG
 	mutex_free(&sync_thread_mutex);
 
-	/* Switch latching order checks on in sync0sync.c */
+	/* Switch latching order checks on in sync0sync.cc */
 	sync_order_checks_on = FALSE;
 
 	sync_thread_level_arrays_free();
 #endif /* UNIV_SYNC_DEBUG */
 
-	sync_initialized = FALSE;	
+	sync_initialized = FALSE;
 }
 
 /*******************************************************************//**
@@ -1616,14 +1597,19 @@ sync_print_wait_info(
 	FILE*	file)		/*!< in: file where to print */
 {
 #ifdef UNIV_SYNC_DEBUG
-	fprintf(file, "Mutex exits %llu, rws exits %llu, rwx exits %llu\n",
+	fprintf(file,
+		"Mutex exits "UINT64PF", "
+		"rws exits "UINT64PF",  rwx exits "UINT64PF"\n",
 		mutex_exit_count, rw_s_exit_count, rw_x_exit_count);
 #endif
 
 	fprintf(file,
-		"Mutex spin waits %llu, rounds %llu, OS waits %llu\n"
-		"RW-shared spins %llu, rounds %llu, OS waits %llu\n"
-		"RW-excl spins %llu, rounds %llu, OS waits %llu\n",
+		"Mutex spin waits "UINT64PF", rounds "UINT64PF", "
+		"OS waits "UINT64PF"\n"
+		"RW-shared spins "UINT64PF", rounds "UINT64PF", "
+		"OS waits "UINT64PF"\n"
+		"RW-excl spins "UINT64PF", rounds "UINT64PF", "
+		"OS waits "UINT64PF"\n",
 		mutex_spin_wait_count,
 		mutex_spin_round_count,
 		mutex_os_wait_count,
@@ -1659,7 +1645,7 @@ sync_print(
 	rw_lock_list_print_info(file);
 #endif /* UNIV_SYNC_DEBUG */
 
-	sync_array_print_info(file, sync_primary_wait_array);
+	sync_array_print(file);
 
 	sync_print_wait_info(file);
 }
diff --git a/storage/innobase/trx/trx0i_s.c b/storage/innobase/trx/trx0i_s.cc
index aa0a9c797f2..cbf90afae0d 100644
--- a/storage/innobase/trx/trx0i_s.c
+++ b/storage/innobase/trx/trx0i_s.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0i_s.c
+@file trx/trx0i_s.cc
 INFORMATION SCHEMA innodb_trx, innodb_locks and
 innodb_lock_waits tables fetch code.
 
@@ -168,9 +168,9 @@ struct trx_i_s_cache_struct {
 /** Number of hash cells in the cache storage */
 #define CACHE_STORAGE_HASH_CELLS	2048
 	ha_storage_t*	storage;	/*!< storage for external volatile
-					data that can possibly not be
-					available later, when we release
-					the kernel mutex */
+					data that may become unavailable
+					when we release
+					lock_sys->mutex or trx_sys->mutex */
 	ulint		mem_allocd;	/*!< the amount of memory
 					allocated with mem_alloc*() */
 	ibool		is_truncated;	/*!< this is TRUE if the memory
@@ -472,7 +472,7 @@ fill_trx_row(
 	size_t		stmt_len;
 	const char*	s;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	row->trx_id = trx->id;
 	row->trx_started = (ib_time_t) trx->start_time;
@@ -481,9 +481,10 @@ fill_trx_row(
 	ut_ad(requested_lock_row == NULL
 	      || i_s_locks_row_validate(requested_lock_row));
 
-	if (trx->wait_lock != NULL) {
+	if (trx->lock.wait_lock != NULL) {
+
 		ut_a(requested_lock_row != NULL);
-		row->trx_wait_started = (ib_time_t) trx->wait_started;
+		row->trx_wait_started = (ib_time_t) trx->lock.wait_started;
 	} else {
 		ut_a(requested_lock_row == NULL);
 		row->trx_wait_started = 0;
@@ -500,7 +501,9 @@ fill_trx_row(
 		goto thd_done;
 	}
 
-	row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd);
+	row->trx_mysql_thread_id = thd_get_thread_id(
+		static_cast<const THD*>(trx->mysql_thd));
+
 	stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len);
 
 	if (stmt != NULL) {
@@ -513,9 +516,10 @@ fill_trx_row(
 		memcpy(query, stmt, stmt_len);
 		query[stmt_len] = '\0';
 
-		row->trx_query = ha_storage_put_memlim(
-			cache->storage, query, stmt_len + 1,
-			MAX_ALLOWED_FOR_STORAGE(cache));
+		row->trx_query = static_cast<const char*>(
+			ha_storage_put_memlim(
+				cache->storage, query, stmt_len + 1,
+				MAX_ALLOWED_FOR_STORAGE(cache)));
 
 		row->trx_query_cs = innobase_get_charset(trx->mysql_thd);
 
@@ -549,11 +553,15 @@ thd_done:
 
 	row->trx_tables_locked = trx->mysql_n_tables_locked;
 
-	row->trx_lock_structs = UT_LIST_GET_LEN(trx->trx_locks);
+	/* These are protected by both trx->mutex or lock_sys->mutex,
+	or just lock_sys->mutex. For reading, it suffices to hold
+	lock_sys->mutex. */
 
-	row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock_heap);
+	row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks);
 
-	row->trx_rows_locked = lock_number_of_rows_locked(trx);
+	row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap);
+
+	row->trx_rows_locked = lock_number_of_rows_locked(&trx->lock);
 
 	row->trx_rows_modified = trx->undo_no;
 
@@ -601,6 +609,10 @@ thd_done:
 
 	row->trx_search_latch_timeout = trx->search_latch_timeout;
 
+	row->trx_is_read_only = trx->read_only;
+
+	row->trx_is_autocommit_non_locking = trx_is_autocommit_non_locking(trx);
+
 	return(TRUE);
 }
 
@@ -1128,25 +1140,25 @@ add_trx_relevant_locks_to_cache(
 					requested lock row, or NULL or
 					undefined */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_mutex_own());
 
 	/* If transaction is waiting we add the wait lock and all locks
 	from another transactions that are blocking the wait lock. */
-	if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+	if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
 
 		const lock_t*		curr_lock;
 		ulint			wait_lock_heap_no;
 		i_s_locks_row_t*	blocking_lock_row;
 		lock_queue_iterator_t	iter;
 
-		ut_a(trx->wait_lock != NULL);
+		ut_a(trx->lock.wait_lock != NULL);
 
 		wait_lock_heap_no
-			= wait_lock_get_heap_no(trx->wait_lock);
+			= wait_lock_get_heap_no(trx->lock.wait_lock);
 
 		/* add the requested lock */
 		*requested_lock_row
-			= add_lock_to_cache(cache, trx->wait_lock,
+			= add_lock_to_cache(cache, trx->lock.wait_lock,
 					    wait_lock_heap_no);
 
 		/* memory could not be allocated */
@@ -1158,17 +1170,18 @@ add_trx_relevant_locks_to_cache(
 		/* then iterate over the locks before the wait lock and
 		add the ones that are blocking it */
 
-		lock_queue_iterator_reset(&iter, trx->wait_lock,
+		lock_queue_iterator_reset(&iter, trx->lock.wait_lock,
 					  ULINT_UNDEFINED);
 
-		curr_lock = lock_queue_iterator_get_prev(&iter);
-		while (curr_lock != NULL) {
+		for (curr_lock = lock_queue_iterator_get_prev(&iter);
+		     curr_lock != NULL;
+		     curr_lock = lock_queue_iterator_get_prev(&iter)) {
 
-			if (lock_has_to_wait(trx->wait_lock,
+			if (lock_has_to_wait(trx->lock.wait_lock,
 					     curr_lock)) {
 
 				/* add the lock that is
-				blocking trx->wait_lock */
+				blocking trx->lock.wait_lock */
 				blocking_lock_row
 					= add_lock_to_cache(
 						cache, curr_lock,
@@ -1193,8 +1206,6 @@ add_trx_relevant_locks_to_cache(
 					return(FALSE);
 				}
 			}
-
-			curr_lock = lock_queue_iterator_get_prev(&iter);
 		}
 	} else {
 
@@ -1257,27 +1268,47 @@ Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
 table cache buffer. Cache must be locked for write. */
 static
 void
-fetch_data_into_cache(
-/*==================*/
-	trx_i_s_cache_t*	cache)	/*!< in/out: cache */
+fetch_data_into_cache_low(
+/*======================*/
+	trx_i_s_cache_t*	cache,		/*!< in/out: cache */
+	ibool			only_ac_nl,	/*!< in: only select non-locking
+						autocommit transactions */
+	trx_list_t*		trx_list)	/*!< in: trx list */
 {
-	trx_t*			trx;
-	i_s_trx_row_t*		trx_row;
-	i_s_locks_row_t*	requested_lock_row;
+	const trx_t*		trx;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(trx_list == &trx_sys->rw_trx_list
+	      || trx_list == &trx_sys->ro_trx_list
+	      || trx_list == &trx_sys->mysql_trx_list);
 
-	trx_i_s_cache_clear(cache);
+	ut_ad(only_ac_nl == (trx_list == &trx_sys->mysql_trx_list));
 
-	/* We iterate over the list of all transactions and add each one
+	/* Iterate over the transaction list and add each one
 	to innodb_trx's cache. We also add all locks that are relevant
 	to each transaction into innodb_locks' and innodb_lock_waits'
 	caches. */
 
-	for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	for (trx = UT_LIST_GET_FIRST(*trx_list);
 	     trx != NULL;
 	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
 
+		i_s_trx_row_t*		trx_row;
+		i_s_locks_row_t*	requested_lock_row;
+
+		if (trx->state == TRX_STATE_NOT_STARTED
+		    || (only_ac_nl && !trx_is_autocommit_non_locking(trx))) {
+
+			continue;
+		}
+
+		assert_trx_nonlocking_or_in_list(trx);
+
+		ut_ad(trx->in_ro_trx_list
+		      == (trx_list == &trx_sys->ro_trx_list));
+
+		ut_ad(trx->in_rw_trx_list
+		      == (trx_list == &trx_sys->rw_trx_list));
+
 		if (!add_trx_relevant_locks_to_cache(cache, trx,
 						     &requested_lock_row)) {
 
@@ -1304,6 +1335,28 @@ fetch_data_into_cache(
 			return;
 		}
 	}
+}
+
+/*******************************************************************//**
+Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
+table cache buffer. Cache must be locked for write. */
+static
+void
+fetch_data_into_cache(
+/*==================*/
+	trx_i_s_cache_t*	cache)	/*!< in/out: cache */
+{
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	trx_i_s_cache_clear(cache);
+
+	fetch_data_into_cache_low(cache, FALSE, &trx_sys->rw_trx_list);
+	fetch_data_into_cache_low(cache, FALSE, &trx_sys->ro_trx_list);
+
+	/* Only select autocommit non-locking selects because they can
+	only be on the MySQL transaction list (TRUE). */
+	fetch_data_into_cache_low(cache, TRUE, &trx_sys->mysql_trx_list);
 
 	cache->is_truncated = FALSE;
 }
@@ -1330,11 +1383,16 @@ trx_i_s_possibly_fetch_data_into_cache(
 	}
 
 	/* We need to read trx_sys and record/table lock queues */
-	mutex_enter(&kernel_mutex);
+
+	lock_mutex_enter();
+
+	mutex_enter(&trx_sys->mutex);
 
 	fetch_data_into_cache(cache);
 
-	mutex_exit(&kernel_mutex);
+	mutex_exit(&trx_sys->mutex);
+
+	lock_mutex_exit();
 
 	/* update cache last read time */
 	now = ut_time_us(NULL);
@@ -1366,8 +1424,8 @@ trx_i_s_cache_init(
 {
 	/* The latching is done in the following order:
 	acquire trx_i_s_cache_t::rw_lock, X
-	acquire kernel_mutex
-	release kernel_mutex
+	acquire lock mutex
+	release lock mutex
 	release trx_i_s_cache_t::rw_lock
 	acquire trx_i_s_cache_t::rw_lock, S
 	release trx_i_s_cache_t::rw_lock */
@@ -1579,7 +1637,7 @@ trx_i_s_create_lock_id(
 	} else {
 		/* table lock */
 		res_len = ut_snprintf(lock_id, lock_id_size,
-				      TRX_ID_FMT ":%llu",
+				      TRX_ID_FMT":"UINT64PF,
 				      row->lock_trx_id,
 				      row->lock_table_id);
 	}
diff --git a/storage/innobase/trx/trx0purge.c b/storage/innobase/trx/trx0purge.cc
index 96f01ea81b5..62c140879aa 100644
--- a/storage/innobase/trx/trx0purge.c
+++ b/storage/innobase/trx/trx0purge.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0purge.c
+@file trx/trx0purge.cc
 Purge old versions
 
 Created 3/26/1996 Heikki Tuuri
@@ -31,7 +31,6 @@ Created 3/26/1996 Heikki Tuuri
 
 #include "fsp0fsp.h"
 #include "mach0data.h"
-#include "mtr0log.h"
 #include "trx0rseg.h"
 #include "trx0trx.h"
 #include "trx0roll.h"
@@ -42,7 +41,16 @@ Created 3/26/1996 Heikki Tuuri
 #include "row0upd.h"
 #include "trx0rec.h"
 #include "srv0srv.h"
+#include "srv0start.h"
 #include "os0thread.h"
+#include "srv0mon.h"
+#include "mtr0log.h"
+
+/** Maximum allowable purge history length.  <=0 means 'infinite'. */
+UNIV_INTERN ulong		srv_max_purge_lag = 0;
+
+/** Max DML user threads delay in micro-seconds. */
+UNIV_INTERN ulong		srv_max_purge_lag_delay = 0;
 
 /** The global data structure coordinating a purge */
 UNIV_INTERN trx_purge_t*	purge_sys = NULL;
@@ -61,129 +69,19 @@ UNIV_INTERN mysql_pfs_key_t	trx_purge_latch_key;
 UNIV_INTERN mysql_pfs_key_t	purge_sys_bh_mutex_key;
 #endif /* UNIV_PFS_MUTEX */
 
-/*****************************************************************//**
-Checks if trx_id is >= purge_view: then it is guaranteed that its update
-undo log still exists in the system.
-@return TRUE if is sure that it is preserved, also if the function
-returns FALSE, it is possible that the undo log still exists in the
-system */
-UNIV_INTERN
-ibool
-trx_purge_update_undo_must_exist(
-/*=============================*/
-	trx_id_t	trx_id)	/*!< in: transaction id */
-{
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
-
-	if (!read_view_sees_trx_id(purge_sys->view, trx_id)) {
-
-		return(TRUE);
-	}
-
-	return(FALSE);
-}
-
-/*=================== PURGE RECORD ARRAY =============================*/
-
-/*******************************************************************//**
-Stores info of an undo log record during a purge.
-@return	pointer to the storage cell */
+/********************************************************************//**
+Fetches the next undo log record from the history list to purge. It must be
+released with the corresponding release function.
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can skipped in purge; NULL if none left */
 static
-trx_undo_inf_t*
-trx_purge_arr_store_info(
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
 /*=====================*/
-	trx_id_t	trx_no,	/*!< in: transaction number */
-	undo_no_t	undo_no)/*!< in: undo number */
-{
-	trx_undo_inf_t*	cell;
-	trx_undo_arr_t*	arr;
-	ulint		i;
-
-	arr = purge_sys->arr;
-
-	for (i = 0;; i++) {
-		cell = trx_undo_arr_get_nth_info(arr, i);
-
-		if (!(cell->in_use)) {
-			/* Not in use, we may store here */
-			cell->undo_no = undo_no;
-			cell->trx_no = trx_no;
-			cell->in_use = TRUE;
-
-			arr->n_used++;
-
-			return(cell);
-		}
-	}
-}
-
-/*******************************************************************//**
-Removes info of an undo log record during a purge. */
-UNIV_INLINE
-void
-trx_purge_arr_remove_info(
-/*======================*/
-	trx_undo_inf_t*	cell)	/*!< in: pointer to the storage cell */
-{
-	trx_undo_arr_t*	arr;
-
-	arr = purge_sys->arr;
-
-	cell->in_use = FALSE;
-
-	ut_ad(arr->n_used > 0);
-
-	arr->n_used--;
-}
-
-/*******************************************************************//**
-Gets the biggest pair of a trx number and an undo number in a purge array. */
-static
-void
-trx_purge_arr_get_biggest(
-/*======================*/
-	trx_undo_arr_t*	arr,	/*!< in: purge array */
-	trx_id_t*	trx_no,	/*!< out: transaction number: 0
-				if array is empty */
-	undo_no_t*	undo_no)/*!< out: undo number */
-{
-	trx_undo_inf_t*	cell;
-	trx_id_t	pair_trx_no;
-	undo_no_t	pair_undo_no;
-	ulint		i;
-	ulint		n;
-
-	n = arr->n_used;
-	pair_trx_no = 0;
-	pair_undo_no = 0;
-
-	if (n) {
-		for (i = 0;; i++) {
-			cell = trx_undo_arr_get_nth_info(arr, i);
-
-			if (!cell->in_use) {
-				continue;
-			}
-
-			if ((cell->trx_no > pair_trx_no)
-			    || ((cell->trx_no == pair_trx_no)
-				&& cell->undo_no >= pair_undo_no)) {
-
-				pair_trx_no = cell->trx_no;
-				pair_undo_no = cell->undo_no;
-			}
-
-			if (!--n) {
-				break;
-			}
-		}
-	}
-
-	*trx_no = pair_trx_no;
-	*undo_no = pair_undo_no;
-}
+	roll_ptr_t*	roll_ptr,	/*!< out: roll pointer to undo record */
+	ulint*		n_pages_handled,/*!< in/out: number of UNDO log pages
+					handled */
+	mem_heap_t*	heap);		/*!< in: memory heap where copied */
 
 /****************************************************************//**
 Builds a purge 'query' graph. The actual purge is performed by executing
@@ -191,25 +89,27 @@ this query graph.
 @return	own: the query graph */
 static
 que_t*
-trx_purge_graph_build(void)
-/*=======================*/
+trx_purge_graph_build(
+/*==================*/
+	trx_t*		trx,			/*!< in: transaction */
+	ulint		n_purge_threads)	/*!< in: number of purge
+						threads */
 {
+	ulint		i;
 	mem_heap_t*	heap;
 	que_fork_t*	fork;
-	que_thr_t*	thr;
-	/*	que_thr_t*	thr2; */
 
 	heap = mem_heap_create(512);
 	fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, heap);
-	fork->trx = purge_sys->trx;
+	fork->trx = trx;
 
-	thr = que_thr_create(fork, heap);
+	for (i = 0; i < n_purge_threads; ++i) {
+		que_thr_t*	thr;
 
-	thr->child = row_purge_node_create(thr, heap);
+		thr = que_thr_create(fork, heap);
 
-	/*	thr2 = que_thr_create(fork, fork, heap);
-
-	thr2->child = row_purge_node_create(fork, thr2, heap);	 */
+		thr->child = row_purge_node_create(thr, heap);
+	}
 
 	return(fork);
 }
@@ -221,21 +121,18 @@ UNIV_INTERN
 void
 trx_purge_sys_create(
 /*=================*/
-	ib_bh_t*	ib_bh)	/*!< in, own: UNDO log min binary heap */
+	ulint		n_purge_threads,	/*!< in: number of purge
+						threads */
+	ib_bh_t*	ib_bh)			/*!< in, own: UNDO log min
+						binary heap */
 {
-	ut_ad(mutex_own(&kernel_mutex));
+	purge_sys = static_cast<trx_purge_t*>(mem_zalloc(sizeof(*purge_sys)));
 
-	purge_sys = mem_zalloc(sizeof(trx_purge_t));
+	purge_sys->state = PURGE_STATE_INIT;
+	purge_sys->event = os_event_create("purge");
 
 	/* Take ownership of ib_bh, we are responsible for freeing it. */
 	purge_sys->ib_bh = ib_bh;
-	purge_sys->state = TRX_STOP_PURGE;
-
-	purge_sys->n_pages_handled = 0;
-
-	purge_sys->purge_trx_no = 0;
-	purge_sys->purge_undo_no = 0;
-	purge_sys->next_stored = FALSE;
 
 	rw_lock_create(trx_purge_latch_key,
 		       &purge_sys->latch, SYNC_PURGE_LATCH);
@@ -246,20 +143,26 @@ trx_purge_sys_create(
 
 	purge_sys->heap = mem_heap_create(256);
 
-	purge_sys->arr = trx_undo_arr_create();
+	ut_a(n_purge_threads > 0);
 
 	purge_sys->sess = sess_open();
 
 	purge_sys->trx = purge_sys->sess->trx;
 
-	purge_sys->trx->is_purge = 1;
+	ut_a(purge_sys->trx->sess == purge_sys->sess);
 
-	ut_a(trx_start_low(purge_sys->trx, ULINT_UNDEFINED));
+	/* A purge transaction is not a real transaction, we use a transaction
+	here only because the query threads code requires it. It is otherwise
+	quite unnecessary. We should get rid of it eventually. */
+	purge_sys->trx->id = 0;
+	purge_sys->trx->start_time = ut_time();
+	purge_sys->trx->state = TRX_STATE_ACTIVE;
+	purge_sys->trx->op_info = "purge trx";
 
-	purge_sys->query = trx_purge_graph_build();
+	purge_sys->query = trx_purge_graph_build(
+		purge_sys->trx, n_purge_threads);
 
-	purge_sys->view = read_view_oldest_copy_or_open_new(0,
-							    purge_sys->heap);
+	purge_sys->view = read_view_purge_open(purge_sys->heap);
 }
 
 /************************************************************************
@@ -269,27 +172,18 @@ void
 trx_purge_sys_close(void)
 /*======================*/
 {
-	ut_ad(!mutex_own(&kernel_mutex));
-
 	que_graph_free(purge_sys->query);
 
-	ut_a(purge_sys->sess->trx->is_purge);
-	purge_sys->sess->trx->conc_state = TRX_NOT_STARTED;
-	sess_close(purge_sys->sess);
-	purge_sys->sess = NULL;
+	ut_a(purge_sys->trx->id == 0);
+	ut_a(purge_sys->sess->trx == purge_sys->trx);
 
-	if (purge_sys->view != NULL) {
-		/* Because acquiring the kernel mutex is a pre-condition
-		of read_view_close(). We don't really need it here. */
-		mutex_enter(&kernel_mutex);
+	purge_sys->trx->state = TRX_STATE_NOT_STARTED;
 
-		read_view_close(purge_sys->view);
-		purge_sys->view = NULL;
+	sess_close(purge_sys->sess);
 
-		mutex_exit(&kernel_mutex);
-	}
+	purge_sys->sess = NULL;
 
-	trx_undo_arr_free(purge_sys->arr);
+	purge_sys->view = NULL;
 
 	rw_lock_free(&purge_sys->latch);
 	mutex_free(&purge_sys->bh_mutex);
@@ -298,6 +192,10 @@ trx_purge_sys_close(void)
 
 	ib_bh_free(purge_sys->ib_bh);
 
+	os_event_free(purge_sys->event);
+
+	purge_sys->event = NULL;
+
 	mem_free(purge_sys);
 
 	purge_sys = NULL;
@@ -318,21 +216,18 @@ trx_purge_add_update_undo_to_history(
 	mtr_t*	mtr)		/*!< in: mtr */
 {
 	trx_undo_t*	undo;
+	trx_rseg_t*	rseg;
 	trx_rsegf_t*	rseg_header;
 	trx_ulogf_t*	undo_header;
 
 	undo = trx->update_undo;
-
-	ut_ad(undo);
-
-	ut_ad(mutex_own(&undo->rseg->mutex));
+	rseg = undo->rseg;
 
 	rseg_header = trx_rsegf_get(
 		undo->rseg->space, undo->rseg->zip_size, undo->rseg->page_no,
 		mtr);
 
 	undo_header = undo_page + undo->hdr_offset;
-	/* Add the log as the first in the history list */
 
 	if (undo->state != TRX_UNDO_CACHED) {
 		ulint		hist_size;
@@ -351,6 +246,8 @@ trx_purge_add_update_undo_to_history(
 
 		trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr);
 
+		MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+
 		hist_size = mtr_read_ulint(
 			rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr);
 
@@ -362,39 +259,35 @@ trx_purge_add_update_undo_to_history(
 			hist_size + undo->size, MLOG_4BYTES, mtr);
 	}
 
-	flst_add_first(
-		rseg_header + TRX_RSEG_HISTORY,
-		undo_header + TRX_UNDO_HISTORY_NODE, mtr);
+	/* Add the log as the first in the history list */
+	flst_add_first(rseg_header + TRX_RSEG_HISTORY,
+		       undo_header + TRX_UNDO_HISTORY_NODE, mtr);
 
-	/* Write the trx number to the undo log header */
+#ifdef HAVE_ATOMIC_BUILTINS
+	os_atomic_increment_ulint(&trx_sys->rseg_history_len, 1);
+#else
+	mutex_enter(&trx_sys->mutex);
+	++trx_sys->rseg_history_len;
+	mutex_exit(&trx_sys->mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
 
+	srv_wake_purge_thread_if_not_active();
+
+	/* Write the trx number to the undo log header */
 	mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);
 
 	/* Write information about delete markings to the undo log header */
 
 	if (!undo->del_marks) {
-		mlog_write_ulint(
-			undo_header + TRX_UNDO_DEL_MARKS, FALSE,
-			MLOG_2BYTES, mtr);
+		mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE,
+				 MLOG_2BYTES, mtr);
 	}
 
-	if (undo->rseg->last_page_no == FIL_NULL) {
-		undo->rseg->last_trx_no = trx->no;
-		undo->rseg->last_offset = undo->hdr_offset;
-		undo->rseg->last_page_no = undo->hdr_page_no;
-		undo->rseg->last_del_marks = undo->del_marks;
-
-		/* FIXME: Add a bin heap validate function to check that
-		the rseg exists. */
-	}
-
-	mutex_enter(&kernel_mutex);
-	trx_sys->rseg_history_len++;
-	mutex_exit(&kernel_mutex);
-
-	if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) {
-		/* Inform the purge thread that there is work to do. */
-		srv_wake_purge_thread_if_not_active();
+	if (rseg->last_page_no == FIL_NULL) {
+		rseg->last_page_no = undo->hdr_page_no;
+		rseg->last_offset = undo->hdr_offset;
+		rseg->last_trx_no = trx->no;
+		rseg->last_del_marks = undo->del_marks;
 	}
 }
 
@@ -411,49 +304,55 @@ trx_purge_free_segment(
 					will cut off from the end of the
 					history list */
 {
-	page_t*		undo_page;
+	mtr_t		mtr;
 	trx_rsegf_t*	rseg_hdr;
 	trx_ulogf_t*	log_hdr;
 	trx_usegf_t*	seg_hdr;
-	ibool		freed;
 	ulint		seg_size;
 	ulint		hist_size;
 	ibool		marked		= FALSE;
-	mtr_t		mtr;
 
 	/*	fputs("Freeing an update undo log segment\n", stderr); */
 
-loop:
-	mtr_start(&mtr);
-	mutex_enter(&(rseg->mutex));
+	for (;;) {
+		page_t*	undo_page;
 
-	rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
-				 rseg->page_no, &mtr);
+		mtr_start(&mtr);
 
-	undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
-				      hdr_addr.page, &mtr);
-	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
-	log_hdr = undo_page + hdr_addr.boffset;
+		mutex_enter(&rseg->mutex);
 
-	/* Mark the last undo log totally purged, so that if the system
-	crashes, the tail of the undo log will not get accessed again. The
-	list of pages in the undo log tail gets inconsistent during the
-	freeing of the segment, and therefore purge should not try to access
-	them again. */
+		rseg_hdr = trx_rsegf_get(
+			rseg->space, rseg->zip_size, rseg->page_no, &mtr);
 
-	if (!marked) {
-		mlog_write_ulint(log_hdr + TRX_UNDO_DEL_MARKS, FALSE,
-				 MLOG_2BYTES, &mtr);
-		marked = TRUE;
-	}
+		undo_page = trx_undo_page_get(
+			rseg->space, rseg->zip_size, hdr_addr.page, &mtr);
 
-	freed = fseg_free_step_not_header(seg_hdr + TRX_UNDO_FSEG_HEADER,
-					  &mtr);
-	if (!freed) {
-		mutex_exit(&(rseg->mutex));
-		mtr_commit(&mtr);
+		seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+		log_hdr = undo_page + hdr_addr.boffset;
 
-		goto loop;
+		/* Mark the last undo log totally purged, so that if the
+		system crashes, the tail of the undo log will not get accessed
+		again. The list of pages in the undo log tail gets inconsistent
+		during the freeing of the segment, and therefore purge should
+		not try to access them again. */
+
+		if (!marked) {
+			mlog_write_ulint(
+				log_hdr + TRX_UNDO_DEL_MARKS, FALSE,
+				MLOG_2BYTES, &mtr);
+
+			marked = TRUE;
+		}
+
+		if (fseg_free_step_not_header(
+			seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr)) {
+
+			break;
+		}
+
+		mutex_exit(&rseg->mutex);
+
+		mtr_commit(&mtr);
 	}
 
 	/* The page list may now be inconsistent, but the length field
@@ -470,22 +369,22 @@ loop:
 	flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY,
 		     log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr);
 
-	mutex_enter(&kernel_mutex);
-	ut_ad(trx_sys->rseg_history_len >= n_removed_logs);
+#ifdef HAVE_ATOMIC_BUILTINS
+	os_atomic_decrement_ulint(&trx_sys->rseg_history_len, n_removed_logs);
+#else
+	mutex_enter(&trx_sys->mutex);
 	trx_sys->rseg_history_len -= n_removed_logs;
-	mutex_exit(&kernel_mutex);
+	mutex_exit(&trx_sys->mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
 
-	freed = FALSE;
+	do {
 
-	while (!freed) {
 		/* Here we assume that a file segment with just the header
 		page can be freed in a few steps, so that the buffer pool
 		is not flooded with bufferfixed pages: see the note in
-		fsp0fsp.c. */
+		fsp0fsp.cc. */
 
-		freed = fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER,
-				       &mtr);
-	}
+	} while(!fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr));
 
 	hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
 				   MLOG_4BYTES, &mtr);
@@ -509,12 +408,8 @@ static
 void
 trx_purge_truncate_rseg_history(
 /*============================*/
-	trx_rseg_t*	rseg,		/*!< in: rollback segment */
-	trx_id_t	limit_trx_no,	/*!< in: remove update undo logs whose
-					trx number is < limit_trx_no */
-	undo_no_t	limit_undo_no)	/*!< in: if transaction number is equal
-					to limit_trx_no, truncate undo records
-					with undo number < limit_undo_no */
+	trx_rseg_t*		rseg,		/*!< in: rollback segment */
+	const purge_iter_t*	limit)		/*!< in: truncate offset */
 {
 	fil_addr_t	hdr_addr;
 	fil_addr_t	prev_hdr_addr;
@@ -548,20 +443,26 @@ loop:
 				      hdr_addr.page, &mtr);
 
 	log_hdr = undo_page + hdr_addr.boffset;
+
 	undo_trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
 
-	if (undo_trx_no >= limit_trx_no) {
-		if (undo_trx_no == limit_trx_no) {
-			trx_undo_truncate_start(rseg, rseg->space,
-						hdr_addr.page,
-						hdr_addr.boffset,
-						limit_undo_no);
+	if (undo_trx_no >= limit->trx_no) {
+
+		if (undo_trx_no == limit->trx_no) {
+
+			trx_undo_truncate_start(
+				rseg, rseg->space, hdr_addr.page,
+				hdr_addr.boffset, limit->undo_no);
 		}
 
-		mutex_enter(&kernel_mutex);
-		ut_a(trx_sys->rseg_history_len >= n_removed_logs);
+#ifdef HAVE_ATOMIC_BUILTINS
+		os_atomic_decrement_ulint(
+			&trx_sys->rseg_history_len, n_removed_logs);
+#else
+		mutex_enter(&trx_sys->mutex);
 		trx_sys->rseg_history_len -= n_removed_logs;
-		mutex_exit(&kernel_mutex);
+		mutex_exit(&trx_sys->mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
 
 		flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY,
 				  log_hdr + TRX_UNDO_HISTORY_NODE,
@@ -611,56 +512,33 @@ Removes unnecessary history data from rollback segments. NOTE that when this
 function is called, the caller must not have any latches on undo log pages! */
 static
 void
-trx_purge_truncate_history(void)
-/*============================*/
+trx_purge_truncate_history(
+/*========================*/
+	purge_iter_t*		limit,		/*!< in: truncate limit */
+	const read_view_t*	view)		/*!< in: purge view */
 {
-	trx_rseg_t*	rseg;
-	trx_id_t	limit_trx_no;
-	undo_no_t	limit_undo_no;
-
-	trx_purge_arr_get_biggest(
-		purge_sys->arr, &limit_trx_no, &limit_undo_no);
-
-	if (limit_trx_no == 0) {
-
-		limit_trx_no = purge_sys->purge_trx_no;
-		limit_undo_no = purge_sys->purge_undo_no;
-	}
+	ulint		i;
 
 	/* We play safe and set the truncate limit at most to the purge view
 	low_limit number, though this is not necessary */
 
-	if (limit_trx_no >= purge_sys->view->low_limit_no) {
-		limit_trx_no = purge_sys->view->low_limit_no;
-		limit_undo_no = 0;
+	if (limit->trx_no >= view->low_limit_no) {
+		limit->trx_no = view->low_limit_no;
+		limit->undo_no = 0;
 	}
 
-	ut_ad(limit_trx_no <= purge_sys->view->low_limit_no);
+	ut_ad(limit->trx_no <= purge_sys->view->low_limit_no);
 
-	for (rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
-	     rseg != NULL;
-	     rseg = UT_LIST_GET_NEXT(rseg_list, rseg)) {
+	for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+		trx_rseg_t*	rseg = trx_sys->rseg_array[i];
 
-		trx_purge_truncate_rseg_history(
-			rseg, limit_trx_no, limit_undo_no);
+		if (rseg != NULL) {
+			ut_a(rseg->id == i);
+			trx_purge_truncate_rseg_history(rseg, limit);
+		}
 	}
 }
 
-/********************************************************************//**
-Does a truncate if the purge array is empty. NOTE that when this function is
-called, the caller must not have any latches on undo log pages! */
-UNIV_INLINE
-void
-trx_purge_truncate_if_arr_empty(void)
-/*=================================*/
-{
-	static ulint	count;
-
-	if (!(++count % TRX_SYS_N_RSEGS) && purge_sys->arr->n_used == 0) {
-
-		trx_purge_truncate_history();
-	}
-}
 
 /***********************************************************************//**
 Updates the last not yet purged history log info in rseg when we have purged
@@ -669,8 +547,11 @@ static
 void
 trx_purge_rseg_get_next_history_log(
 /*================================*/
-	trx_rseg_t*	rseg)	/*!< in: rollback segment */
+	trx_rseg_t*	rseg,		/*!< in: rollback segment */
+	ulint*		n_pages_handled)/*!< in/out: number of UNDO pages
+					handled */
 {
+	const void*	ptr;
 	page_t*		undo_page;
 	trx_ulogf_t*	log_hdr;
 	fil_addr_t	prev_log_addr;
@@ -678,14 +559,13 @@ trx_purge_rseg_get_next_history_log(
 	ibool		del_marks;
 	mtr_t		mtr;
 	rseg_queue_t	rseg_queue;
-	const void*	ptr;
 
 	mutex_enter(&(rseg->mutex));
 
 	ut_a(rseg->last_page_no != FIL_NULL);
 
-	purge_sys->purge_trx_no = rseg->last_trx_no + 1;
-	purge_sys->purge_undo_no = 0;
+	purge_sys->iter.trx_no = rseg->last_trx_no + 1;
+	purge_sys->iter.undo_no = 0;
 	purge_sys->next_stored = FALSE;
 
 	mtr_start(&mtr);
@@ -697,7 +577,7 @@ trx_purge_rseg_get_next_history_log(
 
 	/* Increase the purge page count by one for every handled log */
 
-	purge_sys->n_pages_handled++;
+	(*n_pages_handled)++;
 
 	prev_log_addr = trx_purge_get_log_from_hist(
 		flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
@@ -710,10 +590,10 @@ trx_purge_rseg_get_next_history_log(
 		mutex_exit(&(rseg->mutex));
 		mtr_commit(&mtr);
 
-		mutex_enter(&kernel_mutex);
+		mutex_enter(&trx_sys->mutex);
 
 		/* Add debug code to track history list corruption reported
-		on the MySQL mailing list on Nov 9, 2004. The fut0lst.c
+		on the MySQL mailing list on Nov 9, 2004. The fut0lst.cc
 		file-based list was corrupt. The prev node pointer was
 		FIL_NULL, even though the list length was over 8 million nodes!
 		We assume that purge truncates the history list in large
@@ -733,12 +613,13 @@ trx_purge_rseg_get_next_history_log(
 			ut_ad(0);
 		}
 
-		mutex_exit(&kernel_mutex);
+		mutex_exit(&trx_sys->mutex);
 
 		return;
 	}
 
-	mutex_exit(&(rseg->mutex));
+	mutex_exit(&rseg->mutex);
+
 	mtr_commit(&mtr);
 
 	/* Read the trx number and del marks from the previous log header */
@@ -776,7 +657,7 @@ trx_purge_rseg_get_next_history_log(
 
 	mutex_exit(&purge_sys->bh_mutex);
 
-	mutex_exit(&(rseg->mutex));
+	mutex_exit(&rseg->mutex);
 }
 
 /***********************************************************************//**
@@ -820,18 +701,16 @@ trx_purge_get_rseg_with_min_trx_id(
 
 	ut_a(purge_sys->rseg->last_page_no != FIL_NULL);
 
-	/* We assume in purge of externally stored fields
-	that space id == 0 */
-	ut_a(purge_sys->rseg->space == 0);
+	/* We assume in purge of externally stored fields that space id is
+	in the range of UNDO tablespace space ids */
+	ut_a(purge_sys->rseg->space <= srv_undo_tablespaces);
 
 	zip_size = purge_sys->rseg->zip_size;
 
-	ut_a(purge_sys->purge_trx_no <= purge_sys->rseg->last_trx_no);
-
-	purge_sys->purge_trx_no = purge_sys->rseg->last_trx_no;
+	ut_a(purge_sys->iter.trx_no <= purge_sys->rseg->last_trx_no);
 
+	purge_sys->iter.trx_no = purge_sys->rseg->last_trx_no;
 	purge_sys->hdr_offset = purge_sys->rseg->last_offset;
-
 	purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
 
 	mutex_exit(&purge_sys->rseg->mutex);
@@ -848,21 +727,22 @@ trx_purge_read_undo_rec(
 	trx_purge_t*	purge_sys,		/*!< in/out: purge instance */
 	ulint		zip_size)		/*!< in: block size or 0 */
 {
+	ulint		offset;
 	ulint		page_no;
-	ulint		offset = 0;
-	ib_uint64_t	undo_no = 0;
+	ib_uint64_t	undo_no;
 
 	purge_sys->hdr_offset = purge_sys->rseg->last_offset;
 	page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
 
 	if (purge_sys->rseg->last_del_marks) {
 		mtr_t		mtr;
-		trx_undo_rec_t*	undo_rec;
+		trx_undo_rec_t*	undo_rec = NULL;
 
 		mtr_start(&mtr);
 
 		undo_rec = trx_undo_get_first_rec(
-			0 /* System space id */, zip_size,
+			purge_sys->rseg->space,
+			zip_size,
 			purge_sys->hdr_page_no,
 			purge_sys->hdr_offset, RW_S_LATCH, &mtr);
 
@@ -870,14 +750,20 @@ trx_purge_read_undo_rec(
 			offset = page_offset(undo_rec);
 			undo_no = trx_undo_rec_get_undo_no(undo_rec);
 			page_no = page_get_page_no(page_align(undo_rec));
+		} else {
+			offset = 0;
+			undo_no = 0;
 		}
 
 		mtr_commit(&mtr);
+	} else {
+		offset = 0;
+		undo_no = 0;
 	}
 
 	purge_sys->offset = offset;
 	purge_sys->page_no = page_no;
-	purge_sys->purge_undo_no = undo_no;
+	purge_sys->iter.undo_no = undo_no;
 
 	purge_sys->next_stored = TRUE;
 }
@@ -899,7 +785,6 @@ trx_purge_choose_next_log(void)
 	zip_size = trx_purge_get_rseg_with_min_trx_id(purge_sys);
 
 	if (purge_sys->rseg != NULL) {
-
 		trx_purge_read_undo_rec(purge_sys, zip_size);
 	} else {
 		/* There is nothing to do yet. */
@@ -914,23 +799,23 @@ static
 trx_undo_rec_t*
 trx_purge_get_next_rec(
 /*===================*/
-	mem_heap_t*	heap)	/*!< in: memory heap where copied */
+	ulint*		n_pages_handled,/*!< in/out: number of UNDO pages
+					handled */
+	mem_heap_t*	heap)		/*!< in: memory heap where copied */
 {
 	trx_undo_rec_t*	rec;
 	trx_undo_rec_t*	rec_copy;
 	trx_undo_rec_t*	rec2;
-	trx_undo_rec_t*	next_rec;
 	page_t*		undo_page;
 	page_t*		page;
 	ulint		offset;
 	ulint		page_no;
 	ulint		space;
 	ulint		zip_size;
-	ulint		type;
-	ulint		cmpl_info;
 	mtr_t		mtr;
 
 	ut_ad(purge_sys->next_stored);
+	ut_ad(purge_sys->iter.trx_no < purge_sys->view->low_limit_no);
 
 	space = purge_sys->rseg->space;
 	zip_size = purge_sys->rseg->zip_size;
@@ -941,7 +826,8 @@ trx_purge_get_next_rec(
 		/* It is the dummy undo log record, which means that there is
 		no need to purge this undo log */
 
-		trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+		trx_purge_rseg_get_next_history_log(
+			purge_sys->rseg, n_pages_handled);
 
 		/* Look for the next undo log and record to purge */
 
@@ -959,6 +845,10 @@ trx_purge_get_next_rec(
 	rec2 = rec;
 
 	for (;;) {
+		ulint		type;
+		trx_undo_rec_t*	next_rec;
+		ulint		cmpl_info;
+
 		/* Try first to find the next record which requires a purge
 		operation from the same page of the same undo log */
 
@@ -996,7 +886,8 @@ trx_purge_get_next_rec(
 	if (rec2 == NULL) {
 		mtr_commit(&mtr);
 
-		trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+		trx_purge_rseg_get_next_history_log(
+			purge_sys->rseg, n_pages_handled);
 
 		/* Look for the next undo log and record to purge */
 
@@ -1004,20 +895,20 @@ trx_purge_get_next_rec(
 
 		mtr_start(&mtr);
 
-		undo_page = trx_undo_page_get_s_latched(space, zip_size,
-							page_no, &mtr);
+		undo_page = trx_undo_page_get_s_latched(
+			space, zip_size, page_no, &mtr);
 
 		rec = undo_page + offset;
 	} else {
 		page = page_align(rec2);
 
-		purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec2);
-		purge_sys->page_no = page_get_page_no(page);
 		purge_sys->offset = rec2 - page;
+		purge_sys->page_no = page_get_page_no(page);
+		purge_sys->iter.undo_no = trx_undo_rec_get_undo_no(rec2);
 
 		if (undo_page != page) {
 			/* We advance to a new page of the undo log: */
-			purge_sys->n_pages_handled++;
+			(*n_pages_handled)++;
 		}
 	}
 
@@ -1033,88 +924,262 @@ Fetches the next undo log record from the history list to purge. It must be
 released with the corresponding release function.
 @return copy of an undo log record or pointer to trx_purge_dummy_rec,
 if the whole undo log can skipped in purge; NULL if none left */
-UNIV_INTERN
+static
 trx_undo_rec_t*
 trx_purge_fetch_next_rec(
 /*=====================*/
-	roll_ptr_t*	roll_ptr,/*!< out: roll pointer to undo record */
-	trx_undo_inf_t** cell,	/*!< out: storage cell for the record in the
-				purge array */
-	mem_heap_t*	heap)	/*!< in: memory heap where copied */
+	roll_ptr_t*	roll_ptr,	/*!< out: roll pointer to undo record */
+	ulint*		n_pages_handled,/*!< in/out: number of UNDO log pages
+					handled */
+	mem_heap_t*	heap)		/*!< in: memory heap where copied */
 {
-	trx_undo_rec_t*	undo_rec;
-
-
-	if (purge_sys->state == TRX_STOP_PURGE) {
-		trx_purge_truncate_if_arr_empty();
-
-		return(NULL);
-	} else if (!purge_sys->next_stored) {
+	if (!purge_sys->next_stored) {
 		trx_purge_choose_next_log();
 
 		if (!purge_sys->next_stored) {
-			purge_sys->state = TRX_STOP_PURGE;
-
-			trx_purge_truncate_if_arr_empty();
 
 			if (srv_print_thread_releases) {
 				fprintf(stderr,
 					"Purge: No logs left in the"
-					" history list; pages handled %lu\n",
-					(ulong) purge_sys->n_pages_handled);
+					" history list\n");
 			}
 
 			return(NULL);
 		}
 	}
 
-	if (purge_sys->n_pages_handled >= purge_sys->handle_limit) {
+	if (purge_sys->iter.trx_no >= purge_sys->view->low_limit_no) {
 
-		purge_sys->state = TRX_STOP_PURGE;
+		return(NULL);
+	}
 
-		trx_purge_truncate_if_arr_empty();
+	/* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
+	os_thread_get_curr_id(), iter->trx_no, iter->undo_no); */
 
-		return(NULL);
-	} else if (purge_sys->purge_trx_no >= purge_sys->view->low_limit_no) {
-		purge_sys->state = TRX_STOP_PURGE;
+	*roll_ptr = trx_undo_build_roll_ptr(
+		FALSE, purge_sys->rseg->id,
+		purge_sys->page_no, purge_sys->offset);
 
-		trx_purge_truncate_if_arr_empty();
+	/* The following call will advance the stored values of the
+	purge iterator. */
 
-		return(NULL);
+	return(trx_purge_get_next_rec(n_pages_handled, heap));
+}
+
+/*******************************************************************//**
+This function runs a purge batch.
+@return	number of undo log pages handled in the batch */
+static
+ulint
+trx_purge_attach_undo_recs(
+/*=======================*/
+	ulint		n_purge_threads,/*!< in: number of purge threads */
+	trx_purge_t*	purge_sys,	/*!< in/out: purge instance */
+	purge_iter_t*	limit,		/*!< out: records read up to */
+	ulint		batch_size)	/*!< in: no. of pages to purge */
+{
+	que_thr_t*	thr;
+	ulint		i = 0;
+	ulint		n_pages_handled = 0;
+	ulint		n_thrs = UT_LIST_GET_LEN(purge_sys->query->thrs);
+
+	ut_a(n_purge_threads > 0);
+
+	*limit = purge_sys->iter;
+
+	/* Debug code to validate some pre-requisites and reset done flag. */
+	for (thr = UT_LIST_GET_FIRST(purge_sys->query->thrs);
+	     thr != NULL && i < n_purge_threads;
+	     thr = UT_LIST_GET_NEXT(thrs, thr), ++i) {
+
+		purge_node_t*		node;
+
+		/* Get the purge node. */
+		node = (purge_node_t*) thr->child;
+
+		ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
+		ut_a(node->undo_recs == NULL);
+		ut_a(node->done);
+
+		node->done = FALSE;
 	}
 
-	/* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
-	os_thread_get_curr_id(),
-	(ullint) purge_sys->purge_trx_no,
-	(ullint) purge_sys->purge_undo_no); */
+	/* There should never be fewer nodes than threads, the inverse
+	however is allowed because we only use purge threads as needed. */
+	ut_a(i == n_purge_threads);
 
+	/* Fetch and parse the UNDO records. The UNDO records are added
+	to a per purge node vector. */
+	thr = UT_LIST_GET_FIRST(purge_sys->query->thrs);
+	ut_a(n_thrs > 0 && thr != NULL);
 
-	*roll_ptr = trx_undo_build_roll_ptr(
-		FALSE, (purge_sys->rseg)->id, purge_sys->page_no,
-		purge_sys->offset);
+	ut_ad(trx_purge_check_limit());
+
+	i = 0;
+
+	for (;;) {
+		purge_node_t*		node;
+		trx_purge_rec_t*	purge_rec;
+
+		ut_a(!thr->is_active);
 
-	*cell = trx_purge_arr_store_info(
-		purge_sys->purge_trx_no, purge_sys->purge_undo_no);
+		/* Get the purge node. */
+		node = (purge_node_t*) thr->child;
+		ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
 
-	ut_ad(purge_sys->purge_trx_no < purge_sys->view->low_limit_no);
+		purge_rec = static_cast<trx_purge_rec_t*>(
+			mem_heap_zalloc(node->heap, sizeof(*purge_rec)));
 
-	/* The following call will advance the stored values of purge_trx_no
-	and purge_undo_no, therefore we had to store them first */
+		/* Track the max {trx_id, undo_no} for truncating the
+		UNDO logs once we have purged the records. */
 
-	undo_rec = trx_purge_get_next_rec(heap);
+		if (purge_sys->iter.trx_no > limit->trx_no
+		    || (purge_sys->iter.trx_no == limit->trx_no
+			&& purge_sys->iter.undo_no >= limit->undo_no)) {
 
-	return(undo_rec);
+			*limit = purge_sys->iter;
+		}
+
+		/* Fetch the next record, and advance the purge_sys->iter. */
+		purge_rec->undo_rec = trx_purge_fetch_next_rec(
+			&purge_rec->roll_ptr, &n_pages_handled, node->heap);
+
+		if (purge_rec->undo_rec != NULL) {
+
+			if (node->undo_recs == NULL) {
+				node->undo_recs = ib_vector_create(
+					ib_heap_allocator_create(node->heap),
+					sizeof(trx_purge_rec_t),
+					batch_size);
+			} else {
+				ut_a(!ib_vector_is_empty(node->undo_recs));
+			}
+
+			ib_vector_push(node->undo_recs, purge_rec);
+
+			if (n_pages_handled >= batch_size) {
+
+				break;
+			}
+		} else {
+			break;
+		}
+
+		thr = UT_LIST_GET_NEXT(thrs, thr);
+
+		if (!(++i % n_purge_threads)) {
+			thr = UT_LIST_GET_FIRST(purge_sys->query->thrs);
+		}
+
+		ut_a(thr != NULL);
+	}
+
+	ut_ad(trx_purge_check_limit());
+
+	return(n_pages_handled);
 }
 
 /*******************************************************************//**
-Releases a reserved purge undo record. */
-UNIV_INTERN
+Calculate the DML delay required.
+@return delay in microseconds or ULINT_MAX */
+static
+ulint
+trx_purge_dml_delay(void)
+/*=====================*/
+{
+	/* Determine how much data manipulation language (DML) statements
+	need to be delayed in order to reduce the lagging of the purge
+	thread. */
+	ulint	delay = 0; /* in microseconds; default: no delay */
+
+	/* If purge lag is set (ie. > 0) then calculate the new DML delay.
+	Note: we do a dirty read of the trx_sys_t data structure here,
+	without holding trx_sys->mutex. */
+
+	if (srv_max_purge_lag > 0) {
+		float	ratio;
+
+		ratio = float(trx_sys->rseg_history_len) / srv_max_purge_lag;
+
+		if (ratio > 1.0) {
+			/* If the history list length exceeds the
+			srv_max_purge_lag, the data manipulation
+			statements are delayed by at least 5000
+			microseconds. */
+			delay = (ulint) ((ratio - .5) * 10000);
+		}
+
+		if (delay > srv_max_purge_lag_delay) {
+			delay = srv_max_purge_lag_delay;
+		}
+
+		MONITOR_SET(MONITOR_DML_PURGE_DELAY, delay);
+	}
+
+	return(delay);
+}
+
+/*******************************************************************//**
+Wait for pending purge jobs to complete. */
+static
 void
-trx_purge_rec_release(
-/*==================*/
-	trx_undo_inf_t*	cell)	/*!< in: storage cell */
+trx_purge_wait_for_workers_to_complete(
+/*===================================*/
+	trx_purge_t*	purge_sys)	/*!< in: purge instance */
 {
-	trx_purge_arr_remove_info(cell);
+	ulint		n_submitted = purge_sys->n_submitted;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	/* Ensure that the work queue empties out. */
+	while (!os_compare_and_swap_ulint(
+			&purge_sys->n_completed, n_submitted, n_submitted)) {
+#else
+	mutex_enter(&purge_sys->bh_mutex);
+
+	while (purge_sys->n_completed < n_submitted) {
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+#ifndef HAVE_ATOMIC_BUILTINS
+		mutex_exit(&purge_sys->bh_mutex);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+		if (srv_get_task_queue_length() > 0) {
+			srv_release_threads(SRV_WORKER, 1);
+		}
+
+		os_thread_yield();
+
+#ifndef HAVE_ATOMIC_BUILTINS
+		mutex_enter(&purge_sys->bh_mutex);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+	}
+
+#ifndef HAVE_ATOMIC_BUILTINS
+	mutex_exit(&purge_sys->bh_mutex);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+	/* None of the worker threads should be doing any work. */
+	ut_a(purge_sys->n_submitted == purge_sys->n_completed);
+
+	/* There should be no outstanding tasks as long
+	as the worker threads are active. */
+	ut_a(srv_get_task_queue_length() == 0);
+}
+
+/******************************************************************//**
+Remove old historical changes from the rollback segments. */
+static
+void
+trx_purge_truncate(void)
+/*====================*/
+{
+	ut_ad(trx_purge_check_limit());
+
+	if (purge_sys->limit.trx_no == 0) {
+		trx_purge_truncate_history(&purge_sys->iter, purge_sys->view);
+	} else {
+		trx_purge_truncate_history(&purge_sys->limit, purge_sys->view);
+	}
 }
 
 /*******************************************************************//**
@@ -1124,107 +1189,183 @@ UNIV_INTERN
 ulint
 trx_purge(
 /*======*/
-	ulint	limit)		/*!< in: the maximum number of records to
-				purge in one batch */
+	ulint	n_purge_threads,	/*!< in: number of purge tasks
+					to submit to the work queue */
+	ulint	batch_size,		/*!< in: the maximum number of records
+					to purge in one batch */
+	bool	truncate)		/*!< in: truncate history if true */
 {
-	que_thr_t*	thr;
-	ulint		old_pages_handled;
+	que_thr_t*	thr = NULL;
+	ulint		n_pages_handled;
 
-	ut_a(purge_sys->trx->n_active_thrs == 0);
+	ut_a(n_purge_threads > 0);
 
-	rw_lock_x_lock(&purge_sys->latch);
+	srv_dml_needed_delay = trx_purge_dml_delay();
 
-	mutex_enter(&kernel_mutex);
+	/* The number of tasks submitted should be completed. */
+	ut_a(purge_sys->n_submitted == purge_sys->n_completed);
 
-	/* Close and free the old purge view */
+	rw_lock_x_lock(&purge_sys->latch);
 
-	read_view_close(purge_sys->view);
 	purge_sys->view = NULL;
+
 	mem_heap_empty(purge_sys->heap);
 
-	/* Determine how much data manipulation language (DML) statements
-	need to be delayed in order to reduce the lagging of the purge
-	thread. */
-	srv_dml_needed_delay = 0; /* in microseconds; default: no delay */
-
-	/* If we cannot advance the 'purge view' because of an old
-	'consistent read view', then the DML statements cannot be delayed.
-	Also, srv_max_purge_lag <= 0 means 'infinity'. */
-	if (srv_max_purge_lag > 0
-	    && !UT_LIST_GET_LAST(trx_sys->view_list)) {
-		float	ratio = (float) trx_sys->rseg_history_len
-			/ srv_max_purge_lag;
-		if (ratio > ULINT_MAX / 10000) {
-			/* Avoid overflow: maximum delay is 4295 seconds */
-			srv_dml_needed_delay = ULINT_MAX;
-		} else if (ratio > 1) {
-			/* If the history list length exceeds the
-			innodb_max_purge_lag, the
-			data manipulation statements are delayed
-			by at least 5000 microseconds. */
-			srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000);
+	purge_sys->view = read_view_purge_open(purge_sys->heap);
+
+	rw_lock_x_unlock(&purge_sys->latch);
+
+	/* Fetch the UNDO recs that need to be purged. */
+	n_pages_handled = trx_purge_attach_undo_recs(
+		n_purge_threads, purge_sys, &purge_sys->limit, batch_size);
+
+	/* Do we do an asynchronous purge or not ? */
+	if (n_purge_threads > 1) {
+		ulint	i = 0;
+
+		/* Submit the tasks to the work queue. */
+		for (i = 0; i < n_purge_threads - 1; ++i) {
+			thr = que_fork_scheduler_round_robin(
+				purge_sys->query, thr);
+
+			ut_a(thr != NULL);
+
+			srv_que_task_enqueue_low(thr);
+		}
+
+		thr = que_fork_scheduler_round_robin(purge_sys->query, thr);
+		ut_a(thr != NULL);
+
+		purge_sys->n_submitted += n_purge_threads - 1;
+
+		goto run_synchronously;
+
+	/* Do it synchronously. */
+	} else {
+		thr = que_fork_scheduler_round_robin(purge_sys->query, NULL);
+		ut_ad(thr);
+
+run_synchronously:
+		++purge_sys->n_submitted;
+
+		que_run_threads(thr);
+
+		os_atomic_inc_ulint(
+			&purge_sys->bh_mutex, &purge_sys->n_completed, 1);
+
+		if (n_purge_threads > 1) {
+			trx_purge_wait_for_workers_to_complete(purge_sys);
 		}
 	}
 
-	purge_sys->view = read_view_oldest_copy_or_open_new(
-		0, purge_sys->heap);
+	ut_a(purge_sys->n_submitted == purge_sys->n_completed);
+
+	if (truncate) {
+		trx_purge_truncate();
+	}
+
+	MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1);
+	MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages_handled);
+
+	return(n_pages_handled);
+}
+
+/*******************************************************************//**
+Get the purge state.
+@return purge state. */
+UNIV_INTERN
+purge_state_t
+trx_purge_state(void)
+/*=================*/
+{
+	purge_state_t	state;
+
+	rw_lock_x_lock(&purge_sys->latch);
 
-	mutex_exit(&kernel_mutex);
+	state = purge_sys->state;
 
-	rw_lock_x_unlock(&(purge_sys->latch));
+	rw_lock_x_unlock(&purge_sys->latch);
 
-	purge_sys->state = TRX_PURGE_ON;
+	return(state);
+}
 
-	purge_sys->handle_limit = purge_sys->n_pages_handled + limit;
+/*******************************************************************//**
+Stop purge and wait for it to stop, move to PURGE_STATE_STOP. */
+UNIV_INTERN
+void
+trx_purge_stop(void)
+/*================*/
+{
+	purge_state_t	state;
+	ib_int64_t	sig_count = os_event_reset(purge_sys->event);
 
-	old_pages_handled = purge_sys->n_pages_handled;
+	ut_a(srv_n_purge_threads > 0);
 
+	rw_lock_x_lock(&purge_sys->latch);
 
-	mutex_enter(&kernel_mutex);
+	ut_a(purge_sys->state != PURGE_STATE_INIT);
+	ut_a(purge_sys->state != PURGE_STATE_EXIT);
 
-	thr = que_fork_start_command(purge_sys->query);
+	++purge_sys->n_stop;
 
-	ut_ad(thr);
+	state = purge_sys->state;
 
-	mutex_exit(&kernel_mutex);
+	if (state == PURGE_STATE_RUN) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Stopping purge.\n");
 
-	if (srv_print_thread_releases) {
+		/* We need to wakeup the purge thread in case it is suspended,
+		so that it can acknowledge the state change. */
 
-		fputs("Starting purge\n", stderr);
+		srv_wake_purge_thread_if_not_active();
 	}
 
-	que_run_threads(thr);
+	purge_sys->state = PURGE_STATE_STOP;
+
+	rw_lock_x_unlock(&purge_sys->latch);
 
-	if (srv_print_thread_releases) {
+	if (state != PURGE_STATE_STOP) {
 
-		fprintf(stderr,
-			"Purge ends; pages handled %lu\n",
-			(ulong) purge_sys->n_pages_handled);
+		/* Wait for purge coordinator to signal that it
+		is suspended. */
+		os_event_wait_low(purge_sys->event, sig_count);
 	}
 
-	return((ulint) (purge_sys->n_pages_handled - old_pages_handled));
+	MONITOR_INC_VALUE(MONITOR_PURGE_STOP_COUNT, 1);
 }
 
-/******************************************************************//**
-Prints information of the purge system to stderr. */
+/*******************************************************************//**
+Resume purge, move to PURGE_STATE_RUN. */
 UNIV_INTERN
 void
-trx_purge_sys_print(void)
-/*=====================*/
+trx_purge_run(void)
+/*===============*/
 {
-	fprintf(stderr, "InnoDB: Purge system view:\n");
-	read_view_print(purge_sys->view);
-
-	fprintf(stderr, "InnoDB: Purge trx n:o " TRX_ID_FMT
-		", undo n:o " TRX_ID_FMT "\n",
-		(ullint) purge_sys->purge_trx_no,
-		(ullint) purge_sys->purge_undo_no);
-	fprintf(stderr,
-		"InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n"
-		"InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n",
-		(ulong) purge_sys->next_stored,
-		(ulong) purge_sys->page_no,
-		(ulong) purge_sys->offset,
-		(ulong) purge_sys->hdr_page_no,
-		(ulong) purge_sys->hdr_offset);
+	rw_lock_x_lock(&purge_sys->latch);
+
+	ut_a(purge_sys->state != PURGE_STATE_INIT);
+	ut_a(purge_sys->state != PURGE_STATE_EXIT);
+
+	if (purge_sys->n_stop > 0) {
+
+		ut_a(purge_sys->state == PURGE_STATE_STOP);
+
+		--purge_sys->n_stop;
+
+		if (purge_sys->n_stop == 0) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: Resuming purge.\n");
+
+			purge_sys->state = PURGE_STATE_RUN;
+		}
+
+		MONITOR_INC_VALUE(MONITOR_PURGE_RESUME_COUNT, 1);
+	} else {
+		ut_a(purge_sys->state == PURGE_STATE_RUN);
+	}
+
+	rw_lock_x_unlock(&purge_sys->latch);
+
+	srv_wake_purge_thread_if_not_active();
 }
diff --git a/storage/innobase/trx/trx0rec.c b/storage/innobase/trx/trx0rec.cc
index 2f1389ae263..b87eac9362e 100644
--- a/storage/innobase/trx/trx0rec.c
+++ b/storage/innobase/trx/trx0rec.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0rec.c
+@file trx/trx0rec.cc
 Transaction undo log record
 
 Created 3/26/1996 Heikki Tuuri
@@ -36,6 +36,7 @@ Created 3/26/1996 Heikki Tuuri
 #ifndef UNIV_HOTBACKUP
 #include "dict0dict.h"
 #include "ut0mem.h"
+#include "read0read.h"
 #include "row0ext.h"
 #include "row0upd.h"
 #include "que0que.h"
@@ -352,8 +353,9 @@ trx_undo_rec_get_col_val(
 		ut_ad(*len > *orig_len);
 		/* @see dtuple_convert_big_rec() */
 		ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE);
+
 		/* we do not have access to index->table here
-		ut_ad(dict_table_get_format(index->table) >= DICT_TF_FORMAT_ZIP
+		ut_ad(dict_table_get_format(index->table) >= UNIV_FORMAT_B
 		      || *len >= col->max_prefix
 		      + BTR_EXTERN_FIELD_REF_SIZE);
 		*/
@@ -959,7 +961,9 @@ trx_undo_update_rec_get_update(
 	/* Store first trx id and roll ptr to update vector */
 
 	upd_field = upd_get_nth_field(update, n_fields);
-	buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN));
+
 	trx_write_trx_id(buf, trx_id);
 
 	upd_field_set_field_no(upd_field,
@@ -968,7 +972,9 @@ trx_undo_update_rec_get_update(
 	dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
 
 	upd_field = upd_get_nth_field(update, n_fields + 1);
-	buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_ROLL_PTR_LEN));
+
 	trx_write_roll_ptr(buf, roll_ptr);
 
 	upd_field_set_field_no(
@@ -1106,9 +1112,9 @@ trx_undo_rec_get_partial_row(
 				ut_a(dfield_get_len(dfield)
 				     >= BTR_EXTERN_FIELD_REF_SIZE);
 				ut_a(dict_table_get_format(index->table)
-				     >= DICT_TF_FORMAT_ZIP
+				     >= UNIV_FORMAT_B
 				     || dfield_get_len(dfield)
-				     >= REC_ANTELOPE_MAX_INDEX_COL_LEN 
+				     >= REC_ANTELOPE_MAX_INDEX_COL_LEN
 				     + BTR_EXTERN_FIELD_REF_SIZE);
 			}
 		}
@@ -1226,44 +1232,43 @@ trx_undo_report_row_operation(
 	trx = thr_get_trx(thr);
 	rseg = trx->rseg;
 
-	mutex_enter(&(trx->undo_mutex));
+	mtr_start(&mtr);
+	mutex_enter(&trx->undo_mutex);
 
 	/* If the undo log is not assigned yet, assign one */
 
-	if (op_type == TRX_UNDO_INSERT_OP) {
+	switch (op_type) {
+	case TRX_UNDO_INSERT_OP:
+		undo = trx->insert_undo;
 
-		if (trx->insert_undo == NULL) {
+		if (undo == NULL) {
 
 			err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT);
-		}
+			undo = trx->insert_undo;
 
-		undo = trx->insert_undo;
-
-		if (UNIV_UNLIKELY(!undo)) {
-			/* Did not succeed */
-			ut_ad(err != DB_SUCCESS);
-			mutex_exit(&(trx->undo_mutex));
+			if (undo == NULL) {
+				/* Did not succeed */
+				ut_ad(err != DB_SUCCESS);
+				goto err_exit;
+			}
 
-			return(err);
+			ut_ad(err == DB_SUCCESS);
 		}
-
-		ut_ad(err == DB_SUCCESS);
-	} else {
+		break;
+	default:
 		ut_ad(op_type == TRX_UNDO_MODIFY_OP);
 
-		if (trx->update_undo == NULL) {
+		undo = trx->update_undo;
 
+		if (undo == NULL) {
 			err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+			undo = trx->update_undo;
 
-		}
-
-		undo = trx->update_undo;
-
-		if (UNIV_UNLIKELY(!undo)) {
-			/* Did not succeed */
-			ut_ad(err != DB_SUCCESS);
-			mutex_exit(&(trx->undo_mutex));
-			return(err);
+			if (undo == NULL) {
+				/* Did not succeed */
+				ut_ad(err != DB_SUCCESS);
+				goto err_exit;
+			}
 		}
 
 		ut_ad(err == DB_SUCCESS);
@@ -1271,8 +1276,6 @@ trx_undo_report_row_operation(
 					  ULINT_UNDEFINED, &heap);
 	}
 
-	mtr_start(&mtr);
-
 	page_no = undo->last_page_no;
 	undo_block = buf_page_get_gen(
 		undo->space, undo->zip_size, page_no, RW_X_LATCH,
@@ -1286,10 +1289,13 @@ trx_undo_report_row_operation(
 		undo_page = buf_block_get_frame(undo_block);
 		ut_ad(page_no == buf_block_get_page_no(undo_block));
 
-		if (op_type == TRX_UNDO_INSERT_OP) {
+		switch (op_type) {
+		case TRX_UNDO_INSERT_OP:
 			offset = trx_undo_page_report_insert(
 				undo_page, trx, index, clust_entry, &mtr);
-		} else {
+			break;
+		default:
+			ut_ad(op_type == TRX_UNDO_MODIFY_OP);
 			offset = trx_undo_page_report_modify(
 				undo_page, trx, index, rec, offsets, update,
 				cmpl_info, &mtr);
@@ -1364,6 +1370,7 @@ trx_undo_report_row_operation(
 		mutex_enter(&rseg->mutex);
 		undo_block = trx_undo_add_page(trx, undo, &mtr);
 		mutex_exit(&rseg->mutex);
+
 		page_no = undo->last_page_no;
 	} while (undo_block != NULL);
 
@@ -1426,7 +1433,7 @@ purge_view.
 
 @return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been
 truncated and we cannot fetch the old version */
-UNIV_INTERN
+static
 ulint
 trx_undo_get_undo_rec(
 /*==================*/
@@ -1437,11 +1444,13 @@ trx_undo_get_undo_rec(
 	trx_undo_rec_t** undo_rec,	/*!< out, own: copy of the record */
 	mem_heap_t*	heap)		/*!< in: memory heap where copied */
 {
-#ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
+	ibool		missing_history;
+
+	rw_lock_s_lock(&purge_sys->latch);
+	missing_history = read_view_sees_trx_id(purge_sys->view, trx_id);
+	rw_lock_s_unlock(&purge_sys->latch);
 
-	if (!trx_purge_update_undo_must_exist(trx_id)) {
+	if (UNIV_UNLIKELY(missing_history)) {
 
 		/* It may be that the necessary undo log has already been
 		deleted */
@@ -1454,21 +1463,27 @@ trx_undo_get_undo_rec(
 	return(DB_SUCCESS);
 }
 
+#ifdef UNIV_DEBUG
+#define ATTRIB_USED_ONLY_IN_DEBUG
+#else /* UNIV_DEBUG */
+#define ATTRIB_USED_ONLY_IN_DEBUG	__attribute__((unused))
+#endif /* UNIV_DEBUG */
+
 /*******************************************************************//**
-Build a previous version of a clustered index record. This function checks
-that the caller has a latch on the index page of the clustered index record
-and an s-latch on the purge_view. This guarantees that the stack of versions
-is locked all the way down to the purge_view.
+Build a previous version of a clustered index record. The caller must
+hold a latch on the index page of the clustered index record, to
+guarantee that the stack of versions is locked all the way down to the
+purge_sys->view.
 @return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is
-earlier than purge_view, which means that it may have been removed,
-DB_ERROR if corrupted record */
+earlier than purge_view, which means that it may have been removed */
 UNIV_INTERN
 ulint
 trx_undo_prev_version_build(
 /*========================*/
-	const rec_t*	index_rec,/*!< in: clustered index record in the
+	const rec_t*	index_rec ATTRIB_USED_ONLY_IN_DEBUG,
+				/*!< in: clustered index record in the
 				index tree */
-	mtr_t*		index_mtr __attribute__((unused)),
+	mtr_t*		index_mtr ATTRIB_USED_ONLY_IN_DEBUG,
 				/*!< in: mtr which contains the latch to
 				index_rec page and purge_view */
 	const rec_t*	rec,	/*!< in: version of a clustered index record */
@@ -1490,7 +1505,6 @@ trx_undo_prev_version_build(
 	table_id_t	table_id;
 	trx_id_t	trx_id;
 	roll_ptr_t	roll_ptr;
-	roll_ptr_t	old_roll_ptr;
 	upd_t*		update;
 	byte*		ptr;
 	ulint		info_bits;
@@ -1499,30 +1513,15 @@ trx_undo_prev_version_build(
 	byte*		buf;
 	ulint		err;
 #ifdef UNIV_SYNC_DEBUG
-	ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(&purge_sys->latch, RW_LOCK_SHARED));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_ad(mtr_memo_contains_page(index_mtr, index_rec, MTR_MEMO_PAGE_S_FIX)
 	      || mtr_memo_contains_page(index_mtr, index_rec,
 					MTR_MEMO_PAGE_X_FIX));
 	ut_ad(rec_offs_validate(rec, index, offsets));
-
-	if (!dict_index_is_clust(index)) {
-		fprintf(stderr, "InnoDB: Error: trying to access"
-			" update undo rec for non-clustered index %s\n"
-			"InnoDB: Submit a detailed bug report to"
-			" http://bugs.mysql.com\n"
-			"InnoDB: index record ", index->name);
-		rec_print(stderr, index_rec, index);
-		fputs("\n"
-		      "InnoDB: record version ", stderr);
-		rec_print_new(stderr, rec, offsets);
-		putc('\n', stderr);
-		ut_ad(0);
-		return(DB_ERROR);
-	}
+	ut_a(dict_index_is_clust(index));
 
 	roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
-	old_roll_ptr = roll_ptr;
 
 	*old_vers = NULL;
 
@@ -1539,7 +1538,9 @@ trx_undo_prev_version_build(
 
 	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
 		/* The undo record may already have been purged.
-		This should never happen in InnoDB. */
+		This should never happen for user transactions, but
+		it can happen in purge. */
+		ut_ad(err == DB_MISSING_HISTORY);
 
 		return(err);
 	}
@@ -1577,59 +1578,12 @@ trx_undo_prev_version_build(
 	ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
 					     roll_ptr, info_bits,
 					     NULL, heap, &update);
+	ut_a(table_id == index->table->id);
+	ut_a(ptr);
 
-	if (UNIV_UNLIKELY(table_id != index->table->id)) {
-		ptr = NULL;
-
-		fprintf(stderr,
-			"InnoDB: Error: trying to access update undo rec"
-			" for table %s\n"
-			"InnoDB: but the table id in the"
-			" undo record is wrong\n"
-			"InnoDB: Submit a detailed bug report"
-			" to http://bugs.mysql.com\n"
-			"InnoDB: Run also CHECK TABLE %s\n",
-			index->table_name, index->table_name);
-	}
-
-	if (ptr == NULL) {
-		/* The record was corrupted, return an error; these printfs
-		should catch an elusive bug in row_vers_old_has_index_entry */
-
-		fprintf(stderr,
-			"InnoDB: table %s, index %s, n_uniq %lu\n"
-			"InnoDB: undo rec address %p, type %lu cmpl_info %lu\n"
-			"InnoDB: undo rec table id %llu,"
-			" index table id %llu\n"
-			"InnoDB: dump of 150 bytes in undo rec: ",
-			index->table_name, index->name,
-			(ulong) dict_index_get_n_unique(index),
-			undo_rec, (ulong) type, (ulong) cmpl_info,
-			(ullint) table_id,
-			(ullint) index->table->id);
-		ut_print_buf(stderr, undo_rec, 150);
-		fputs("\n"
-		      "InnoDB: index record ", stderr);
-		rec_print(stderr, index_rec, index);
-		fputs("\n"
-		      "InnoDB: record version ", stderr);
-		rec_print_new(stderr, rec, offsets);
-		fprintf(stderr, "\n"
-			"InnoDB: Record trx id " TRX_ID_FMT
-			", update rec trx id " TRX_ID_FMT "\n"
-			"InnoDB: Roll ptr in rec " TRX_ID_FMT
-			", in update rec" TRX_ID_FMT "\n",
-			(ullint) rec_trx_id, (ullint) trx_id,
-			(ullint) old_roll_ptr, (ullint) roll_ptr);
-
-		trx_purge_sys_print();
-		ut_ad(0);
-		return(DB_ERROR);
-	}
-
-# ifdef UNIV_BLOB_NULL_DEBUG
+# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
 	ut_a(!rec_offs_any_null_extern(rec, offsets));
-# endif /* UNIV_BLOB_NULL_DEBUG */
+# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 	if (row_upd_changes_field_size_or_external(index, offsets, update)) {
 		ulint	n_ext;
@@ -1648,13 +1602,17 @@ trx_undo_prev_version_build(
 		following call is safe. */
 		row_upd_index_replace_new_col_vals(entry, index, update, heap);
 
-		buf = mem_heap_alloc(heap, rec_get_converted_size(index, entry,
-								  n_ext));
+		buf = static_cast<byte*>(
+			mem_heap_alloc(
+				heap,
+				rec_get_converted_size(index, entry, n_ext)));
 
 		*old_vers = rec_convert_dtuple_to_rec(buf, index,
 						      entry, n_ext);
 	} else {
-		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+		buf = static_cast<byte*>(
+			mem_heap_alloc(heap, rec_offs_size(offsets)));
+
 		*old_vers = rec_copy(buf, rec, offsets);
 		rec_offs_make_valid(*old_vers, index, offsets);
 		row_upd_rec_in_place(*old_vers, index, offsets, update, NULL);
diff --git a/storage/innobase/trx/trx0roll.c b/storage/innobase/trx/trx0roll.cc
index b55471959ce..042b5b87da7 100644
--- a/storage/innobase/trx/trx0roll.c
+++ b/storage/innobase/trx/trx0roll.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0roll.c
+@file trx/trx0roll.cc
 Transaction rollback
 
 Created 3/26/1996 Heikki Tuuri
@@ -38,10 +38,13 @@ Created 3/26/1996 Heikki Tuuri
 #include "que0que.h"
 #include "usr0sess.h"
 #include "srv0start.h"
+#include "read0read.h"
 #include "row0undo.h"
 #include "row0mysql.h"
 #include "lock0lock.h"
 #include "pars0pars.h"
+#include "srv0mon.h"
+#include "trx0sys.h"
 
 /** This many pages must be undone before a truncate is tried within
 rollback */
@@ -57,61 +60,97 @@ static undo_no_t	trx_roll_max_undo_no;
 /** Auxiliary variable which tells the previous progress % we printed */
 static ulint		trx_roll_progress_printed_pct;
 
+/****************************************************************//**
+Finishes a transaction rollback. */
+static
+void
+trx_rollback_finish(
+/*================*/
+	trx_t*		trx);	/*!< in: transaction */
+
 /*******************************************************************//**
-Rollback a transaction used in MySQL.
-@return	error code or DB_SUCCESS */
-UNIV_INTERN
-int
-trx_general_rollback_for_mysql(
-/*===========================*/
+Rollback a transaction used in MySQL. */
+static
+void
+trx_rollback_to_savepoint_low(
+/*==========================*/
 	trx_t*		trx,	/*!< in: transaction handle */
 	trx_savept_t*	savept)	/*!< in: pointer to savepoint undo number, if
 				partial rollback requested, or NULL for
 				complete rollback */
 {
-	mem_heap_t*	heap;
 	que_thr_t*	thr;
+	mem_heap_t*	heap;
 	roll_node_t*	roll_node;
 
-	/* Tell Innobase server that there might be work for
-	utility threads: */
-
-	srv_active_wake_master_thread();
-
-	trx_start_if_not_started(trx);
-
 	heap = mem_heap_create(512);
 
 	roll_node = roll_node_create(heap);
 
-	if (savept) {
+	if (savept != NULL) {
 		roll_node->partial = TRUE;
 		roll_node->savept = *savept;
+		assert_trx_in_list(trx);
+	}  else {
+		assert_trx_nonlocking_or_in_list(trx);
 	}
 
 	trx->error_state = DB_SUCCESS;
 
-	thr = pars_complete_graph_for_exec(roll_node, trx, heap);
+	if (!trx->read_only) {
+		thr = pars_complete_graph_for_exec(roll_node, trx, heap);
 
-	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
-	que_run_threads(thr);
+		ut_a(thr == que_fork_start_command(
+			static_cast<que_fork_t*>(que_node_get_parent(thr))));
 
-	mutex_enter(&kernel_mutex);
+		que_run_threads(thr);
 
-	while (trx->que_state != TRX_QUE_RUNNING) {
+		ut_a(roll_node->undo_thr != NULL);
+		que_run_threads(roll_node->undo_thr);
 
-		mutex_exit(&kernel_mutex);
-
-		os_thread_sleep(100000);
+		/* Free the memory reserved by the undo graph. */
+		que_graph_free(static_cast<que_t*>(
+			       roll_node->undo_thr->common.parent));
+	}
 
-		mutex_enter(&kernel_mutex);
+	if (savept == NULL) {
+		trx_rollback_finish(trx);
+		MONITOR_INC(MONITOR_TRX_ROLLBACK);
+	} else {
+		trx->lock.que_state = TRX_QUE_RUNNING;
+		MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT);
 	}
 
-	mutex_exit(&kernel_mutex);
+	ut_a(trx->error_state == DB_SUCCESS);
+	ut_a(trx->lock.que_state == TRX_QUE_RUNNING);
 
 	mem_heap_free(heap);
 
-	ut_a(trx->error_state == DB_SUCCESS);
+	MONITOR_DEC(MONITOR_TRX_ACTIVE);
+}
+
+/*******************************************************************//**
+Rollback a transaction to a given savepoint or do a complete rollback.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_to_savepoint(
+/*======================*/
+	trx_t*		trx,	/*!< in: transaction handle */
+	trx_savept_t*	savept)	/*!< in: pointer to savepoint undo number, if
+				partial rollback requested, or NULL for
+				complete rollback */
+{
+	ut_ad(!trx_mutex_own(trx));
+
+	/* Tell Innobase server that there might be work for
+	utility threads: */
+
+	srv_active_wake_master_thread();
+
+	trx_start_if_not_started_xa(trx);
+
+	trx_rollback_to_savepoint_low(trx, savept);
 
 	/* Tell Innobase server that there might be work for
 	utility threads: */
@@ -124,30 +163,67 @@ trx_general_rollback_for_mysql(
 /*******************************************************************//**
 Rollback a transaction used in MySQL.
 @return	error code or DB_SUCCESS */
+static
+enum db_err
+trx_rollback_for_mysql_low(
+/*=======================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	srv_active_wake_master_thread();
+
+	trx->op_info = "rollback";
+
+	/* If we are doing the XA recovery of prepared transactions,
+	then the transaction object does not have an InnoDB session
+	object, and we set a dummy session that we use for all MySQL
+	transactions. */
+
+	trx_rollback_to_savepoint_low(trx, NULL);
+
+	trx->op_info = "";
+
+	ut_a(trx->error_state == DB_SUCCESS);
+
+	srv_active_wake_master_thread();
+
+	return(trx->error_state);
+}
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return	error code or DB_SUCCESS */
 UNIV_INTERN
 int
 trx_rollback_for_mysql(
 /*===================*/
-	trx_t*	trx)	/*!< in: transaction handle */
+	trx_t*	trx)	/*!< in/out: transaction */
 {
-	int	err;
-
-	if (trx->conc_state == TRX_NOT_STARTED) {
-
+	/* We are reading trx->state without holding trx_sys->mutex
+	here, because the rollback should be invoked for a running
+	active MySQL transaction (or recovered prepared transaction)
+	that is associated with the current thread. */
+
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+		ut_ad(trx->in_mysql_trx_list);
 		return(DB_SUCCESS);
-	}
-
-	trx->op_info = "rollback";
 
-	/* If we are doing the XA recovery of prepared transactions, then
-	the transaction object does not have an InnoDB session object, and we
-	set a dummy session that we use for all MySQL transactions. */
+	case TRX_STATE_ACTIVE:
+		ut_ad(trx->in_mysql_trx_list);
+		assert_trx_nonlocking_or_in_list(trx);
+		return(trx_rollback_for_mysql_low(trx));
 
-	err = trx_general_rollback_for_mysql(trx, NULL);
+	case TRX_STATE_PREPARED:
+		assert_trx_in_rw_list(trx);
+		return(trx_rollback_for_mysql_low(trx));
 
-	trx->op_info = "";
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		assert_trx_in_list(trx);
+		break;
+	}
 
-	return(err);
+	ut_error;
+	return((int) DB_CORRUPTION);
 }
 
 /*******************************************************************//**
@@ -157,64 +233,100 @@ UNIV_INTERN
 int
 trx_rollback_last_sql_stat_for_mysql(
 /*=================================*/
-	trx_t*	trx)	/*!< in: transaction handle */
+	trx_t*	trx)	/*!< in/out: transaction */
 {
 	int	err;
 
-	if (trx->conc_state == TRX_NOT_STARTED) {
+	/* We are reading trx->state without holding trx_sys->mutex
+	here, because the statement rollback should be invoked for a
+	running active MySQL transaction that is associated with the
+	current thread. */
+	ut_ad(trx->in_mysql_trx_list);
 
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
 		return(DB_SUCCESS);
+	case TRX_STATE_ACTIVE:
+		assert_trx_nonlocking_or_in_list(trx);
+
+		trx->op_info = "rollback of SQL statement";
+
+		err = trx_rollback_to_savepoint(
+			trx, &trx->last_sql_stat_start);
+
+		if (trx->fts_trx) {
+			fts_savepoint_rollback_last_stmt(trx);
+		}
+
+		/* The following call should not be needed,
+		but we play it safe: */
+		trx_mark_sql_stat_end(trx);
+
+		trx->op_info = "";
+
+		return(err);
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		/* The statement rollback is only allowed on an ACTIVE
+		transaction, not a PREPARED or COMMITTED one. */
+		break;
 	}
 
-	trx->op_info = "rollback of SQL statement";
+	ut_error;
+	return(DB_CORRUPTION);
+}
 
-	err = trx_general_rollback_for_mysql(trx, &trx->last_sql_stat_start);
-	/* The following call should not be needed, but we play safe: */
-	trx_mark_sql_stat_end(trx);
+/*******************************************************************//**
+Search for a savepoint using name.
+@return savepoint if found else NULL */
+static
+trx_named_savept_t*
+trx_savepoint_find(
+/*===============*/
+	trx_t*		trx,			/*!< in: transaction */
+	const char*	name)			/*!< in: savepoint name */
+{
+	trx_named_savept_t*	savep;
 
-	trx->op_info = "";
+	for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	     savep != NULL;
+	     savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
 
-	return(err);
+		if (0 == ut_strcmp(savep->name, name)) {
+			return(savep);
+		}
+	}
+
+	return(NULL);
 }
 
 /*******************************************************************//**
 Frees a single savepoint struct. */
-UNIV_INTERN
+static
 void
 trx_roll_savepoint_free(
 /*=====================*/
 	trx_t*			trx,	/*!< in: transaction handle */
 	trx_named_savept_t*	savep)	/*!< in: savepoint to free */
 {
-	ut_a(savep != NULL);
-	ut_a(UT_LIST_GET_LEN(trx->trx_savepoints) > 0);
-
 	UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep);
 	mem_free(savep->name);
 	mem_free(savep);
 }
 
 /*******************************************************************//**
-Frees savepoint structs starting from savep, if savep == NULL then
-free all savepoints. */
+Frees savepoint structs starting from savep. */
 UNIV_INTERN
 void
 trx_roll_savepoints_free(
 /*=====================*/
 	trx_t*			trx,	/*!< in: transaction handle */
-	trx_named_savept_t*	savep)	/*!< in: free all savepoints > this one;
-					if this is NULL, free all savepoints
-					of trx */
+	trx_named_savept_t*	savep)	/*!< in: free all savepoints starting
+					with this savepoint i*/
 {
-	trx_named_savept_t*	next_savep;
-
-	if (savep == NULL) {
-		savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
-	} else {
-		savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
-	}
-
 	while (savep != NULL) {
+		trx_named_savept_t*	next_savep;
+
 		next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
 
 		trx_roll_savepoint_free(trx, savep);
@@ -232,6 +344,56 @@ the row, these locks are naturally released in the rollback. Savepoints which
 were set after this savepoint are deleted.
 @return if no savepoint of the name found then DB_NO_SAVEPOINT,
 otherwise DB_SUCCESS */
+static
+ulint
+trx_rollback_to_savepoint_for_mysql_low(
+/*====================================*/
+	trx_t*			trx,	/*!< in/out: transaction */
+	trx_named_savept_t*	savep,	/*!< in/out: savepoint */
+	ib_int64_t*		mysql_binlog_cache_pos)
+					/*!< out: the MySQL binlog
+					cache position corresponding
+					to this savepoint; MySQL needs
+					this information to remove the
+					binlog entries of the queries
+					executed after the savepoint */
+{
+	ulint	err;
+
+	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+	ut_ad(trx->in_mysql_trx_list);
+
+	/* Free all savepoints strictly later than savep. */
+
+	trx_roll_savepoints_free(
+		trx, UT_LIST_GET_NEXT(trx_savepoints, savep));
+
+	*mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+	trx->op_info = "rollback to a savepoint";
+
+	err = trx_rollback_to_savepoint(trx, &savep->savept);
+
+	/* Store the current undo_no of the transaction so that
+	we know where to roll back if we have to roll back the
+	next SQL statement: */
+
+	trx_mark_sql_stat_end(trx);
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
 UNIV_INTERN
 ulint
 trx_rollback_to_savepoint_for_mysql(
@@ -246,49 +408,38 @@ trx_rollback_to_savepoint_for_mysql(
 						executed after the savepoint */
 {
 	trx_named_savept_t*	savep;
-	ulint			err;
 
-	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	/* We are reading trx->state without holding trx_sys->mutex
+	here, because the savepoint rollback should be invoked for a
+	running active MySQL transaction that is associated with the
+	current thread. */
+	ut_ad(trx->in_mysql_trx_list);
 
-	while (savep != NULL) {
-		if (0 == ut_strcmp(savep->name, savepoint_name)) {
-			/* Found */
-			break;
-		}
-		savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
-	}
+	savep = trx_savepoint_find(trx, savepoint_name);
 
 	if (savep == NULL) {
-
 		return(DB_NO_SAVEPOINT);
 	}
 
-	if (trx->conc_state == TRX_NOT_STARTED) {
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
 		ut_print_timestamp(stderr);
 		fputs("  InnoDB: Error: transaction has a savepoint ", stderr);
 		ut_print_name(stderr, trx, FALSE, savep->name);
 		fputs(" though it is not started\n", stderr);
 		return(DB_ERROR);
+	case TRX_STATE_ACTIVE:
+		return(trx_rollback_to_savepoint_for_mysql_low(
+				trx, savep, mysql_binlog_cache_pos));
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		/* The savepoint rollback is only allowed on an ACTIVE
+		transaction, not a PREPARED or COMMITTED one. */
+		break;
 	}
 
-	/* We can now free all savepoints strictly later than this one */
-
-	trx_roll_savepoints_free(trx, savep);
-
-	*mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
-
-	trx->op_info = "rollback to a savepoint";
-
-	err = trx_general_rollback_for_mysql(trx, &savep->savept);
-
-	/* Store the current undo_no of the transaction so that we know where
-	to roll back if we have to roll back the next SQL statement: */
-
-	trx_mark_sql_stat_end(trx);
-
-	trx->op_info = "";
-
-	return(err);
+	ut_error;
+	return(DB_CORRUPTION);
 }
 
 /*******************************************************************//**
@@ -310,20 +461,9 @@ trx_savepoint_for_mysql(
 {
 	trx_named_savept_t*	savep;
 
-	ut_a(trx);
-	ut_a(savepoint_name);
-
-	trx_start_if_not_started(trx);
-
-	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	trx_start_if_not_started_xa(trx);
 
-	while (savep != NULL) {
-		if (0 == ut_strcmp(savep->name, savepoint_name)) {
-			/* Found */
-			break;
-		}
-		savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
-	}
+	savep = trx_savepoint_find(trx, savepoint_name);
 
 	if (savep) {
 		/* There is a savepoint with the same name: free that */
@@ -336,7 +476,7 @@ trx_savepoint_for_mysql(
 
 	/* Create a new savepoint and add it as the last in the list */
 
-	savep = mem_alloc(sizeof(trx_named_savept_t));
+	savep = static_cast<trx_named_savept_t*>(mem_alloc(sizeof(*savep)));
 
 	savep->name = mem_strdup(savepoint_name);
 
@@ -363,18 +503,16 @@ trx_release_savepoint_for_mysql(
 {
 	trx_named_savept_t*	savep;
 
-	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+	ut_ad(trx->in_mysql_trx_list);
 
-	/* Search for the savepoint by name and free if found. */
-	while (savep != NULL) {
-		if (0 == ut_strcmp(savep->name, savepoint_name)) {
-			trx_roll_savepoint_free(trx, savep);
-			return(DB_SUCCESS);
-		}
-		savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+	savep = trx_savepoint_find(trx, savepoint_name);
+
+	if (savep != NULL) {
+		trx_roll_savepoint_free(trx, savep);
 	}
 
-	return(DB_NO_SAVEPOINT);
+	return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT);
 }
 
 /*******************************************************************//**
@@ -436,17 +574,22 @@ trx_rollback_active(
 	thr->child = roll_node;
 	roll_node->common.parent = thr;
 
-	mutex_enter(&kernel_mutex);
-
 	trx->graph = fork;
 
 	ut_a(thr == que_fork_start_command(fork));
 
+	mutex_enter(&trx_sys->mutex);
+
 	trx_roll_crash_recv_trx	= trx;
+
 	trx_roll_max_undo_no = trx->undo_no;
+
 	trx_roll_progress_printed_pct = 0;
+
 	rows_to_undo = trx_roll_max_undo_no;
 
+	mutex_exit(&trx_sys->mutex);
+
 	if (rows_to_undo > 1000000000) {
 		rows_to_undo = rows_to_undo / 1000000;
 		unit = "M";
@@ -456,9 +599,8 @@ trx_rollback_active(
 	fprintf(stderr,
 		"  InnoDB: Rolling back trx with id " TRX_ID_FMT ", %lu%s"
 		" rows to undo\n",
-		(ullint) trx->id,
+		trx->id,
 		(ulong) rows_to_undo, unit);
-	mutex_exit(&kernel_mutex);
 
 	if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
 		row_mysql_lock_data_dictionary(trx);
@@ -466,23 +608,17 @@ trx_rollback_active(
 	}
 
 	que_run_threads(thr);
+	ut_a(roll_node->undo_thr != NULL);
 
-	mutex_enter(&kernel_mutex);
-
-	while (trx->que_state != TRX_QUE_RUNNING) {
-
-		mutex_exit(&kernel_mutex);
+	que_run_threads(roll_node->undo_thr);
 
-		fprintf(stderr,
-			"InnoDB: Waiting for rollback of trx id "
-			TRX_ID_FMT " to end\n",
-			(ullint) trx->id);
-		os_thread_sleep(100000);
+	trx_rollback_finish(thr_get_trx(roll_node->undo_thr));
 
-		mutex_enter(&kernel_mutex);
-	}
+	/* Free the memory reserved by the undo graph */
+	que_graph_free(static_cast<que_t*>(
+			       roll_node->undo_thr->common.parent));
 
-	mutex_exit(&kernel_mutex);
+	ut_a(trx->lock.que_state == TRX_QUE_RUNNING);
 
 	if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE
 	    && trx->table_id != 0) {
@@ -491,15 +627,24 @@ trx_rollback_active(
 		drop the relevant table, if it still exists */
 
 		fprintf(stderr,
-			"InnoDB: Dropping table with id %llu"
+			"InnoDB: Dropping table with id "UINT64PF
 			" in recovery if it exists\n",
-			(ullint) trx->table_id);
+			(ib_uint64_t) trx->table_id);
 
-		table = dict_table_get_on_id_low(trx->table_id);
+		table = dict_table_open_on_id(trx->table_id, dictionary_locked);
 
 		if (table) {
 			ulint	err;
 
+			/* Ensure that the table doesn't get evicted from the
+			cache, keeps things simple for drop. */
+
+			if (table->can_be_evicted) {
+				dict_table_move_from_lru_to_non_lru(table);
+			}
+
+			dict_table_close(table, dictionary_locked);
+
 			fputs("InnoDB: Table found: dropping table ", stderr);
 			ut_print_name(stderr, trx, TRUE, table->name);
 			fputs(" in recovery\n", stderr);
@@ -517,13 +662,70 @@ trx_rollback_active(
 
 	fprintf(stderr, "\nInnoDB: Rolling back of trx id " TRX_ID_FMT
 		" completed\n",
-		(ullint) trx->id);
+		trx->id);
+
 	mem_heap_free(heap);
 
 	trx_roll_crash_recv_trx	= NULL;
 }
 
 /*******************************************************************//**
+Rollback or clean up any resurrected incomplete transactions. It assumes
+that the caller holds the trx_sys_t::mutex and it will release the
+lock if it does a clean up or rollback.
+@return TRUE if the transaction was cleaned up or rolled back
+and trx_sys->mutex was released. */
+static
+ibool
+trx_rollback_resurrected(
+/*=====================*/
+	trx_t*	trx,	/*!< in: transaction to rollback or clean */
+	ibool	all)	/*!< in: FALSE=roll back dictionary transactions;
+			TRUE=roll back all non-PREPARED transactions */
+{
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	/* The trx->is_recovered flag and trx->state are set
+	atomically under the protection of the trx->mutex (and
+	lock_sys->mutex) in lock_trx_release_locks(). We do not want
+	to accidentally clean up a non-recovered transaction here. */
+
+	trx_mutex_enter(trx);
+
+	if (!trx->is_recovered) {
+		trx_mutex_exit(trx);
+		return(FALSE);
+	}
+
+	switch (trx->state) {
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		mutex_exit(&trx_sys->mutex);
+		trx_mutex_exit(trx);
+		fprintf(stderr,
+			"InnoDB: Cleaning up trx with id " TRX_ID_FMT "\n",
+			trx->id);
+		trx_cleanup_at_db_startup(trx);
+		return(TRUE);
+	case TRX_STATE_ACTIVE:
+		trx_mutex_exit(trx);
+		if (all || trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
+			mutex_exit(&trx_sys->mutex);
+			trx_rollback_active(trx);
+			return(TRUE);
+		}
+		return(FALSE);
+	case TRX_STATE_PREPARED:
+		trx_mutex_exit(trx);
+		return(FALSE);
+	case TRX_STATE_NOT_STARTED:
+		break;
+	}
+
+	ut_error;
+	return(FALSE);
+}
+
+/*******************************************************************//**
 Rollback or clean up any incomplete transactions which were
 encountered in crash recovery.  If the transaction already was
 committed, then we clean up a possible insert undo log. If the
@@ -537,10 +739,11 @@ trx_rollback_or_clean_recovered(
 {
 	trx_t*	trx;
 
-	mutex_enter(&kernel_mutex);
+	ut_a(srv_force_recovery < SRV_FORCE_NO_TRX_UNDO);
+
+	if (trx_sys_get_n_rw_trx() == 0) {
 
-	if (!UT_LIST_GET_FIRST(trx_sys->trx_list)) {
-		goto leave_function;
+		return;
 	}
 
 	if (all) {
@@ -549,40 +752,38 @@ trx_rollback_or_clean_recovered(
 			" of uncommitted transactions\n");
 	}
 
-	mutex_exit(&kernel_mutex);
+	/* Note: For XA recovered transactions, we rely on MySQL to
+	do rollback. They will be in TRX_STATE_PREPARED state. If the server
+	is shutdown and they are still lingering in trx_sys_t::trx_list
+	then the shutdown will hang. */
 
-loop:
-	mutex_enter(&kernel_mutex);
+	/* Loop over the transaction list as long as there are
+	recovered transactions to clean up or recover. */
 
-	for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); trx;
-	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
-		if (!trx->is_recovered) {
-			continue;
-		}
+	do {
+		mutex_enter(&trx_sys->mutex);
 
-		switch (trx->conc_state) {
-		case TRX_NOT_STARTED:
-		case TRX_PREPARED:
-			continue;
-
-		case TRX_COMMITTED_IN_MEMORY:
-			mutex_exit(&kernel_mutex);
-			fprintf(stderr,
-				"InnoDB: Cleaning up trx with id "
-				TRX_ID_FMT "\n",
-				(ullint) trx->id);
-			trx_cleanup_at_db_startup(trx);
-			goto loop;
-
-		case TRX_ACTIVE:
-			if (all || trx_get_dict_operation(trx)
-			    != TRX_DICT_OP_NONE) {
-				mutex_exit(&kernel_mutex);
-				trx_rollback_active(trx);
-				goto loop;
+		for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+		     trx != NULL;
+		     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+			assert_trx_in_rw_list(trx);
+
+			/* If this function does a cleanup or rollback
+			then it will release the trx_sys->mutex, therefore
+			we need to reacquire it before retrying the loop. */
+
+			if (trx_rollback_resurrected(trx, all)) {
+
+				mutex_enter(&trx_sys->mutex);
+
+				break;
 			}
 		}
-	}
+
+		mutex_exit(&trx_sys->mutex);
+
+	} while (trx != NULL);
 
 	if (all) {
 		ut_print_timestamp(stderr);
@@ -590,9 +791,6 @@ loop:
 			"  InnoDB: Rollback of non-prepared"
 			" transactions completed\n");
 	}
-
-leave_function:
-	mutex_exit(&kernel_mutex);
 }
 
 /*******************************************************************//**
@@ -602,10 +800,10 @@ committed, then we clean up a possible insert undo log. If the
 transaction was not yet committed, then we roll it back.
 Note: this is done in a background thread.
 @return	a dummy parameter */
-UNIV_INTERN
+extern "C" UNIV_INTERN
 os_thread_ret_t
-trx_rollback_or_clean_all_recovered(
-/*================================*/
+DECLARE_THREAD(trx_rollback_or_clean_all_recovered)(
+/*================================================*/
 	void*	arg __attribute__((unused)))
 			/*!< in: a dummy parameter required by
 			os_thread_create */
@@ -627,30 +825,25 @@ trx_rollback_or_clean_all_recovered(
 /*******************************************************************//**
 Creates an undo number array.
 @return	own: undo number array */
-UNIV_INTERN
+static
 trx_undo_arr_t*
-trx_undo_arr_create(void)
-/*=====================*/
+trx_undo_arr_create(
+/*================*/
+	ulint		n_cells)	/*!< Number of cells */
 {
 	trx_undo_arr_t*	arr;
 	mem_heap_t*	heap;
-	ulint		i;
+	ulint		sz = sizeof(*arr) + sizeof(*arr->infos) * n_cells;
 
-	heap = mem_heap_create(1024);
+	heap = mem_heap_create(sz);
 
-	arr = mem_heap_alloc(heap, sizeof(trx_undo_arr_t));
+	arr = static_cast<trx_undo_arr_t*>(mem_heap_zalloc(heap, sz));
 
-	arr->infos = mem_heap_alloc(heap, sizeof(trx_undo_inf_t)
-				    * UNIV_MAX_PARALLELISM);
-	arr->n_cells = UNIV_MAX_PARALLELISM;
-	arr->n_used = 0;
+	arr->n_cells = n_cells;
 
-	arr->heap = heap;
+	arr->infos = (trx_undo_inf_t*) (arr + 1);
 
-	for (i = 0; i < UNIV_MAX_PARALLELISM; i++) {
-
-		(trx_undo_arr_get_nth_info(arr, i))->in_use = FALSE;
-	}
+	arr->heap = heap;
 
 	return(arr);
 }
@@ -663,8 +856,6 @@ trx_undo_arr_free(
 /*==============*/
 	trx_undo_arr_t*	arr)	/*!< in: undo number array */
 {
-	ut_ad(arr->n_used == 0);
-
 	mem_heap_free(arr->heap);
 }
 
@@ -678,19 +869,18 @@ trx_undo_arr_store_info(
 	trx_t*		trx,	/*!< in: transaction */
 	undo_no_t	undo_no)/*!< in: undo number */
 {
-	trx_undo_inf_t*	cell;
-	trx_undo_inf_t*	stored_here;
+	ulint		i;
 	trx_undo_arr_t*	arr;
+	ulint		n = 0;
 	ulint		n_used;
-	ulint		n;
-	ulint		i;
+	trx_undo_inf_t*	stored_here = NULL;
 
-	n = 0;
 	arr = trx->undo_no_arr;
 	n_used = arr->n_used;
-	stored_here = NULL;
 
-	for (i = 0;; i++) {
+	for (i = 0; i < arr->n_cells; i++) {
+		trx_undo_inf_t*	cell;
+
 		cell = trx_undo_arr_get_nth_info(arr, i);
 
 		if (!cell->in_use) {
@@ -727,6 +917,10 @@ trx_undo_arr_store_info(
 			return(TRUE);
 		}
 	}
+
+	ut_error;
+
+	return(FALSE);
 }
 
 /*******************************************************************//**
@@ -738,22 +932,19 @@ trx_undo_arr_remove_info(
 	trx_undo_arr_t*	arr,	/*!< in: undo number array */
 	undo_no_t	undo_no)/*!< in: undo number */
 {
-	trx_undo_inf_t*	cell;
 	ulint		i;
 
-	for (i = 0;; i++) {
-		cell = trx_undo_arr_get_nth_info(arr, i);
+	for (i = 0; i < arr->n_cells; i++) {
 
-		if (cell->in_use
-		    && cell->undo_no == undo_no) {
+		trx_undo_inf_t*	cell;
 
-			cell->in_use = FALSE;
+		cell = trx_undo_arr_get_nth_info(arr, i);
 
+		if (cell->in_use && cell->undo_no == undo_no) {
+			cell->in_use = FALSE;
 			ut_ad(arr->n_used > 0);
-
-			arr->n_used--;
-
-			return;
+			--arr->n_used;
+			break;
 		}
 	}
 }
@@ -765,46 +956,40 @@ static
 undo_no_t
 trx_undo_arr_get_biggest(
 /*=====================*/
-	trx_undo_arr_t*	arr)	/*!< in: undo number array */
+	const trx_undo_arr_t*	arr)	/*!< in: undo number array */
 {
-	trx_undo_inf_t*	cell;
-	ulint		n_used;
-	undo_no_t	biggest;
-	ulint		n;
 	ulint		i;
+	undo_no_t	biggest = 0;
+	ulint		n_checked = 0;
 
-	n = 0;
-	n_used = arr->n_used;
-	biggest = 0;
+	for (i = 0; i < arr->n_cells && n_checked < arr->n_used; ++i) {
 
-	for (i = 0;; i++) {
-		cell = trx_undo_arr_get_nth_info(arr, i);
+		const trx_undo_inf_t*	cell = &arr->infos[i];
 
 		if (cell->in_use) {
-			n++;
+
+			++n_checked;
+
 			if (cell->undo_no > biggest) {
 
 				biggest = cell->undo_no;
 			}
 		}
-
-		if (n == n_used) {
-			return(biggest);
-		}
 	}
+
+	return(biggest);
 }
 
 /***********************************************************************//**
 Tries truncate the undo logs. */
-UNIV_INTERN
+static
 void
 trx_roll_try_truncate(
 /*==================*/
 	trx_t*	trx)	/*!< in/out: transaction */
 {
-	trx_undo_arr_t*	arr;
-	undo_no_t	limit;
-	undo_no_t	biggest;
+	undo_no_t		limit;
+	const trx_undo_arr_t*	arr;
 
 	ut_ad(mutex_own(&(trx->undo_mutex)));
 	ut_ad(mutex_own(&((trx->rseg)->mutex)));
@@ -816,6 +1001,8 @@ trx_roll_try_truncate(
 	limit = trx->undo_no;
 
 	if (arr->n_used > 0) {
+		undo_no_t	biggest;
+
 		biggest = trx_undo_arr_get_biggest(arr);
 
 		if (biggest >= limit) {
@@ -850,19 +1037,20 @@ trx_roll_pop_top_rec(
 	trx_undo_rec_t*	prev_rec;
 	page_t*		prev_rec_page;
 
-	ut_ad(mutex_own(&(trx->undo_mutex)));
+	ut_ad(mutex_own(&trx->undo_mutex));
+
+	undo_page = trx_undo_page_get_s_latched(
+		undo->space, undo->zip_size, undo->top_page_no, mtr);
 
-	undo_page = trx_undo_page_get_s_latched(undo->space, undo->zip_size,
-						undo->top_page_no, mtr);
 	offset = undo->top_offset;
 
 	/*	fprintf(stderr, "Thread %lu undoing trx " TRX_ID_FMT
 			" undo record " TRX_ID_FMT "\n",
 	os_thread_get_curr_id(), trx->id, undo->top_undo_no); */
 
-	prev_rec = trx_undo_get_prev_rec(undo_page + offset,
-					 undo->hdr_page_no, undo->hdr_offset,
-					 mtr);
+	prev_rec = trx_undo_get_prev_rec(
+		undo_page + offset, undo->hdr_page_no, undo->hdr_offset, mtr);
+
 	if (prev_rec == NULL) {
 
 		undo->empty = TRUE;
@@ -915,11 +1103,11 @@ try_again:
 	mutex_enter(&(trx->undo_mutex));
 
 	if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) {
-		mutex_enter(&(rseg->mutex));
+		mutex_enter(&rseg->mutex);
 
 		trx_roll_try_truncate(trx);
 
-		mutex_exit(&(rseg->mutex));
+		mutex_exit(&rseg->mutex);
 	}
 
 	ins_undo = trx->insert_undo;
@@ -935,8 +1123,7 @@ try_again:
 		undo = ins_undo;
 	}
 
-	if (!undo || undo->empty
-	    || limit > undo->top_undo_no) {
+	if (!undo || undo->empty || limit > undo->top_undo_no) {
 
 		if ((trx->undo_no_arr)->n_used == 0) {
 			/* Rollback is ending */
@@ -953,15 +1140,11 @@ try_again:
 		return(NULL);
 	}
 
-	if (undo == ins_undo) {
-		is_insert = TRUE;
-	} else {
-		is_insert = FALSE;
-	}
+	is_insert = (undo == ins_undo);
+
+	*roll_ptr = trx_undo_build_roll_ptr(
+		is_insert, undo->rseg->id, undo->top_page_no, undo->top_offset);
 
-	*roll_ptr = trx_undo_build_roll_ptr(is_insert, (undo->rseg)->id,
-					    undo->top_page_no,
-					    undo->top_offset);
 	mtr_start(&mtr);
 
 	undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr);
@@ -1055,83 +1238,13 @@ trx_undo_rec_release(
 	mutex_exit(&(trx->undo_mutex));
 }
 
-/*********************************************************************//**
-Starts a rollback operation. */
-UNIV_INTERN
-void
-trx_rollback(
-/*=========*/
-	trx_t*		trx,	/*!< in: transaction */
-	trx_sig_t*	sig,	/*!< in: signal starting the rollback */
-	que_thr_t**	next_thr)/*!< in/out: next query thread to run;
-				if the value which is passed in is
-				a pointer to a NULL pointer, then the
-				calling function can start running
-				a new query thread; if the passed value is
-				NULL, the parameter is ignored */
-{
-	que_t*		roll_graph;
-	que_thr_t*	thr;
-	/*	que_thr_t*	thr2; */
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad((trx->undo_no_arr == NULL) || ((trx->undo_no_arr)->n_used == 0));
-
-	/* Initialize the rollback field in the transaction */
-
-	switch (sig->type) {
-	case TRX_SIG_TOTAL_ROLLBACK:
-		trx->roll_limit = 0;
-		break;
-	case TRX_SIG_ROLLBACK_TO_SAVEPT:
-		trx->roll_limit = (sig->savept).least_undo_no;
-		break;
-	case TRX_SIG_ERROR_OCCURRED:
-		trx->roll_limit = trx->last_sql_stat_start.least_undo_no;
-		break;
-	default:
-		ut_error;
-	}
-
-	ut_a(trx->roll_limit <= trx->undo_no);
-
-	trx->pages_undone = 0;
-
-	if (trx->undo_no_arr == NULL) {
-		trx->undo_no_arr = trx_undo_arr_create();
-	}
-
-	/* Build a 'query' graph which will perform the undo operations */
-
-	roll_graph = trx_roll_graph_build(trx);
-
-	trx->graph = roll_graph;
-	trx->que_state = TRX_QUE_ROLLING_BACK;
-
-	thr = que_fork_start_command(roll_graph);
-
-	ut_ad(thr);
-
-	/*	thr2 = que_fork_start_command(roll_graph);
-
-	ut_ad(thr2); */
-
-	if (next_thr && (*next_thr == NULL)) {
-		*next_thr = thr;
-		/*		srv_que_task_enqueue_low(thr2); */
-	} else {
-		srv_que_task_enqueue_low(thr);
-		/*		srv_que_task_enqueue_low(thr2); */
-	}
-}
-
 /****************************************************************//**
 Builds an undo 'query' graph for a transaction. The actual rollback is
 performed by executing this query graph like a query subprocedure call.
 The reply about the completion of the rollback will be sent by this
 graph.
 @return	own: the query graph */
-UNIV_INTERN
+static
 que_t*
 trx_roll_graph_build(
 /*=================*/
@@ -1140,147 +1253,76 @@ trx_roll_graph_build(
 	mem_heap_t*	heap;
 	que_fork_t*	fork;
 	que_thr_t*	thr;
-	/*	que_thr_t*	thr2; */
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(trx_mutex_own(trx));
 
 	heap = mem_heap_create(512);
 	fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap);
 	fork->trx = trx;
 
 	thr = que_thr_create(fork, heap);
-	/*	thr2 = que_thr_create(fork, heap); */
 
 	thr->child = row_undo_node_create(trx, thr, heap);
-	/*	thr2->child = row_undo_node_create(trx, thr2, heap); */
 
 	return(fork);
 }
 
 /*********************************************************************//**
-Finishes error processing after the necessary partial rollback has been
-done. */
+Starts a rollback operation, creates the UNDO graph that will do the
+actual undo operation.
+@return query graph thread that will perform the UNDO operations. */
 static
-void
-trx_finish_error_processing(
-/*========================*/
-	trx_t*	trx)	/*!< in: transaction */
+que_thr_t*
+trx_rollback_start(
+/*===============*/
+	trx_t*		trx,		/*!< in: transaction */
+	ib_id_t		roll_limit)	/*!< in: rollback to undo no (for
+					partial undo), 0 if we are rolling back
+					the entire transaction */
 {
-	trx_sig_t*	sig;
-	trx_sig_t*	next_sig;
-
-	ut_ad(mutex_own(&kernel_mutex));
+	que_t*		roll_graph;
 
-	sig = UT_LIST_GET_FIRST(trx->signals);
+	ut_ad(trx_mutex_own(trx));
 
-	while (sig != NULL) {
-		next_sig = UT_LIST_GET_NEXT(signals, sig);
+	ut_ad(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0);
 
-		if (sig->type == TRX_SIG_ERROR_OCCURRED) {
+	/* Initialize the rollback field in the transaction */
 
-			trx_sig_remove(trx, sig);
-		}
+	trx->roll_limit = roll_limit;
 
-		sig = next_sig;
-	}
+	ut_a(trx->roll_limit <= trx->undo_no);
 
-	trx->que_state = TRX_QUE_RUNNING;
-}
+	trx->pages_undone = 0;
 
-/*********************************************************************//**
-Finishes a partial rollback operation. */
-static
-void
-trx_finish_partial_rollback_off_kernel(
-/*===================================*/
-	trx_t*		trx,	/*!< in: transaction */
-	que_thr_t**	next_thr)/*!< in/out: next query thread to run;
-				if the value which is passed in is a pointer
-				to a NULL pointer, then the calling function
-				can start running a new query thread; if this
-				parameter is NULL, it is ignored */
-{
-	trx_sig_t*	sig;
+	if (trx->undo_no_arr == NULL) {
+		/* Single query thread -> 1 */
+		trx->undo_no_arr = trx_undo_arr_create(1);
+	}
 
-	ut_ad(mutex_own(&kernel_mutex));
+	/* Build a 'query' graph which will perform the undo operations */
 
-	sig = UT_LIST_GET_FIRST(trx->signals);
+	roll_graph = trx_roll_graph_build(trx);
 
-	/* Remove the signal from the signal queue and send reply message
-	to it */
+	trx->graph = roll_graph;
 
-	trx_sig_reply(sig, next_thr);
-	trx_sig_remove(trx, sig);
+	trx->lock.que_state = TRX_QUE_ROLLING_BACK;
 
-	trx->que_state = TRX_QUE_RUNNING;
+	return(que_fork_start_command(roll_graph));
 }
 
 /****************************************************************//**
 Finishes a transaction rollback. */
-UNIV_INTERN
+static
 void
-trx_finish_rollback_off_kernel(
-/*===========================*/
-	que_t*		graph,	/*!< in: undo graph which can now be freed */
-	trx_t*		trx,	/*!< in: transaction */
-	que_thr_t**	next_thr)/*!< in/out: next query thread to run;
-				if the value which is passed in is
-				a pointer to a NULL pointer, then the
-				calling function can start running
-				a new query thread; if this parameter is
-				NULL, it is ignored */
+trx_rollback_finish(
+/*================*/
+	trx_t*		trx)	/*!< in: transaction */
 {
-	trx_sig_t*	sig;
-	trx_sig_t*	next_sig;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
 	ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0);
 
-	/* Free the memory reserved by the undo graph */
-	que_graph_free(graph);
-
-	sig = UT_LIST_GET_FIRST(trx->signals);
-
-	if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) {
-
-		trx_finish_partial_rollback_off_kernel(trx, next_thr);
-
-		return;
-
-	} else if (sig->type == TRX_SIG_ERROR_OCCURRED) {
-
-		trx_finish_error_processing(trx);
-
-		return;
-	}
-
-#ifdef UNIV_DEBUG
-	if (lock_print_waits) {
-		fprintf(stderr, "Trx " TRX_ID_FMT " rollback finished\n",
-			(ullint) trx->id);
-	}
-#endif /* UNIV_DEBUG */
-
-	trx_commit_off_kernel(trx);
-
-	/* Remove all TRX_SIG_TOTAL_ROLLBACK signals from the signal queue and
-	send reply messages to them */
-
-	trx->que_state = TRX_QUE_RUNNING;
-
-	while (sig != NULL) {
-		next_sig = UT_LIST_GET_NEXT(signals, sig);
-
-		if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
+	trx_commit(trx);
 
-			trx_sig_reply(sig, next_thr);
-
-			trx_sig_remove(trx, sig);
-		}
-
-		sig = next_sig;
-	}
+	trx->lock.que_state = TRX_QUE_RUNNING;
 }
 
 /*********************************************************************//**
@@ -1294,11 +1336,11 @@ roll_node_create(
 {
 	roll_node_t*	node;
 
-	node = mem_heap_alloc(heap, sizeof(roll_node_t));
-	node->common.type = QUE_NODE_ROLLBACK;
+	node = static_cast<roll_node_t*>(mem_heap_zalloc(heap, sizeof(*node)));
+
 	node->state = ROLL_NODE_SEND;
 
-	node->partial = FALSE;
+	node->common.type = QUE_NODE_ROLLBACK;
 
 	return(node);
 }
@@ -1313,10 +1355,8 @@ trx_rollback_step(
 	que_thr_t*	thr)	/*!< in: query thread */
 {
 	roll_node_t*	node;
-	ulint		sig_no;
-	trx_savept_t*	savept;
 
-	node = thr->run_node;
+	node = static_cast<roll_node_t*>(thr->run_node);
 
 	ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
 
@@ -1325,33 +1365,30 @@ trx_rollback_step(
 	}
 
 	if (node->state == ROLL_NODE_SEND) {
-		mutex_enter(&kernel_mutex);
+		trx_t*		trx;
+		ib_id_t		roll_limit = 0;
 
-		node->state = ROLL_NODE_WAIT;
+		trx = thr_get_trx(thr);
 
-		if (node->partial) {
-			sig_no = TRX_SIG_ROLLBACK_TO_SAVEPT;
-			savept = &(node->savept);
-		} else {
-			sig_no = TRX_SIG_TOTAL_ROLLBACK;
-			savept = NULL;
-		}
+		trx_mutex_enter(trx);
 
-		/* Send a rollback signal to the transaction */
+		node->state = ROLL_NODE_WAIT;
 
-		trx_sig_send(thr_get_trx(thr), sig_no, TRX_SIG_SELF, thr,
-			     savept, NULL);
+		ut_a(node->undo_thr == NULL);
 
-		thr->state = QUE_THR_SIG_REPLY_WAIT;
+		roll_limit = node->partial ? node->savept.least_undo_no : 0;
 
-		mutex_exit(&kernel_mutex);
+		trx_commit_or_rollback_prepare(trx);
 
-		return(NULL);
-	}
+		node->undo_thr = trx_rollback_start(trx, roll_limit);
 
-	ut_ad(node->state == ROLL_NODE_WAIT);
+		trx_mutex_exit(trx);
 
-	thr->run_node = que_node_get_parent(node);
+	} else {
+		ut_ad(node->state == ROLL_NODE_WAIT);
+
+		thr->run_node = que_node_get_parent(node);
+	}
 
 	return(thr);
 }
diff --git a/storage/innobase/trx/trx0rseg.c b/storage/innobase/trx/trx0rseg.cc
index 85beac8afbc..003d1036a8c 100644
--- a/storage/innobase/trx/trx0rseg.c
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2011, Oracle Corpn. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0rseg.c
+@file trx/trx0rseg.cc
 Rollback segment
 
 Created 3/26/1996 Heikki Tuuri
@@ -33,32 +33,14 @@ Created 3/26/1996 Heikki Tuuri
 #include "fut0lst.h"
 #include "srv0srv.h"
 #include "trx0purge.h"
+#include "ut0bh.h"
+#include "srv0mon.h"
 
 #ifdef UNIV_PFS_MUTEX
 /* Key to register rseg_mutex_key with performance schema */
 UNIV_INTERN mysql_pfs_key_t	rseg_mutex_key;
 #endif /* UNIV_PFS_MUTEX */
 
-/******************************************************************//**
-Looks for a rollback segment, based on the rollback segment id.
-@return	rollback segment */
-UNIV_INTERN
-trx_rseg_t*
-trx_rseg_get_on_id(
-/*===============*/
-	ulint	id)	/*!< in: rollback segment id */
-{
-	trx_rseg_t*	rseg;
-
-	ut_a(id < TRX_SYS_N_RSEGS);
-
-	rseg = trx_sys->rseg_array[id];
-
-	ut_a(rseg == NULL || id == rseg->id);
-
-	return(rseg);
-}
-
 /****************************************************************//**
 Creates a rollback segment header. This function is called only when
 a new rollback segment is created in the database.
@@ -81,13 +63,11 @@ trx_rseg_header_create(
 	buf_block_t*	block;
 
 	ut_ad(mtr);
-	ut_ad(mutex_own(&kernel_mutex));
 	ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
 				MTR_MEMO_X_LOCK));
 
 	/* Allocate a new file segment for the rollback segment */
-	block = fseg_create(space, 0,
-			    TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
+	block = fseg_create(space, 0, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
 
 	if (block == NULL) {
 		/* No space left */
@@ -137,6 +117,7 @@ trx_rseg_mem_free(
 	trx_rseg_t*	rseg)	/* in, own: instance to free */
 {
 	trx_undo_t*	undo;
+	trx_undo_t*	next_undo;
 
 	mutex_free(&rseg->mutex);
 
@@ -144,29 +125,36 @@ trx_rseg_mem_free(
 	ut_a(UT_LIST_GET_LEN(rseg->update_undo_list) == 0);
 	ut_a(UT_LIST_GET_LEN(rseg->insert_undo_list) == 0);
 
-	undo = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+	for (undo = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+	     undo != NULL;
+	     undo = next_undo) {
+
+		next_undo = UT_LIST_GET_NEXT(undo_list, undo);
 
-	while (undo != NULL) {
-		trx_undo_t*	prev_undo = undo;
+		UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo);
 
-		undo = UT_LIST_GET_NEXT(undo_list, undo);
-		UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, prev_undo);
+		MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
 
-		trx_undo_mem_free(prev_undo);
+		trx_undo_mem_free(undo);
 	}
 
-	undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+	for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+	     undo != NULL;
+	     undo = next_undo) {
 
-	while (undo != NULL) {
-		trx_undo_t*	prev_undo = undo;
+		next_undo = UT_LIST_GET_NEXT(undo_list, undo);
 
-		undo = UT_LIST_GET_NEXT(undo_list, undo);
-		UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, prev_undo);
+		UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo);
 
-		trx_undo_mem_free(prev_undo);
+		MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+
+		trx_undo_mem_free(undo);
 	}
 
-	trx_sys_set_nth_rseg(trx_sys, rseg->id, NULL);
+	/* const_cast<trx_rseg_t*>() because this function is
+	like a destructor.  */
+
+	*((trx_rseg_t**) trx_sys->rseg_array + rseg->id) = NULL;
 
 	mem_free(rseg);
 }
@@ -198,9 +186,7 @@ trx_rseg_mem_create(
 	trx_ulogf_t*	undo_log_hdr;
 	ulint		sum_of_undo_sizes;
 
-	ut_ad(mutex_own(&kernel_mutex));
-
-	rseg = mem_zalloc(sizeof(trx_rseg_t));
+	rseg = static_cast<trx_rseg_t*>(mem_zalloc(sizeof(trx_rseg_t)));
 
 	rseg->id = id;
 	rseg->space = space;
@@ -209,41 +195,43 @@ trx_rseg_mem_create(
 
 	mutex_create(rseg_mutex_key, &rseg->mutex, SYNC_RSEG);
 
-	UT_LIST_ADD_LAST(rseg_list, trx_sys->rseg_list, rseg);
-
-	trx_sys_set_nth_rseg(trx_sys, id, rseg);
+	/* const_cast<trx_rseg_t*>() because this function is
+	like a constructor.  */
+	*((trx_rseg_t**) trx_sys->rseg_array + rseg->id) = rseg;
 
 	rseg_header = trx_rsegf_get_new(space, zip_size, page_no, mtr);
 
-	rseg->max_size = mtr_read_ulint(rseg_header + TRX_RSEG_MAX_SIZE,
-					MLOG_4BYTES, mtr);
+	rseg->max_size = mtr_read_ulint(
+		rseg_header + TRX_RSEG_MAX_SIZE, MLOG_4BYTES, mtr);
 
 	/* Initialize the undo log lists according to the rseg header */
 
 	sum_of_undo_sizes = trx_undo_lists_init(rseg);
 
-	rseg->curr_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
-					 MLOG_4BYTES, mtr)
+	rseg->curr_size = mtr_read_ulint(
+		rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr)
 		+ 1 + sum_of_undo_sizes;
 
 	len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr);
+
 	if (len > 0) {
-		const void*	ptr;
 		rseg_queue_t	rseg_queue;
 
 		trx_sys->rseg_history_len += len;
 
 		node_addr = trx_purge_get_log_from_hist(
 			flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr));
+
 		rseg->last_page_no = node_addr.page;
 		rseg->last_offset = node_addr.boffset;
 
-		undo_log_hdr = trx_undo_page_get(rseg->space, rseg->zip_size,
-						 node_addr.page,
-						 mtr) + node_addr.boffset;
+		undo_log_hdr = trx_undo_page_get(
+			rseg->space, rseg->zip_size, node_addr.page,
+			mtr) + node_addr.boffset;
 
 		rseg->last_trx_no = mach_read_from_8(
 			undo_log_hdr + TRX_UNDO_TRX_NO);
+
 		rseg->last_del_marks = mtr_read_ulint(
 			undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr);
 
@@ -251,6 +239,8 @@ trx_rseg_mem_create(
 		rseg_queue.trx_no = rseg->last_trx_no;
 
 		if (rseg->last_page_no != FIL_NULL) {
+			const void*	ptr;
+
 			/* There is no need to cover this operation by the purge
 			mutex because we are still bootstrapping. */
 
@@ -266,7 +256,7 @@ trx_rseg_mem_create(
 
 /********************************************************************
 Creates the memory copies for the rollback segments and initializes the
-rseg list and array in trx_sys at a database startup. */
+rseg array in trx_sys at a database startup. */
 static
 void
 trx_rseg_create_instance(
@@ -282,9 +272,7 @@ trx_rseg_create_instance(
 
 		page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
 
-		if (page_no == FIL_NULL) {
-			trx_sys_set_nth_rseg(trx_sys, i, NULL);
-		} else {
+		if (page_no != FIL_NULL) {
 			ulint		space;
 			ulint		zip_size;
 			trx_rseg_t*	rseg = NULL;
@@ -299,6 +287,8 @@ trx_rseg_create_instance(
 				i, space, zip_size, page_no, ib_bh, mtr);
 
 			ut_a(rseg->id == i);
+		} else {
+			ut_a(trx_sys->rseg_array[i] == NULL);
 		}
 	}
 }
@@ -308,8 +298,9 @@ Creates a rollback segment.
 @return pointer to new rollback segment if create successful */
 UNIV_INTERN
 trx_rseg_t*
-trx_rseg_create(void)
-/*=================*/
+trx_rseg_create(
+/*============*/
+	ulint		space)		/*!< in: id of UNDO tablespace */
 {
 	mtr_t		mtr;
 	ulint		slot_no;
@@ -318,29 +309,26 @@ trx_rseg_create(void)
 	mtr_start(&mtr);
 
 	/* To obey the latching order, acquire the file space
-	x-latch before the kernel mutex. */
-	mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), &mtr);
-
-	mutex_enter(&kernel_mutex);
+	x-latch before the trx_sys->mutex. */
+	mtr_x_lock(fil_space_get_latch(space, NULL), &mtr);
 
 	slot_no = trx_sysf_rseg_find_free(&mtr);
 
 	if (slot_no != ULINT_UNDEFINED) {
-		ulint		space;
+		ulint		id;
 		ulint		page_no;
 		ulint		zip_size;
 		trx_sysf_t*	sys_header;
 
 		page_no = trx_rseg_header_create(
-			TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, &mtr);
+			space, 0, ULINT_MAX, slot_no, &mtr);
 
 		ut_a(page_no != FIL_NULL);
 
-		ut_ad(!trx_rseg_get_on_id(slot_no));
-
 		sys_header = trx_sysf_get(&mtr);
 
-		space = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr);
+		id = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr);
+		ut_a(id == space);
 
 		zip_size = space ? fil_space_get_zip_size(space) : 0;
 
@@ -349,26 +337,89 @@ trx_rseg_create(void)
 			purge_sys->ib_bh, &mtr);
 	}
 
-	mutex_exit(&kernel_mutex);
 	mtr_commit(&mtr);
 
 	return(rseg);
 }
 
-/********************************************************************
-Initialize the rollback instance list. */
+/*********************************************************************//**
+Creates the memory copies for rollback segments and initializes the
+rseg array in trx_sys at a database startup. */
 UNIV_INTERN
 void
-trx_rseg_list_and_array_init(
-/*=========================*/
-	trx_sysf_t*	sys_header,	/*!< in: trx system header */
+trx_rseg_array_init(
+/*================*/
+	trx_sysf_t*	sys_header,	/* in/out: trx system header */
 	ib_bh_t*	ib_bh,		/*!< in: rseg queue */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	UT_LIST_INIT(trx_sys->rseg_list);
-
 	trx_sys->rseg_history_len = 0;
 
 	trx_rseg_create_instance(sys_header, ib_bh, mtr);
 }
 
+/********************************************************************
+Get the number of unique rollback tablespaces in use except space id 0.
+The last space id will be the sentinel value ULINT_UNDEFINED. The array
+will be sorted on space id. Note: space_ids should have have space for
+TRX_SYS_N_RSEGS + 1 elements.
+@return number of unique rollback tablespaces in use. */
+UNIV_INTERN
+ulint
+trx_rseg_get_n_undo_tablespaces(
+/*============================*/
+	ulint*		space_ids)	/*!< out: array of space ids of
+					UNDO tablespaces */
+{
+	ulint		i;
+	mtr_t		mtr;
+	trx_sysf_t*	sys_header;
+	ulint		n_undo_tablespaces = 0;
+	ulint		space_ids_aux[TRX_SYS_N_RSEGS + 1];
+
+	mtr_start(&mtr);
+
+	sys_header = trx_sysf_get(&mtr);
+
+	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+		ulint	page_no;
+		ulint	space;
+
+		page_no = trx_sysf_rseg_get_page_no(sys_header, i, &mtr);
+
+		if (page_no == FIL_NULL) {
+			continue;
+		}
+
+		space = trx_sysf_rseg_get_space(sys_header, i, &mtr);
+
+		if (space != 0) {
+			ulint	j;
+			ibool	found = FALSE;
+
+			for (j = 0; j < n_undo_tablespaces; ++j) {
+				if (space_ids[j] == space) {
+					found = TRUE;
+					break;
+				}
+			}
+
+			if (!found) {
+				ut_a(n_undo_tablespaces <= i);
+				space_ids[n_undo_tablespaces++] = space;
+			}
+		}
+	}
+
+	mtr_commit(&mtr);
+
+	ut_a(n_undo_tablespaces <= TRX_SYS_N_RSEGS);
+
+	space_ids[n_undo_tablespaces] = ULINT_UNDEFINED;
+
+	if (n_undo_tablespaces > 0) {
+		ut_ulint_sort(space_ids, space_ids_aux, 0, n_undo_tablespaces);
+	}
+
+	return(n_undo_tablespaces);
+}
diff --git a/storage/innobase/trx/trx0sys.c b/storage/innobase/trx/trx0sys.cc
index 90091c00228..97fd1f36943 100644
--- a/storage/innobase/trx/trx0sys.c
+++ b/storage/innobase/trx/trx0sys.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0sys.c
+@file trx/trx0sys.cc
 Transaction system
 
 Created 3/26/1996 Heikki Tuuri
@@ -43,6 +43,7 @@ Created 3/26/1996 Heikki Tuuri
 #include "log0recv.h"
 #include "os0file.h"
 #include "read0read.h"
+#include "buf0dblwr.h"
 
 /** The file format tag structure with id and name. */
 struct file_format_struct {
@@ -58,19 +59,6 @@ typedef struct file_format_struct	file_format_t;
 
 /** The transaction system */
 UNIV_INTERN trx_sys_t*		trx_sys		= NULL;
-/** The doublewrite buffer */
-UNIV_INTERN trx_doublewrite_t*	trx_doublewrite = NULL;
-
-/** The following is set to TRUE when we are upgrading from pre-4.1
-format data files to the multiple tablespaces format data files */
-UNIV_INTERN ibool	trx_doublewrite_must_reset_space_ids	= FALSE;
-/** Set to TRUE when the doublewrite buffer is being created */
-UNIV_INTERN ibool	trx_doublewrite_buf_is_being_created = FALSE;
-
-/** The following is TRUE when we are using the database in the
-post-4.1 format, i.e., we have successfully upgraded, or have created
-a new database installation */
-UNIV_INTERN ibool	trx_sys_multiple_tablespace_format	= FALSE;
 
 /** In a MySQL replication slave, in crash recovery we store the master log
 file name and position here. */
@@ -130,9 +118,9 @@ static const ulint	FILE_FORMAT_NAME_N
 
 #ifdef UNIV_PFS_MUTEX
 /* Key to register the mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	trx_doublewrite_mutex_key;
 UNIV_INTERN mysql_pfs_key_t	file_format_max_mutex_key;
-#endif /* UNIV_PFS_MUTEX */
+UNIV_INTERN mysql_pfs_key_t	trx_sys_mutex_key;
+#endif /* UNIV_PFS_RWLOCK */
 
 #ifdef UNIV_DEBUG
 /* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
@@ -145,515 +133,40 @@ updated via SET GLOBAL innodb_file_format_max = 'x' or when we open
 or create a table. */
 static	file_format_t	file_format_max;
 
+#ifdef UNIV_DEBUG
 /****************************************************************//**
-Determines if a page number is located inside the doublewrite buffer.
-@return TRUE if the location is inside the two blocks of the
-doublewrite buffer */
-UNIV_INTERN
-ibool
-trx_doublewrite_page_inside(
-/*========================*/
-	ulint	page_no)	/*!< in: page number */
-{
-	if (trx_doublewrite == NULL) {
-
-		return(FALSE);
-	}
-
-	if (page_no >= trx_doublewrite->block1
-	    && page_no < trx_doublewrite->block1
-	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		return(TRUE);
-	}
-
-	if (page_no >= trx_doublewrite->block2
-	    && page_no < trx_doublewrite->block2
-	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-		return(TRUE);
-	}
-
-	return(FALSE);
-}
-
-/****************************************************************//**
-Creates or initialializes the doublewrite buffer at a database start. */
-static
-void
-trx_doublewrite_init(
-/*=================*/
-	byte*	doublewrite)	/*!< in: pointer to the doublewrite buf
-				header on trx sys page */
-{
-	trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t));
-
-	/* Since we now start to use the doublewrite buffer, no need to call
-	fsync() after every write to a data file */
-#ifdef UNIV_DO_FLUSH
-	os_do_not_call_flush_at_each_write = TRUE;
-#endif /* UNIV_DO_FLUSH */
-
-	mutex_create(trx_doublewrite_mutex_key,
-		     &trx_doublewrite->mutex, SYNC_DOUBLEWRITE);
-
-	trx_doublewrite->first_free = 0;
-
-	trx_doublewrite->block1 = mach_read_from_4(
-		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
-	trx_doublewrite->block2 = mach_read_from_4(
-		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
-	trx_doublewrite->write_buf_unaligned = ut_malloc(
-		(1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE);
-
-	trx_doublewrite->write_buf = ut_align(
-		trx_doublewrite->write_buf_unaligned, UNIV_PAGE_SIZE);
-	trx_doublewrite->buf_block_arr = mem_alloc(
-		2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * sizeof(void*));
-}
-
-/****************************************************************//**
-Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
-multiple tablespace format. */
-UNIV_INTERN
-void
-trx_sys_mark_upgraded_to_multiple_tablespaces(void)
-/*===============================================*/
-{
-	buf_block_t*	block;
-	byte*		doublewrite;
-	mtr_t		mtr;
-
-	/* We upgraded to 4.1.x and reset the space id fields in the
-	doublewrite buffer. Let us mark to the trx_sys header that the upgrade
-	has been done. */
-
-	mtr_start(&mtr);
-
-	block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
-			     RW_X_LATCH, &mtr);
-	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
-
-	doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
-
-	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
-			 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
-			 MLOG_4BYTES, &mtr);
-	mtr_commit(&mtr);
-
-	/* Flush the modified pages to disk and make a checkpoint */
-	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
-
-	trx_sys_multiple_tablespace_format = TRUE;
-}
-
-/****************************************************************//**
-Creates the doublewrite buffer to a new InnoDB installation. The header of the
-doublewrite buffer is placed on the trx system header page. */
-UNIV_INTERN
-void
-trx_sys_create_doublewrite_buf(void)
-/*================================*/
-{
-	buf_block_t*	block;
-	buf_block_t*	block2;
-	buf_block_t*	new_block;
-	byte*	doublewrite;
-	byte*	fseg_header;
-	ulint	page_no;
-	ulint	prev_page_no;
-	ulint	i;
-	mtr_t	mtr;
-
-	if (trx_doublewrite) {
-		/* Already inited */
-
-		return;
-	}
-
-start_again:
-	mtr_start(&mtr);
-	trx_doublewrite_buf_is_being_created = TRUE;
-
-	block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
-			     RW_X_LATCH, &mtr);
-	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
-
-	doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
-
-	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
-	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
-		/* The doublewrite buffer has already been created:
-		just read in some numbers */
-
-		trx_doublewrite_init(doublewrite);
-
-		mtr_commit(&mtr);
-		trx_doublewrite_buf_is_being_created = FALSE;
-	} else {
-		fprintf(stderr,
-			"InnoDB: Doublewrite buffer not found:"
-			" creating new\n");
-
-		if (buf_pool_get_curr_size()
-		    < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
-			+ FSP_EXTENT_SIZE / 2 + 100)
-		       * UNIV_PAGE_SIZE)) {
-			fprintf(stderr,
-				"InnoDB: Cannot create doublewrite buffer:"
-				" you must\n"
-				"InnoDB: increase your buffer pool size.\n"
-				"InnoDB: Cannot continue operation.\n");
-
-			exit(1);
-		}
-
-		block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
-				     TRX_SYS_DOUBLEWRITE
-				     + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
-
-		/* fseg_create acquires a second latch on the page,
-		therefore we must declare it: */
-
-		buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
-
-		if (block2 == NULL) {
-			fprintf(stderr,
-				"InnoDB: Cannot create doublewrite buffer:"
-				" you must\n"
-				"InnoDB: increase your tablespace size.\n"
-				"InnoDB: Cannot continue operation.\n");
-
-			/* We exit without committing the mtr to prevent
-			its modifications to the database getting to disk */
-
-			exit(1);
-		}
-
-		fseg_header = buf_block_get_frame(block)
-			+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
-		prev_page_no = 0;
-
-		for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
-			     + FSP_EXTENT_SIZE / 2; i++) {
-			new_block = fseg_alloc_free_page(
-				fseg_header, prev_page_no + 1, FSP_UP, &mtr);
-			if (new_block == NULL) {
-				fprintf(stderr,
-					"InnoDB: Cannot create doublewrite"
-					" buffer: you must\n"
-					"InnoDB: increase your"
-					" tablespace size.\n"
-					"InnoDB: Cannot continue operation.\n"
-					);
-
-				exit(1);
-			}
-
-			/* We read the allocated pages to the buffer pool;
-			when they are written to disk in a flush, the space
-			id and page number fields are also written to the
-			pages. When we at database startup read pages
-			from the doublewrite buffer, we know that if the
-			space id and page number in them are the same as
-			the page position in the tablespace, then the page
-			has not been written to in doublewrite. */
-
-			ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
-			page_no = buf_block_get_page_no(new_block);
-
-			if (i == FSP_EXTENT_SIZE / 2) {
-				ut_a(page_no == FSP_EXTENT_SIZE);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_BLOCK1,
-						 page_no, MLOG_4BYTES, &mtr);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_REPEAT
-						 + TRX_SYS_DOUBLEWRITE_BLOCK1,
-						 page_no, MLOG_4BYTES, &mtr);
-			} else if (i == FSP_EXTENT_SIZE / 2
-				   + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-				ut_a(page_no == 2 * FSP_EXTENT_SIZE);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_BLOCK2,
-						 page_no, MLOG_4BYTES, &mtr);
-				mlog_write_ulint(doublewrite
-						 + TRX_SYS_DOUBLEWRITE_REPEAT
-						 + TRX_SYS_DOUBLEWRITE_BLOCK2,
-						 page_no, MLOG_4BYTES, &mtr);
-			} else if (i > FSP_EXTENT_SIZE / 2) {
-				ut_a(page_no == prev_page_no + 1);
-			}
-
-			prev_page_no = page_no;
-		}
-
-		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
-				 TRX_SYS_DOUBLEWRITE_MAGIC_N,
-				 MLOG_4BYTES, &mtr);
-		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
-				 + TRX_SYS_DOUBLEWRITE_REPEAT,
-				 TRX_SYS_DOUBLEWRITE_MAGIC_N,
-				 MLOG_4BYTES, &mtr);
-
-		mlog_write_ulint(doublewrite
-				 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
-				 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
-				 MLOG_4BYTES, &mtr);
-		mtr_commit(&mtr);
-
-		/* Flush the modified pages to disk and make a checkpoint */
-		log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
-
-		fprintf(stderr, "InnoDB: Doublewrite buffer created\n");
-
-		trx_sys_multiple_tablespace_format = TRUE;
-
-		goto start_again;
-	}
-}
-
-/****************************************************************//**
-At a database startup initializes the doublewrite buffer memory structure if
-we already have a doublewrite buffer created in the data files. If we are
-upgrading to an InnoDB version which supports multiple tablespaces, then this
-function performs the necessary update operations. If we are in a crash
-recovery, this function uses a possible doublewrite buffer to restore
-half-written pages in the data files. */
-UNIV_INTERN
-void
-trx_sys_doublewrite_init_or_restore_pages(
-/*======================================*/
-	ibool	restore_corrupt_pages)	/*!< in: TRUE=restore pages */
-{
-	byte*	buf;
-	byte*	read_buf;
-	byte*	unaligned_read_buf;
-	ulint	block1;
-	ulint	block2;
-	ulint	source_page_no;
-	byte*	page;
-	byte*	doublewrite;
-	ulint	space_id;
-	ulint	page_no;
-	ulint	i;
-
-	/* We do the file i/o past the buffer pool */
-
-	unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
-	read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE);
-
-	/* Read the trx sys header to check if we are using the doublewrite
-	buffer */
-
-	fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0,
-	       UNIV_PAGE_SIZE, read_buf, NULL);
-	doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
-
-	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
-	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
-		/* The doublewrite buffer has been created */
-
-		trx_doublewrite_init(doublewrite);
-
-		block1 = trx_doublewrite->block1;
-		block2 = trx_doublewrite->block2;
-
-		buf = trx_doublewrite->write_buf;
-	} else {
-		goto leave_func;
-	}
-
-	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
-	    != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
-
-		/* We are upgrading from a version < 4.1.x to a version where
-		multiple tablespaces are supported. We must reset the space id
-		field in the pages in the doublewrite buffer because starting
-		from this version the space id is stored to
-		FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
-
-		trx_doublewrite_must_reset_space_ids = TRUE;
-
-		fprintf(stderr,
-			"InnoDB: Resetting space id's in the"
-			" doublewrite buffer\n");
-	} else {
-		trx_sys_multiple_tablespace_format = TRUE;
-	}
-
-	/* Read the pages from the doublewrite buffer to memory */
-
-	fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block1, 0,
-	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
-	       buf, NULL);
-	fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block2, 0,
-	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
-	       buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
-	       NULL);
-	/* Check if any of these pages is half-written in data files, in the
-	intended position */
-
-	page = buf;
-
-	for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
-
-		page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
-
-		if (trx_doublewrite_must_reset_space_ids) {
-
-			space_id = 0;
-			mach_write_to_4(page
-					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
-			/* We do not need to calculate new checksums for the
-			pages because the field .._SPACE_ID does not affect
-			them. Write the page back to where we read it from. */
-
-			if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
-				source_page_no = block1 + i;
-			} else {
-				source_page_no = block2
-					+ i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
-			}
-
-			fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0,
-			       UNIV_PAGE_SIZE, page, NULL);
-			/* printf("Resetting space id in page %lu\n",
-			source_page_no); */
-		} else {
-			space_id = mach_read_from_4(
-				page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
-		}
-
-		if (!restore_corrupt_pages) {
-			/* The database was shut down gracefully: no need to
-			restore pages */
-
-		} else if (!fil_tablespace_exists_in_mem(space_id)) {
-			/* Maybe we have dropped the single-table tablespace
-			and this page once belonged to it: do nothing */
-
-		} else if (!fil_check_adress_in_tablespace(space_id,
-							   page_no)) {
-			fprintf(stderr,
-				"InnoDB: Warning: a page in the"
-				" doublewrite buffer is not within space\n"
-				"InnoDB: bounds; space id %lu"
-				" page number %lu, page %lu in"
-				" doublewrite buf.\n",
-				(ulong) space_id, (ulong) page_no, (ulong) i);
-
-		} else if (space_id == TRX_SYS_SPACE
-			   && ((page_no >= block1
-				&& page_no
-				< block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
-			       || (page_no >= block2
-				   && page_no
-				   < (block2
-				      + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
-
-			/* It is an unwritten doublewrite buffer page:
-			do nothing */
-		} else {
-			ulint	zip_size = fil_space_get_zip_size(space_id);
-
-			/* Read in the actual page from the file */
-			fil_io(OS_FILE_READ, TRUE, space_id, zip_size,
-			       page_no, 0,
-			       zip_size ? zip_size : UNIV_PAGE_SIZE,
-			       read_buf, NULL);
-
-			/* Check if the page is corrupt */
-
-			if (UNIV_UNLIKELY
-			    (buf_page_is_corrupted(read_buf, zip_size))) {
-
-				fprintf(stderr,
-					"InnoDB: Warning: database page"
-					" corruption or a failed\n"
-					"InnoDB: file read of"
-					" space %lu page %lu.\n"
-					"InnoDB: Trying to recover it from"
-					" the doublewrite buffer.\n",
-					(ulong) space_id, (ulong) page_no);
-
-				if (buf_page_is_corrupted(page, zip_size)) {
-					fprintf(stderr,
-						"InnoDB: Dump of the page:\n");
-					buf_page_print(
-						read_buf, zip_size,
-						BUF_PAGE_PRINT_NO_CRASH);
-					fprintf(stderr,
-						"InnoDB: Dump of"
-						" corresponding page"
-						" in doublewrite buffer:\n");
-					buf_page_print(
-						page, zip_size,
-						BUF_PAGE_PRINT_NO_CRASH);
-
-					fprintf(stderr,
-						"InnoDB: Also the page in the"
-						" doublewrite buffer"
-						" is corrupt.\n"
-						"InnoDB: Cannot continue"
-						" operation.\n"
-						"InnoDB: You can try to"
-						" recover the database"
-						" with the my.cnf\n"
-						"InnoDB: option:\n"
-						"InnoDB:"
-						" innodb_force_recovery=6\n");
-					ut_error;
-				}
-
-				/* Write the good page from the
-				doublewrite buffer to the intended
-				position */
-
-				fil_io(OS_FILE_WRITE, TRUE, space_id,
-				       zip_size, page_no, 0,
-				       zip_size ? zip_size : UNIV_PAGE_SIZE,
-				       page, NULL);
-				fprintf(stderr,
-					"InnoDB: Recovered the page from"
-					" the doublewrite buffer.\n");
-			}
-		}
-
-		page += UNIV_PAGE_SIZE;
-	}
-
-	fil_flush_file_spaces(FIL_TABLESPACE);
-
-leave_func:
-	ut_free(unaligned_read_buf);
-}
-
-/****************************************************************//**
-Checks that trx is in the trx list.
+Checks whether a trx is in one of rw_trx_list or ro_trx_list.
 @return	TRUE if is in */
 UNIV_INTERN
 ibool
 trx_in_trx_list(
 /*============*/
-	trx_t*	in_trx)	/*!< in: trx */
+	const trx_t*	in_trx)	/*!< in: transaction */
 {
-	trx_t*	trx;
+	const trx_t*	trx;
+	trx_list_t*	trx_list;
 
-	ut_ad(mutex_own(&(kernel_mutex)));
+	/* Non-locking autocommits should not hold any locks. */
+	assert_trx_in_list(in_trx);
 
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	trx_list = in_trx->read_only
+		? &trx_sys->ro_trx_list : &trx_sys->rw_trx_list;
 
-	while (trx != NULL) {
+	ut_ad(mutex_own(&trx_sys->mutex));
 
-		if (trx == in_trx) {
+	ut_ad(trx_assert_started(in_trx));
 
-			return(TRUE);
-		}
+	for (trx = UT_LIST_GET_FIRST(*trx_list);
+	     trx != NULL && trx != in_trx;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
 
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
+		assert_trx_in_list(trx);
+		ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
 	}
 
-	return(FALSE);
+	return(trx != NULL);
 }
+#endif /* UNIV_DEBUG */
 
 /*****************************************************************//**
 Writes the value of max_trx_id to the file based trx system header. */
@@ -662,10 +175,10 @@ void
 trx_sys_flush_max_trx_id(void)
 /*==========================*/
 {
-	trx_sysf_t*	sys_header;
 	mtr_t		mtr;
+	trx_sysf_t*	sys_header;
 
-	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(mutex_own(&trx_sys->mutex));
 
 	mtr_start(&mtr);
 
@@ -771,8 +284,8 @@ trx_sys_print_mysql_binlog_offset(void)
 		+ TRX_SYS_MYSQL_LOG_OFFSET_LOW);
 
 	trx_sys_mysql_bin_log_pos
-		= (((ib_int64_t)trx_sys_mysql_bin_log_pos_high) << 32)
-		+ (ib_int64_t)trx_sys_mysql_bin_log_pos_low;
+		= (((ib_int64_t) trx_sys_mysql_bin_log_pos_high) << 32)
+		+ (ib_int64_t) trx_sys_mysql_bin_log_pos_low;
 
 	ut_memcpy(trx_sys_mysql_bin_log_name,
 		  sys_header + TRX_SYS_MYSQL_LOG_INFO
@@ -850,15 +363,13 @@ trx_sysf_rseg_find_free(
 /*====================*/
 	mtr_t*	mtr)	/*!< in: mtr */
 {
-	trx_sysf_t*	sys_header;
-	ulint		page_no;
 	ulint		i;
-
-	ut_ad(mutex_own(&(kernel_mutex)));
+	trx_sysf_t*	sys_header;
 
 	sys_header = trx_sysf_get(mtr);
 
 	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+		ulint	page_no;
 
 		page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
 
@@ -895,7 +406,6 @@ trx_sysf_create(
 	to the latching order rules. */
 
 	mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr);
-	mutex_enter(&kernel_mutex);
 
 	/* Create the trx sys file block in a new allocated file segment */
 	block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
@@ -941,10 +451,9 @@ trx_sysf_create(
 	slot_no = trx_sysf_rseg_find_free(mtr);
 	page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no,
 					 mtr);
+
 	ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
 	ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO);
-
-	mutex_exit(&kernel_mutex);
 }
 
 /*****************************************************************//**
@@ -974,24 +483,18 @@ trx_rseg_compare_last_trx_no(
 
 /*****************************************************************//**
 Creates and initializes the central memory structures for the transaction
-system. This is called when the database is started. */
+system. This is called when the database is started.
+@return min binary heap of rsegs to purge */
 UNIV_INTERN
-void
+ib_bh_t*
 trx_sys_init_at_db_start(void)
 /*==========================*/
 {
+	mtr_t		mtr;
+	ib_bh_t*	ib_bh;
 	trx_sysf_t*	sys_header;
 	ib_uint64_t	rows_to_undo	= 0;
 	const char*	unit		= "";
-	trx_t*		trx;
-	mtr_t		mtr;
-	ib_bh_t*	ib_bh;
-
-	mtr_start(&mtr);
-
-	ut_ad(trx_sys == NULL);
-
-	mutex_enter(&kernel_mutex);
 
 	/* We create the min binary heap here and pass ownership to
 	purge when we init the purge sub-system. Purge is responsible
@@ -1001,13 +504,13 @@ trx_sys_init_at_db_start(void)
 		trx_rseg_compare_last_trx_no,
 		sizeof(rseg_queue_t), TRX_SYS_N_RSEGS);
 
-	trx_sys = mem_zalloc(sizeof(*trx_sys));
+	mtr_start(&mtr);
 
 	sys_header = trx_sysf_get(&mtr);
 
-	trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr);
-
-	trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+	if (srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
+		trx_rseg_array_init(sys_header, ib_bh, &mtr);
+	}
 
 	/* VERY important: after the database is started, max_trx_id value is
 	divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
@@ -1022,22 +525,31 @@ trx_sys_init_at_db_start(void)
 				     TRX_SYS_TRX_ID_WRITE_MARGIN);
 
 	UT_LIST_INIT(trx_sys->mysql_trx_list);
+
 	trx_dummy_sess = sess_open();
+
 	trx_lists_init_at_db_start();
 
-	if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
-		trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	/* This S lock is not strictly required, it is here only to satisfy
+	the debug code (assertions). We are still running in single threaded
+	bootstrap mode. */
 
-		for (;;) {
+	mutex_enter(&trx_sys->mutex);
 
-			if (trx->conc_state != TRX_PREPARED) {
-				rows_to_undo += trx->undo_no;
-			}
+	ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0);
 
-			trx = UT_LIST_GET_NEXT(trx_list, trx);
+	if (UT_LIST_GET_LEN(trx_sys->rw_trx_list) > 0) {
+		const trx_t*	trx;
 
-			if (!trx) {
-				break;
+		for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+		     trx != NULL;
+		     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+			ut_ad(trx->is_recovered);
+			assert_trx_in_rw_list(trx);
+
+			if (trx_state_eq(trx, TRX_STATE_ACTIVE)) {
+				rows_to_undo += trx->undo_no;
 			}
 		}
 
@@ -1050,30 +562,43 @@ trx_sys_init_at_db_start(void)
 			"InnoDB: %lu transaction(s) which must be"
 			" rolled back or cleaned up\n"
 			"InnoDB: in total %lu%s row operations to undo\n",
-			(ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
+			(ulong) UT_LIST_GET_LEN(trx_sys->rw_trx_list),
 			(ulong) rows_to_undo, unit);
 
 		fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
-			(ullint) trx_sys->max_trx_id);
+			trx_sys->max_trx_id);
 	}
 
-	UT_LIST_INIT(trx_sys->view_list);
-
-	/* Transfer ownership to purge. */
-	trx_purge_sys_create(ib_bh);
+	mutex_exit(&trx_sys->mutex);
 
-	mutex_exit(&kernel_mutex);
+	UT_LIST_INIT(trx_sys->view_list);
 
 	mtr_commit(&mtr);
+
+	return(ib_bh);
 }
 
 /*****************************************************************//**
-Creates and initializes the transaction system at the database creation. */
+Creates the trx_sys instance and initializes ib_bh and mutex. */
 UNIV_INTERN
 void
 trx_sys_create(void)
 /*================*/
 {
+	ut_ad(trx_sys == NULL);
+
+	trx_sys = static_cast<trx_sys_t*>(mem_zalloc(sizeof(*trx_sys)));
+
+	mutex_create(trx_sys_mutex_key, &trx_sys->mutex, SYNC_TRX_SYS);
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+UNIV_INTERN
+void
+trx_sys_create_sys_pages(void)
+/*==========================*/
+{
 	mtr_t	mtr;
 
 	mtr_start(&mtr);
@@ -1081,8 +606,6 @@ trx_sys_create(void)
 	trx_sysf_create(&mtr);
 
 	mtr_commit(&mtr);
-
-	trx_sys_init_at_db_start();
 }
 
 /*****************************************************************//**
@@ -1175,7 +698,7 @@ trx_sys_file_format_id_to_name(
 
 /*****************************************************************//**
 Check for the max file format tag stored on disk. Note: If max_format_id
-is == DICT_TF_FORMAT_MAX + 1 then we only print a warning.
+is == UNIV_FORMAT_MAX + 1 then we only print a warning.
 @return	DB_SUCCESS or error code */
 UNIV_INTERN
 ulint
@@ -1192,15 +715,15 @@ trx_sys_file_format_max_check(
 	if (format_id == ULINT_UNDEFINED) {
 		/* Format ID was not set. Set it to minimum possible
 		value. */
-		format_id = DICT_TF_FORMAT_MIN;
+		format_id = UNIV_FORMAT_MIN;
 	}
 
 	ut_print_timestamp(stderr);
 	fprintf(stderr,
 		" InnoDB: highest supported file format is %s.\n",
-		trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX));
+		trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX));
 
-	if (format_id > DICT_TF_FORMAT_MAX) {
+	if (format_id > UNIV_FORMAT_MAX) {
 
 		ut_a(format_id < FILE_FORMAT_NAME_N);
 
@@ -1208,11 +731,11 @@ trx_sys_file_format_max_check(
 		fprintf(stderr,
 			" InnoDB: %s: the system tablespace is in a file "
 			"format that this version doesn't support - %s\n",
-			((max_format_id <= DICT_TF_FORMAT_MAX)
+			((max_format_id <= UNIV_FORMAT_MAX)
 				? "Error" : "Warning"),
 			trx_sys_file_format_id_to_name(format_id));
 
-		if (max_format_id <= DICT_TF_FORMAT_MAX) {
+		if (max_format_id <= UNIV_FORMAT_MAX) {
 			return(DB_ERROR);
 		}
 	}
@@ -1241,7 +764,7 @@ trx_sys_file_format_max_set(
 {
 	ibool		ret = FALSE;
 
-	ut_a(format_id <= DICT_TF_FORMAT_MAX);
+	ut_a(format_id <= UNIV_FORMAT_MAX);
 
 	mutex_enter(&file_format_max.mutex);
 
@@ -1272,7 +795,7 @@ trx_sys_file_format_tag_init(void)
 
 	/* If format_id is not set then set it to the minimum. */
 	if (format_id == ULINT_UNDEFINED) {
-		trx_sys_file_format_max_set(DICT_TF_FORMAT_MIN, NULL);
+		trx_sys_file_format_max_set(UNIV_FORMAT_MIN, NULL);
 	}
 }
 
@@ -1291,7 +814,7 @@ trx_sys_file_format_max_upgrade(
 
 	ut_a(name);
 	ut_a(file_format_max.name != NULL);
-	ut_a(format_id <= DICT_TF_FORMAT_MAX);
+	ut_a(format_id <= UNIV_FORMAT_MAX);
 
 	mutex_enter(&file_format_max.mutex);
 
@@ -1328,7 +851,7 @@ trx_sys_file_format_init(void)
 
 	/* We don't need a mutex here, as this function should only
 	be called once at start up. */
-	file_format_max.id = DICT_TF_FORMAT_MIN;
+	file_format_max.id = UNIV_FORMAT_MIN;
 
 	file_format_max.name = trx_sys_file_format_id_to_name(
 		file_format_max.id);
@@ -1345,36 +868,69 @@ trx_sys_file_format_close(void)
 }
 
 /*********************************************************************
-Creates the rollback segments */
+Creates the rollback segments.
+@return number of rollback segments that are active. */
 UNIV_INTERN
-void
+ulint
 trx_sys_create_rsegs(
 /*=================*/
+	ulint	n_spaces,	/*!< number of tablespaces for UNDO logs */
 	ulint	n_rsegs)	/*!< number of rollback segments to create */
 {
-	ulint	new_rsegs = 0;
+	mtr_t	mtr;
+	ulint	n_used;
 
-	/* Do not create additional rollback segments if
-	innodb_force_recovery has been set and the database
-	was not shutdown cleanly. */
-	if (!srv_force_recovery && !recv_needed_recovery) {
+	ut_a(n_spaces < TRX_SYS_N_RSEGS);
+	ut_a(n_rsegs <= TRX_SYS_N_RSEGS);
+
+	if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) {
+		return(ULINT_UNDEFINED);
+	}
+
+	/* This is executed in single-threaded mode therefore it is not
+	necessary to use the same mtr in trx_rseg_create(). n_used cannot
+	change while the function is executing. */
+
+	mtr_start(&mtr);
+	n_used = trx_sysf_rseg_find_free(&mtr);
+	mtr_commit(&mtr);
+
+	if (n_used == ULINT_UNDEFINED) {
+		n_used = TRX_SYS_N_RSEGS;
+	}
+
+	/* Do not create additional rollback segments if innodb_force_recovery
+	has been set and the database was not shutdown cleanly. */
+
+	if (!srv_force_recovery && !recv_needed_recovery && n_used < n_rsegs) {
 		ulint	i;
+		ulint	new_rsegs = n_rsegs - n_used;
+
+		for (i = 0; i < new_rsegs; ++i) {
+			ulint	space;
 
-		for (i = 0;  i < n_rsegs; ++i) {
+			/* Tablespace 0 is the system tablespace. All UNDO
+			log tablespaces start from 1. */
 
-			if (trx_rseg_create() != NULL) {
-				++new_rsegs;
+			if (n_spaces > 0) {
+				space = (i % n_spaces) + 1;
+			} else {
+				space = 0; /* System tablespace */
+			}
+
+			if (trx_rseg_create(space) != NULL) {
+				++n_used;
 			} else {
 				break;
 			}
 		}
 	}
 
-	if (new_rsegs > 0) {
-		fprintf(stderr,
-			"InnoDB: %lu rollback segment(s) active.\n",
-		       	new_rsegs);
-	}
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: %lu rollback segment(s) are active.\n",
+		n_used);
+
+	return(n_used);
 }
 
 #else /* !UNIV_HOTBACKUP */
@@ -1411,23 +967,6 @@ trx_sys_print_mysql_binlog_offset_from_page(
 	}
 }
 
-
-/* THESE ARE COPIED FROM NON-HOTBACKUP PART OF THE INNODB SOURCE TREE
-   (This code duplication should be fixed at some point!)
-*/
-
-#define	TRX_SYS_SPACE	0	/* the SYSTEM tablespace */
-/* The offset of the file format tag on the trx system header page */
-#define TRX_SYS_FILE_FORMAT_TAG		(UNIV_PAGE_SIZE - 16)
-/* We use these random constants to reduce the probability of reading
-garbage (from previous versions) that maps to an actual format id. We
-use these as bit masks at the time of  reading and writing from/to disk. */
-#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW	3645922177UL
-#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH	2745987765UL
-
-/* END OF COPIED DEFINITIONS */
-
-
 /*****************************************************************//**
 Reads the file format id from the first system table space file.
 Even if the call succeeds and returns TRUE, the returned format id
@@ -1466,18 +1005,18 @@ trx_sys_read_file_format_id(
 		ut_print_timestamp(stderr);
 
 		fprintf(stderr,
-"  ibbackup: Error: trying to read system tablespace file format,\n"
-"  ibbackup: but could not open the tablespace file %s!\n",
-			pathname
-		);
+			"  ibbackup: Error: trying to read system tablespace "
+			"file format,\n"
+			"  ibbackup: but could not open the tablespace "
+			"file %s!\n", pathname);
 		return(FALSE);
 	}
 
 	/* Read the page on which file format is stored */
 
 	success = os_file_read_no_error_handling(
-		file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, 0, UNIV_PAGE_SIZE
-	);
+		file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, UNIV_PAGE_SIZE);
+
 	if (!success) {
 		/* The following call prints an error message */
 		os_file_get_last_error(TRUE);
@@ -1485,10 +1024,11 @@ trx_sys_read_file_format_id(
 		ut_print_timestamp(stderr);
 
 		fprintf(stderr,
-"  ibbackup: Error: trying to read system table space file format,\n"
-"  ibbackup: but failed to read the tablespace file %s!\n",
-			pathname
-		);
+			"  ibbackup: Error: trying to read system tablespace "
+			"file format,\n"
+			"  ibbackup: but failed to read the tablespace "
+			"file %s!\n", pathname);
+
 		os_file_close(file);
 		return(FALSE);
 	}
@@ -1510,7 +1050,6 @@ trx_sys_read_file_format_id(
 	return(TRUE);
 }
 
-
 /*****************************************************************//**
 Reads the file format id from the given per-table data file.
 @return TRUE if call succeeds */
@@ -1542,33 +1081,34 @@ trx_sys_read_pertable_file_format_id(
 	if (!success) {
 		/* The following call prints an error message */
 		os_file_get_last_error(TRUE);
-        
+
 		ut_print_timestamp(stderr);
-        
+
 		fprintf(stderr,
-"  ibbackup: Error: trying to read per-table tablespace format,\n"
-"  ibbackup: but could not open the tablespace file %s!\n",
-			pathname
-		);
+			"  ibbackup: Error: trying to read per-table "
+			"tablespace format,\n"
+			"  ibbackup: but could not open the tablespace "
+			"file %s!\n", pathname);
+
 		return(FALSE);
 	}
 
 	/* Read the first page of the per-table datafile */
 
-	success = os_file_read_no_error_handling(
-		file, page, 0, 0, UNIV_PAGE_SIZE
-	);
+	success = os_file_read_no_error_handling(file, page, 0, UNIV_PAGE_SIZE);
+
 	if (!success) {
 		/* The following call prints an error message */
 		os_file_get_last_error(TRUE);
-        
+
 		ut_print_timestamp(stderr);
-        
+
 		fprintf(stderr,
-"  ibbackup: Error: trying to per-table data file format,\n"
-"  ibbackup: but failed to read the tablespace file %s!\n",
-			pathname
-		);
+			"  ibbackup: Error: trying to per-table data file "
+			"format,\n"
+			"  ibbackup: but failed to read the tablespace "
+			"file %s!\n", pathname);
+
 		os_file_close(file);
 		return(FALSE);
 	}
@@ -1619,8 +1159,8 @@ void
 trx_sys_close(void)
 /*===============*/
 {
+	ulint		i;
 	trx_t*		trx;
-	trx_rseg_t*	rseg;
 	read_view_t*	view;
 
 	ut_ad(trx_sys != NULL);
@@ -1629,6 +1169,8 @@ trx_sys_close(void)
 	/* Check that all read views are closed except read view owned
 	by a purge. */
 
+	mutex_enter(&trx_sys->mutex);
+
 	if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) {
 		fprintf(stderr,
 			"InnoDB: Error: all read views were not closed"
@@ -1637,42 +1179,38 @@ trx_sys_close(void)
 			UT_LIST_GET_LEN(trx_sys->view_list) - 1);
 	}
 
+	mutex_exit(&trx_sys->mutex);
+
 	sess_close(trx_dummy_sess);
 	trx_dummy_sess = NULL;
 
 	trx_purge_sys_close();
 
-	mutex_enter(&kernel_mutex);
-
 	/* Free the double write data structures. */
-	ut_a(trx_doublewrite != NULL);
-	ut_free(trx_doublewrite->write_buf_unaligned);
-	trx_doublewrite->write_buf_unaligned = NULL;
+	buf_dblwr_free();
 
-	mem_free(trx_doublewrite->buf_block_arr);
-	trx_doublewrite->buf_block_arr = NULL;
+	mutex_enter(&trx_sys->mutex);
 
-	mutex_free(&trx_doublewrite->mutex);
-	mem_free(trx_doublewrite);
-	trx_doublewrite = NULL;
+	ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0);
 
 	/* Only prepared transactions may be left in the system. Free them. */
-	ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == trx_n_prepared);
+	ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == trx_sys->n_prepared_trx);
 
-	while ((trx = UT_LIST_GET_FIRST(trx_sys->trx_list)) != NULL) {
+	while ((trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list)) != NULL) {
 		trx_free_prepared(trx);
 	}
 
 	/* There can't be any active transactions. */
-	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+	for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+		trx_rseg_t*	rseg;
 
-	while (rseg != NULL) {
-		trx_rseg_t*	prev_rseg = rseg;
+		rseg = trx_sys->rseg_array[i];
 
-		rseg = UT_LIST_GET_NEXT(rseg_list, prev_rseg);
-		UT_LIST_REMOVE(rseg_list, trx_sys->rseg_list, prev_rseg);
-
-		trx_rseg_mem_free(prev_rseg);
+		if (rseg != NULL) {
+			trx_rseg_mem_free(rseg);
+		} else {
+			break;
+		}
 	}
 
 	view = UT_LIST_GET_FIRST(trx_sys->view_list);
@@ -1687,14 +1225,89 @@ trx_sys_close(void)
 		UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view);
 	}
 
-	ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == 0);
-	ut_a(UT_LIST_GET_LEN(trx_sys->rseg_list) == 0);
 	ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0);
+	ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0);
+	ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == 0);
 	ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0);
 
+	mutex_exit(&trx_sys->mutex);
+
+	mutex_free(&trx_sys->mutex);
+
 	mem_free(trx_sys);
 
 	trx_sys = NULL;
-	mutex_exit(&kernel_mutex);
 }
+
+/*********************************************************************
+Check if there are any active (non-prepared) transactions.
+@return total number of active transactions or 0 if none */
+UNIV_INTERN
+ulint
+trx_sys_any_active_transactions(void)
+/*=================================*/
+{
+	ulint	total_trx = 0;
+
+	mutex_enter(&trx_sys->mutex);
+
+	total_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list)
+		+ trx_sys->n_mysql_trx;
+
+	ut_a(total_trx >= trx_sys->n_prepared_trx);
+	total_trx -= trx_sys->n_prepared_trx;
+
+	mutex_exit(&trx_sys->mutex);
+
+	return(total_trx);
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Validate the trx_list_t.
+@return TRUE if valid. */
+static
+ibool
+trx_sys_validate_trx_list_low(
+/*===========================*/
+	trx_list_t*	trx_list)	/*!< in: &trx_sys->ro_trx_list
+					or &trx_sys->rw_trx_list */
+{
+	const trx_t*	trx;
+	const trx_t*	prev_trx = NULL;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_ad(trx_list == &trx_sys->ro_trx_list
+	      || trx_list == &trx_sys->rw_trx_list);
+
+	for (trx = UT_LIST_GET_FIRST(*trx_list);
+	     trx != NULL;
+	     prev_trx = trx, trx = UT_LIST_GET_NEXT(trx_list, prev_trx)) {
+
+		assert_trx_in_list(trx);
+		ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
+		ut_a(prev_trx == NULL || prev_trx->id > trx->id);
+	}
+
+	return(TRUE);
+}
+
+/*************************************************************//**
+Validate the trx_sys_t::ro_trx_list and trx_sys_t::rw_trx_list.
+@return TRUE if lists are valid. */
+UNIV_INTERN
+ibool
+trx_sys_validate_trx_list(void)
+/*===========================*/
+{
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_a(trx_sys_validate_trx_list_low(&trx_sys->ro_trx_list));
+	ut_a(trx_sys_validate_trx_list_low(&trx_sys->rw_trx_list));
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/trx/trx0trx.c b/storage/innobase/trx/trx0trx.c
deleted file mode 100644
index 85246aa6d1f..00000000000
--- a/storage/innobase/trx/trx0trx.c
+++ /dev/null
@@ -1,2167 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file trx/trx0trx.c
-The transaction
-
-Created 3/26/1996 Heikki Tuuri
-*******************************************************/
-
-#include "trx0trx.h"
-
-#ifdef UNIV_NONINL
-#include "trx0trx.ic"
-#endif
-
-#include "trx0undo.h"
-#include "trx0rseg.h"
-#include "log0log.h"
-#include "que0que.h"
-#include "lock0lock.h"
-#include "trx0roll.h"
-#include "usr0sess.h"
-#include "read0read.h"
-#include "srv0srv.h"
-#include "btr0sea.h"
-#include "os0proc.h"
-#include "trx0xa.h"
-#include "trx0purge.h"
-#include "ha_prototypes.h"
-
-/** Dummy session used currently in MySQL interface */
-UNIV_INTERN sess_t*		trx_dummy_sess = NULL;
-
-/** Number of transactions currently allocated for MySQL: protected by
-the kernel mutex */
-UNIV_INTERN ulint	trx_n_mysql_transactions = 0;
-/** Number of transactions currently in the XA PREPARED state: protected by
-the kernel mutex */
-UNIV_INTERN ulint	trx_n_prepared = 0;
-
-#ifdef UNIV_PFS_MUTEX
-/* Key to register the mutex with performance schema */
-UNIV_INTERN mysql_pfs_key_t	trx_undo_mutex_key;
-#endif /* UNIV_PFS_MUTEX */
-
-/*************************************************************//**
-Set detailed error message for the transaction. */
-UNIV_INTERN
-void
-trx_set_detailed_error(
-/*===================*/
-	trx_t*		trx,	/*!< in: transaction struct */
-	const char*	msg)	/*!< in: detailed error message */
-{
-	ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
-}
-
-/*************************************************************//**
-Set detailed error message for the transaction from a file. Note that the
-file is rewinded before reading from it. */
-UNIV_INTERN
-void
-trx_set_detailed_error_from_file(
-/*=============================*/
-	trx_t*	trx,	/*!< in: transaction struct */
-	FILE*	file)	/*!< in: file to read message from */
-{
-	os_file_read_string(file, trx->detailed_error,
-			    sizeof(trx->detailed_error));
-}
-
-/****************************************************************//**
-Creates and initializes a transaction object.
-@return	own: the transaction */
-UNIV_INTERN
-trx_t*
-trx_create(
-/*=======*/
-	sess_t*	sess)	/*!< in: session */
-{
-	trx_t*	trx;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(sess);
-
-	trx = mem_alloc(sizeof(trx_t));
-
-	trx->magic_n = TRX_MAGIC_N;
-
-	trx->op_info = "";
-
-	trx->is_purge = 0;
-	trx->is_recovered = 0;
-	trx->conc_state = TRX_NOT_STARTED;
-
-	trx->is_registered = 0;
-	trx->active_commit_ordered = 0;
-
-	trx->start_time = ut_time();
-
-	trx->isolation_level = TRX_ISO_REPEATABLE_READ;
-
-	trx->id = 0;
-	trx->no = IB_ULONGLONG_MAX;
-
-	trx->support_xa = TRUE;
-
-	trx->check_foreigns = TRUE;
-	trx->check_unique_secondary = TRUE;
-
-	trx->flush_log_later = FALSE;
-	trx->must_flush_log_later = FALSE;
-
-	trx->dict_operation = TRX_DICT_OP_NONE;
-	trx->table_id = 0;
-
-	trx->mysql_thd = NULL;
-	trx->duplicates = 0;
-
-	trx->n_mysql_tables_in_use = 0;
-	trx->mysql_n_tables_locked = 0;
-
-	trx->mysql_log_file_name = NULL;
-	trx->mysql_log_offset = 0;
-
-	mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);
-
-	trx->rseg = NULL;
-
-	trx->undo_no = 0;
-	trx->last_sql_stat_start.least_undo_no = 0;
-	trx->insert_undo = NULL;
-	trx->update_undo = NULL;
-	trx->undo_no_arr = NULL;
-
-	trx->error_state = DB_SUCCESS;
-	trx->error_key_num = 0;
-	trx->detailed_error[0] = '\0';
-
-	trx->sess = sess;
-	trx->que_state = TRX_QUE_RUNNING;
-	trx->n_active_thrs = 0;
-
-	trx->handling_signals = FALSE;
-
-	UT_LIST_INIT(trx->signals);
-	UT_LIST_INIT(trx->reply_signals);
-
-	trx->graph = NULL;
-
-	trx->wait_lock = NULL;
-	trx->was_chosen_as_deadlock_victim = FALSE;
-	UT_LIST_INIT(trx->wait_thrs);
-
-	trx->lock_heap = mem_heap_create_in_buffer(256);
-	UT_LIST_INIT(trx->trx_locks);
-
-	UT_LIST_INIT(trx->trx_savepoints);
-
-	trx->dict_operation_lock_mode = 0;
-	trx->has_search_latch = FALSE;
-	trx->search_latch_timeout = BTR_SEA_TIMEOUT;
-
-	trx->declared_to_be_inside_innodb = FALSE;
-	trx->n_tickets_to_enter_innodb = 0;
-
-	trx->global_read_view_heap = mem_heap_create(256);
-	trx->global_read_view = NULL;
-	trx->read_view = NULL;
-
-	/* Set X/Open XA transaction identification to NULL */
-	memset(&trx->xid, 0, sizeof(trx->xid));
-	trx->xid.formatID = -1;
-
-	trx->n_autoinc_rows = 0;
-
-	/* Remember to free the vector explicitly. */
-	trx->autoinc_locks = ib_vector_create(
-		mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 4), 4);
-
-	return(trx);
-}
-
-/********************************************************************//**
-Creates a transaction object for MySQL.
-@return	own: transaction object */
-UNIV_INTERN
-trx_t*
-trx_allocate_for_mysql(void)
-/*========================*/
-{
-	trx_t*	trx;
-
-	mutex_enter(&kernel_mutex);
-
-	trx = trx_create(trx_dummy_sess);
-
-	trx_n_mysql_transactions++;
-
-	UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
-
-	mutex_exit(&kernel_mutex);
-
-	return(trx);
-}
-
-/********************************************************************//**
-Creates a transaction object for background operations by the master thread.
-@return	own: transaction object */
-UNIV_INTERN
-trx_t*
-trx_allocate_for_background(void)
-/*=============================*/
-{
-	trx_t*	trx;
-
-	mutex_enter(&kernel_mutex);
-
-	trx = trx_create(trx_dummy_sess);
-
-	mutex_exit(&kernel_mutex);
-
-	return(trx);
-}
-
-/********************************************************************//**
-Releases the search latch if trx has reserved it. */
-UNIV_INTERN
-void
-trx_search_latch_release_if_reserved(
-/*=================================*/
-	trx_t*	   trx) /*!< in: transaction */
-{
-	if (trx->has_search_latch) {
-		rw_lock_s_unlock(&btr_search_latch);
-
-		trx->has_search_latch = FALSE;
-	}
-}
-
-/********************************************************************//**
-Frees a transaction object. */
-UNIV_INTERN
-void
-trx_free(
-/*=====*/
-	trx_t*	trx)	/*!< in, own: trx object */
-{
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (trx->declared_to_be_inside_innodb) {
-		ut_print_timestamp(stderr);
-		fputs("  InnoDB: Error: Freeing a trx which is declared"
-		      " to be processing\n"
-		      "InnoDB: inside InnoDB.\n", stderr);
-		trx_print(stderr, trx, 600);
-		putc('\n', stderr);
-
-		/* This is an error but not a fatal error. We must keep
-		the counters like srv_conc_n_threads accurate. */
-		srv_conc_force_exit_innodb(trx);
-	}
-
-	if (trx->n_mysql_tables_in_use != 0
-	    || trx->mysql_n_tables_locked != 0) {
-
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: Error: MySQL is freeing a thd\n"
-			"InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
-			"InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
-			(ulong)trx->n_mysql_tables_in_use,
-			(ulong)trx->mysql_n_tables_locked);
-
-		trx_print(stderr, trx, 600);
-
-		ut_print_buf(stderr, trx, sizeof(trx_t));
-		putc('\n', stderr);
-	}
-
-	ut_a(trx->magic_n == TRX_MAGIC_N);
-
-	trx->magic_n = 11112222;
-
-	ut_a(trx->conc_state == TRX_NOT_STARTED);
-
-	mutex_free(&(trx->undo_mutex));
-
-	ut_a(trx->insert_undo == NULL);
-	ut_a(trx->update_undo == NULL);
-
-	if (trx->undo_no_arr) {
-		trx_undo_arr_free(trx->undo_no_arr);
-	}
-
-	ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
-	ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
-
-	ut_a(trx->wait_lock == NULL);
-	ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
-
-	ut_a(!trx->has_search_latch);
-
-	ut_a(trx->dict_operation_lock_mode == 0);
-
-	if (trx->lock_heap) {
-		mem_heap_free(trx->lock_heap);
-	}
-
-	ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);
-
-	if (trx->global_read_view_heap) {
-		mem_heap_free(trx->global_read_view_heap);
-	}
-
-	trx->global_read_view = NULL;
-
-	ut_a(trx->read_view == NULL);
-
-	ut_a(ib_vector_is_empty(trx->autoinc_locks));
-	/* We allocated a dedicated heap for the vector. */
-	ib_vector_free(trx->autoinc_locks);
-
-	mem_free(trx);
-}
-
-/********************************************************************//**
-At shutdown, frees a transaction object that is in the PREPARED state. */
-UNIV_INTERN
-void
-trx_free_prepared(
-/*==============*/
-	trx_t*	trx)	/*!< in, own: trx object */
-{
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_a(trx->conc_state == TRX_PREPARED);
-	ut_a(trx->magic_n == TRX_MAGIC_N);
-
-	/* Prepared transactions are sort of active; they allow
-	ROLLBACK and COMMIT operations. Because the system does not
-	contain any other transactions than prepared transactions at
-	the shutdown stage and because a transaction cannot become
-	PREPARED while holding locks, it is safe to release the locks
-	held by PREPARED transactions here at shutdown.*/
-	lock_release_off_kernel(trx);
-
-	trx_undo_free_prepared(trx);
-
-	mutex_free(&trx->undo_mutex);
-
-	if (trx->undo_no_arr) {
-		trx_undo_arr_free(trx->undo_no_arr);
-	}
-
-	ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
-	ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
-
-	ut_a(trx->wait_lock == NULL);
-	ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
-
-	ut_a(!trx->has_search_latch);
-
-	ut_a(trx->dict_operation_lock_mode == 0);
-
-	if (trx->lock_heap) {
-		mem_heap_free(trx->lock_heap);
-	}
-
-	if (trx->global_read_view_heap) {
-		mem_heap_free(trx->global_read_view_heap);
-	}
-
-	ut_a(ib_vector_is_empty(trx->autoinc_locks));
-	ib_vector_free(trx->autoinc_locks);
-
-	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
-
-	mem_free(trx);
-}
-
-/********************************************************************//**
-Frees a transaction object for MySQL. */
-UNIV_INTERN
-void
-trx_free_for_mysql(
-/*===============*/
-	trx_t*	trx)	/*!< in, own: trx object */
-{
-	mutex_enter(&kernel_mutex);
-
-	UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
-
-	trx_free(trx);
-
-	ut_a(trx_n_mysql_transactions > 0);
-
-	trx_n_mysql_transactions--;
-
-	mutex_exit(&kernel_mutex);
-}
-
-/********************************************************************//**
-Frees a transaction object of a background operation of the master thread. */
-UNIV_INTERN
-void
-trx_free_for_background(
-/*====================*/
-	trx_t*	trx)	/*!< in, own: trx object */
-{
-	mutex_enter(&kernel_mutex);
-
-	trx_free(trx);
-
-	mutex_exit(&kernel_mutex);
-}
-
-/****************************************************************//**
-Inserts the trx handle in the trx system trx list in the right position.
-The list is sorted on the trx id so that the biggest id is at the list
-start. This function is used at the database startup to insert incomplete
-transactions to the list. */
-static
-void
-trx_list_insert_ordered(
-/*====================*/
-	trx_t*	trx)	/*!< in: trx handle */
-{
-	trx_t*	trx2;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
-	while (trx2 != NULL) {
-		if (trx->id >= trx2->id) {
-
-			ut_ad(trx->id > trx2->id);
-			break;
-		}
-		trx2 = UT_LIST_GET_NEXT(trx_list, trx2);
-	}
-
-	if (trx2 != NULL) {
-		trx2 = UT_LIST_GET_PREV(trx_list, trx2);
-
-		if (trx2 == NULL) {
-			UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
-		} else {
-			UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
-					     trx2, trx);
-		}
-	} else {
-		UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);
-	}
-}
-
-/****************************************************************//**
-Creates trx objects for transactions and initializes the trx list of
-trx_sys at database start. Rollback segment and undo log lists must
-already exist when this function is called, because the lists of
-transactions to be rolled back or cleaned up are built based on the
-undo log lists. */
-UNIV_INTERN
-void
-trx_lists_init_at_db_start(void)
-/*============================*/
-{
-	trx_rseg_t*	rseg;
-	trx_undo_t*	undo;
-	trx_t*		trx;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	UT_LIST_INIT(trx_sys->trx_list);
-
-	/* Look from the rollback segments if there exist undo logs for
-	transactions */
-
-	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
-
-	while (rseg != NULL) {
-		undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
-
-		while (undo != NULL) {
-
-			trx = trx_create(trx_dummy_sess);
-
-			trx->is_recovered = TRUE;
-			trx->id = undo->trx_id;
-			trx->xid = undo->xid;
-			trx->insert_undo = undo;
-			trx->rseg = rseg;
-
-			if (undo->state != TRX_UNDO_ACTIVE) {
-
-				/* Prepared transactions are left in
-				the prepared state waiting for a
-				commit or abort decision from MySQL */
-
-				if (undo->state == TRX_UNDO_PREPARED) {
-
-					fprintf(stderr,
-						"InnoDB: Transaction "
-						TRX_ID_FMT
-						" was in the"
-						" XA prepared state.\n",
-						(ullint) trx->id);
-
-					if (srv_force_recovery == 0) {
-
-						trx->conc_state = TRX_PREPARED;
-						trx_n_prepared++;
-					} else {
-						fprintf(stderr,
-							"InnoDB: Since"
-							" innodb_force_recovery"
-							" > 0, we will"
-							" rollback it"
-							" anyway.\n");
-
-						trx->conc_state = TRX_ACTIVE;
-					}
-				} else {
-					trx->conc_state
-						= TRX_COMMITTED_IN_MEMORY;
-				}
-
-				/* We give a dummy value for the trx no;
-				this should have no relevance since purge
-				is not interested in committed transaction
-				numbers, unless they are in the history
-				list, in which case it looks the number
-				from the disk based undo log structure */
-
-				trx->no = trx->id;
-			} else {
-				trx->conc_state = TRX_ACTIVE;
-
-				/* A running transaction always has the number
-				field inited to IB_ULONGLONG_MAX */
-
-				trx->no = IB_ULONGLONG_MAX;
-			}
-
-			if (undo->dict_operation) {
-				trx_set_dict_operation(
-					trx, TRX_DICT_OP_TABLE);
-				trx->table_id = undo->table_id;
-			}
-
-			if (!undo->empty) {
-				trx->undo_no = undo->top_undo_no + 1;
-			}
-
-			trx_list_insert_ordered(trx);
-
-			undo = UT_LIST_GET_NEXT(undo_list, undo);
-		}
-
-		undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
-
-		while (undo != NULL) {
-			trx = trx_get_on_id(undo->trx_id);
-
-			if (NULL == trx) {
-				trx = trx_create(trx_dummy_sess);
-
-				trx->is_recovered = TRUE;
-				trx->id = undo->trx_id;
-				trx->xid = undo->xid;
-
-				if (undo->state != TRX_UNDO_ACTIVE) {
-
-					/* Prepared transactions are left in
-					the prepared state waiting for a
-					commit or abort decision from MySQL */
-
-					if (undo->state == TRX_UNDO_PREPARED) {
-						fprintf(stderr,
-							"InnoDB: Transaction "
-							TRX_ID_FMT " was in the"
-							" XA prepared state.\n",
-							(ullint) trx->id);
-
-						if (srv_force_recovery == 0) {
-
-							trx->conc_state
-								= TRX_PREPARED;
-							trx_n_prepared++;
-						} else {
-							fprintf(stderr,
-								"InnoDB: Since"
-								" innodb_force_recovery"
-								" > 0, we will"
-								" rollback it"
-								" anyway.\n");
-
-							trx->conc_state
-								= TRX_ACTIVE;
-						}
-					} else {
-						trx->conc_state
-							= TRX_COMMITTED_IN_MEMORY;
-					}
-
-					/* We give a dummy value for the trx
-					number */
-
-					trx->no = trx->id;
-				} else {
-					trx->conc_state = TRX_ACTIVE;
-
-					/* A running transaction always has
-					the number field inited to
-					IB_ULONGLONG_MAX */
-
-					trx->no = IB_ULONGLONG_MAX;
-				}
-
-				trx->rseg = rseg;
-				trx_list_insert_ordered(trx);
-
-				if (undo->dict_operation) {
-					trx_set_dict_operation(
-						trx, TRX_DICT_OP_TABLE);
-					trx->table_id = undo->table_id;
-				}
-			}
-
-			trx->update_undo = undo;
-
-			if ((!undo->empty)
-			    && undo->top_undo_no >= trx->undo_no) {
-
-				trx->undo_no = undo->top_undo_no + 1;
-			}
-
-			undo = UT_LIST_GET_NEXT(undo_list, undo);
-		}
-
-		rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
-	}
-}
-
-/******************************************************************//**
-Assigns a rollback segment to a transaction in a round-robin fashion.
-@return	assigned rollback segment instance */
-UNIV_INLINE
-trx_rseg_t*
-trx_assign_rseg(
-/*============*/
-	ulint	max_undo_logs)	/*!< in: maximum number of UNDO logs to use */
-{
-	trx_rseg_t*	rseg = trx_sys->latest_rseg;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
-
-	if (rseg == NULL || rseg->id == max_undo_logs - 1) {
-		rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
-	}
-
-	trx_sys->latest_rseg = rseg;
-
-	return(rseg);
-}
-
-/****************************************************************//**
-Starts a new transaction.
-@return	TRUE */
-UNIV_INTERN
-ibool
-trx_start_low(
-/*==========*/
-	trx_t*	trx,	/*!< in: transaction */
-	ulint	rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
-			is passed, the system chooses the rollback segment
-			automatically in a round-robin fashion */
-{
-	trx_rseg_t*	rseg;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(trx->rseg == NULL);
-
-	if (trx->is_purge) {
-		trx->id = 0;
-		trx->conc_state = TRX_ACTIVE;
-		trx->start_time = time(NULL);
-
-		return(TRUE);
-	}
-
-	ut_ad(trx->conc_state != TRX_ACTIVE);
-
-	ut_a(rseg_id == ULINT_UNDEFINED);
-
-	rseg = trx_assign_rseg(srv_rollback_segments);
-
-	trx->id = trx_sys_get_new_trx_id();
-
-	/* The initial value for trx->no: IB_ULONGLONG_MAX is used in
-	read_view_open_now: */
-
-	trx->no = IB_ULONGLONG_MAX;
-
-	trx->rseg = rseg;
-
-	trx->conc_state = TRX_ACTIVE;
-	trx->start_time = time(NULL);
-
-	UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
-
-	return(TRUE);
-}
-
-/****************************************************************//**
-Starts a new transaction.
-@return	TRUE */
-UNIV_INTERN
-ibool
-trx_start(
-/*======*/
-	trx_t*	trx,	/*!< in: transaction */
-	ulint	rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
-			is passed, the system chooses the rollback segment
-			automatically in a round-robin fashion */
-{
-	ibool	ret;
-
-	/* Update the info whether we should skip XA steps that eat CPU time
-	For the duration of the transaction trx->support_xa is not reread
-	from thd so any changes in the value take effect in the next
-	transaction. This is to avoid a scenario where some undo
-	generated by a transaction, has XA stuff, and other undo,
-	generated by the same transaction, doesn't. */
-	trx->support_xa = thd_supports_xa(trx->mysql_thd);
-
-	mutex_enter(&kernel_mutex);
-
-	ret = trx_start_low(trx, rseg_id);
-
-	mutex_exit(&kernel_mutex);
-
-	return(ret);
-}
-
-/****************************************************************//**
-Set the transaction serialisation number. */
-static
-void
-trx_serialisation_number_get(
-/*=========================*/
-	trx_t*		trx)	/*!< in: transaction */
-{
-	trx_rseg_t*	rseg;
-
-	rseg = trx->rseg;
-
-	ut_ad(mutex_own(&rseg->mutex));
-
-	mutex_enter(&kernel_mutex);
-
-	trx->no = trx_sys_get_new_trx_id();
-
-	/* If the rollack segment is not empty then the
-	new trx_t::no can't be less than any trx_t::no
-	already in the rollback segment. User threads only
-	produce events when a rollback segment is empty. */
-
-	if (rseg->last_page_no == FIL_NULL) {
-		void*		ptr;
-		rseg_queue_t	rseg_queue;
-
-		rseg_queue.rseg = rseg;
-		rseg_queue.trx_no = trx->no;
-
-		mutex_enter(&purge_sys->bh_mutex);
-
-		/* This is to reduce the pressure on the kernel mutex,
-		though in reality it should make very little (read no)
-		difference because this code path is only taken when the
-		rbs is empty. */
-
-		mutex_exit(&kernel_mutex);
-
-		ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
-		ut_a(ptr);
-
-		mutex_exit(&purge_sys->bh_mutex);
-	} else {
-		mutex_exit(&kernel_mutex);
-	}
-}
-
-/****************************************************************//**
-Assign the transaction its history serialisation number and write the
-update UNDO log record to the assigned rollback segment.
-@return the LSN of the UNDO log write. */
-static
-ib_uint64_t
-trx_write_serialisation_history(
-/*============================*/
-	trx_t*		trx)	/*!< in: transaction */
-{
-	mtr_t		mtr;
-	trx_rseg_t*	rseg;
-
-	ut_ad(!mutex_own(&kernel_mutex));
-
-	rseg = trx->rseg;
-
-	mtr_start(&mtr);
-
-	/* Change the undo log segment states from TRX_UNDO_ACTIVE
-	to some other state: these modifications to the file data
-	structure define the transaction as committed in the file
-	based domain, at the serialization point of the log sequence
-	number lsn obtained below. */
-
-	if (trx->update_undo != NULL) {
-		page_t*		undo_hdr_page;
-		trx_undo_t*	undo = trx->update_undo;
-
-		/* We have to hold the rseg mutex because update
-		log headers have to be put to the history list in the
-		(serialisation) order of the UNDO trx number. This is
-		required for the purge in-memory data structures too. */
-
-		mutex_enter(&rseg->mutex);
-
-		/* Assign the transaction serialisation number and also
-		update the purge min binary heap if this is the first
-		UNDO log being written to the assigned rollback segment. */
-
-		trx_serialisation_number_get(trx);
-
-		/* It is not necessary to obtain trx->undo_mutex here
-		because only a single OS thread is allowed to do the
-		transaction commit for this transaction. */
-
-		undo_hdr_page = trx_undo_set_state_at_finish(undo, &mtr);
-
-		trx_undo_update_cleanup(trx, undo_hdr_page, &mtr);
-	} else {
-		mutex_enter(&rseg->mutex);
-	}
-
-	if (trx->insert_undo != NULL) {
-		trx_undo_set_state_at_finish(trx->insert_undo, &mtr);
-	}
-
-	mutex_exit(&rseg->mutex);
-
-	/* Update the latest MySQL binlog name and offset info
-	in trx sys header if MySQL binlogging is on or the database
-	server is a MySQL replication slave */
-
-	if (trx->mysql_log_file_name
-	    && trx->mysql_log_file_name[0] != '\0') {
-
-		trx_sys_update_mysql_binlog_offset(
-			trx->mysql_log_file_name,
-			trx->mysql_log_offset,
-			TRX_SYS_MYSQL_LOG_INFO, &mtr);
-
-		trx->mysql_log_file_name = NULL;
-	}
-
-	/* The following call commits the mini-transaction, making the
-	whole transaction committed in the file-based world, at this
-	log sequence number. The transaction becomes 'durable' when
-	we write the log to disk, but in the logical sense the commit
-	in the file-based data structures (undo logs etc.) happens
-	here.
-
-	NOTE that transaction numbers, which are assigned only to
-	transactions with an update undo log, do not necessarily come
-	in exactly the same order as commit lsn's, if the transactions
-	have different rollback segments. To get exactly the same
-	order we should hold the kernel mutex up to this point,
-	adding to the contention of the kernel mutex. However, if
-	a transaction T2 is able to see modifications made by
-	a transaction T1, T2 will always get a bigger transaction
-	number and a bigger commit lsn than T1. */
-
-	/*--------------*/
-	mtr_commit(&mtr);
-	/*--------------*/
-
-	return(mtr.end_lsn);
-}
-
-/****************************************************************//**
-Commits a transaction. */
-UNIV_INTERN
-void
-trx_commit_off_kernel(
-/*==================*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	ib_uint64_t	lsn;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	trx->must_flush_log_later = FALSE;
-
-	/* If the transaction made any updates then we need to write the
-	UNDO logs for the updates to the assigned rollback segment. */
-
-	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
-		mutex_exit(&kernel_mutex);
-
-		lsn = trx_write_serialisation_history(trx);
-
-		mutex_enter(&kernel_mutex);
-	} else {
-		lsn = 0;
-	}
-
-	ut_ad(trx->conc_state == TRX_ACTIVE || trx->conc_state == TRX_PREPARED);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (UNIV_UNLIKELY(trx->conc_state == TRX_PREPARED)) {
-		ut_a(trx_n_prepared > 0);
-		trx_n_prepared--;
-	}
-
-	/* The following assignment makes the transaction committed in memory
-	and makes its changes to data visible to other transactions.
-	NOTE that there is a small discrepancy from the strict formal
-	visibility rules here: a human user of the database can see
-	modifications made by another transaction T even before the necessary
-	log segment has been flushed to the disk. If the database happens to
-	crash before the flush, the user has seen modifications from T which
-	will never be a committed transaction. However, any transaction T2
-	which sees the modifications of the committing transaction T, and
-	which also itself makes modifications to the database, will get an lsn
-	larger than the committing transaction T. In the case where the log
-	flush fails, and T never gets committed, also T2 will never get
-	committed. */
-
-	/*--------------------------------------*/
-	trx->conc_state = TRX_COMMITTED_IN_MEMORY;
-	/*--------------------------------------*/
-
-	/* If we release kernel_mutex below and we are still doing
-	recovery i.e.: back ground rollback thread is still active
-	then there is a chance that the rollback thread may see
-	this trx as COMMITTED_IN_MEMORY and goes adhead to clean it
-	up calling trx_cleanup_at_db_startup(). This can happen
-	in the case we are committing a trx here that is left in
-	PREPARED state during the crash. Note that commit of the
-	rollback of a PREPARED trx happens in the recovery thread
-	while the rollback of other transactions happen in the
-	background thread. To avoid this race we unconditionally
-	unset the is_recovered flag from the trx. */
-
-	trx->is_recovered = FALSE;
-
-	lock_release_off_kernel(trx);
-
-	if (trx->global_read_view) {
-		read_view_close(trx->global_read_view);
-		mem_heap_empty(trx->global_read_view_heap);
-		trx->global_read_view = NULL;
-	}
-
-	trx->read_view = NULL;
-
-	if (lsn) {
-
-		mutex_exit(&kernel_mutex);
-
-		if (trx->insert_undo != NULL) {
-
-			trx_undo_insert_cleanup(trx);
-		}
-
-		/* NOTE that we could possibly make a group commit more
-		efficient here: call os_thread_yield here to allow also other
-		trxs to come to commit! */
-
-		/*-------------------------------------*/
-
-		/* Depending on the my.cnf options, we may now write the log
-		buffer to the log files, making the transaction durable if
-		the OS does not crash. We may also flush the log files to
-		disk, making the transaction durable also at an OS crash or a
-		power outage.
-
-		The idea in InnoDB's group commit is that a group of
-		transactions gather behind a trx doing a physical disk write
-		to log files, and when that physical write has been completed,
-		one of those transactions does a write which commits the whole
-		group. Note that this group commit will only bring benefit if
-		there are > 2 users in the database. Then at least 2 users can
-		gather behind one doing the physical log write to disk.
-
-		If we are calling trx_commit() under prepare_commit_mutex, we
-		will delay possible log write and flush to a separate function
-		trx_commit_complete_for_mysql(), which is only called when the
-		thread has released the mutex. This is to make the
-		group commit algorithm to work. Otherwise, the prepare_commit
-		mutex would serialize all commits and prevent a group of
-		transactions from gathering. */
-
-		if (trx->flush_log_later) {
-			/* Do nothing yet */
-			trx->must_flush_log_later = TRUE;
-		} else if (srv_flush_log_at_trx_commit == 0) {
-			/* Do nothing */
-		} else if (srv_flush_log_at_trx_commit == 1) {
-			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
-				/* Write the log but do not flush it to disk */
-
-				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
-						FALSE);
-			} else {
-				/* Write the log to the log files AND flush
-				them to disk */
-
-				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
-			}
-		} else if (srv_flush_log_at_trx_commit == 2) {
-
-			/* Write the log but do not flush it to disk */
-
-			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
-		} else {
-			ut_error;
-		}
-
-		trx->commit_lsn = lsn;
-
-		/*-------------------------------------*/
-
-		mutex_enter(&kernel_mutex);
-	}
-
-	/* Free all savepoints */
-	trx_roll_free_all_savepoints(trx);
-
-	trx->conc_state = TRX_NOT_STARTED;
-	trx->rseg = NULL;
-	trx->undo_no = 0;
-	trx->last_sql_stat_start.least_undo_no = 0;
-
-	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
-	ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);
-
-	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
-
-	trx->error_state = DB_SUCCESS;
-}
-
-/****************************************************************//**
-Cleans up a transaction at database startup. The cleanup is needed if
-the transaction already got to the middle of a commit when the database
-crashed, and we cannot roll it back. */
-UNIV_INTERN
-void
-trx_cleanup_at_db_startup(
-/*======================*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	if (trx->insert_undo != NULL) {
-
-		trx_undo_insert_cleanup(trx);
-	}
-
-	trx->conc_state = TRX_NOT_STARTED;
-	trx->rseg = NULL;
-	trx->undo_no = 0;
-	trx->last_sql_stat_start.least_undo_no = 0;
-
-	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
-}
-
-/********************************************************************//**
-Assigns a read view for a consistent read query. All the consistent reads
-within the same transaction will get the same read view, which is created
-when this function is first called for a new started transaction.
-@return	consistent read view */
-UNIV_INTERN
-read_view_t*
-trx_assign_read_view(
-/*=================*/
-	trx_t*	trx)	/*!< in: active transaction */
-{
-	ut_ad(trx->conc_state == TRX_ACTIVE);
-
-	if (trx->read_view) {
-		return(trx->read_view);
-	}
-
-	mutex_enter(&kernel_mutex);
-
-	if (!trx->read_view) {
-		trx->read_view = read_view_open_now(
-			trx->id, trx->global_read_view_heap);
-		trx->global_read_view = trx->read_view;
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	return(trx->read_view);
-}
-
-/****************************************************************//**
-Commits a transaction. NOTE that the kernel mutex is temporarily released. */
-static
-void
-trx_handle_commit_sig_off_kernel(
-/*=============================*/
-	trx_t*		trx,		/*!< in: transaction */
-	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread */
-{
-	trx_sig_t*	sig;
-	trx_sig_t*	next_sig;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	trx->que_state = TRX_QUE_COMMITTING;
-
-	trx_commit_off_kernel(trx);
-
-	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
-
-	/* Remove all TRX_SIG_COMMIT signals from the signal queue and send
-	reply messages to them */
-
-	sig = UT_LIST_GET_FIRST(trx->signals);
-
-	while (sig != NULL) {
-		next_sig = UT_LIST_GET_NEXT(signals, sig);
-
-		if (sig->type == TRX_SIG_COMMIT) {
-
-			trx_sig_reply(sig, next_thr);
-			trx_sig_remove(trx, sig);
-		}
-
-		sig = next_sig;
-	}
-
-	trx->que_state = TRX_QUE_RUNNING;
-}
-
-/***********************************************************//**
-The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
-the TRX_QUE_RUNNING state and releases query threads which were
-waiting for a lock in the wait_thrs list. */
-UNIV_INTERN
-void
-trx_end_lock_wait(
-/*==============*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	que_thr_t*	thr;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
-
-	thr = UT_LIST_GET_FIRST(trx->wait_thrs);
-
-	while (thr != NULL) {
-		que_thr_end_wait_no_next_thr(thr);
-
-		UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
-
-		thr = UT_LIST_GET_FIRST(trx->wait_thrs);
-	}
-
-	trx->que_state = TRX_QUE_RUNNING;
-}
-
-/***********************************************************//**
-Moves the query threads in the lock wait list to the SUSPENDED state and puts
-the transaction to the TRX_QUE_RUNNING state. */
-static
-void
-trx_lock_wait_to_suspended(
-/*=======================*/
-	trx_t*	trx)	/*!< in: transaction in the TRX_QUE_LOCK_WAIT state */
-{
-	que_thr_t*	thr;
-
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
-
-	thr = UT_LIST_GET_FIRST(trx->wait_thrs);
-
-	while (thr != NULL) {
-		thr->state = QUE_THR_SUSPENDED;
-
-		UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
-
-		thr = UT_LIST_GET_FIRST(trx->wait_thrs);
-	}
-
-	trx->que_state = TRX_QUE_RUNNING;
-}
-
-/***********************************************************//**
-Moves the query threads in the sig reply wait list of trx to the SUSPENDED
-state. */
-static
-void
-trx_sig_reply_wait_to_suspended(
-/*============================*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	trx_sig_t*	sig;
-	que_thr_t*	thr;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	sig = UT_LIST_GET_FIRST(trx->reply_signals);
-
-	while (sig != NULL) {
-		thr = sig->receiver;
-
-		ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT);
-
-		thr->state = QUE_THR_SUSPENDED;
-
-		sig->receiver = NULL;
-
-		UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig);
-
-		sig = UT_LIST_GET_FIRST(trx->reply_signals);
-	}
-}
-
-/*****************************************************************//**
-Checks the compatibility of a new signal with the other signals in the
-queue.
-@return	TRUE if the signal can be queued */
-static
-ibool
-trx_sig_is_compatible(
-/*==================*/
-	trx_t*	trx,	/*!< in: trx handle */
-	ulint	type,	/*!< in: signal type */
-	ulint	sender)	/*!< in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */
-{
-	trx_sig_t*	sig;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (UT_LIST_GET_LEN(trx->signals) == 0) {
-
-		return(TRUE);
-	}
-
-	if (sender == TRX_SIG_SELF) {
-		if (type == TRX_SIG_ERROR_OCCURRED) {
-
-			return(TRUE);
-
-		} else if (type == TRX_SIG_BREAK_EXECUTION) {
-
-			return(TRUE);
-		} else {
-			return(FALSE);
-		}
-	}
-
-	ut_ad(sender == TRX_SIG_OTHER_SESS);
-
-	sig = UT_LIST_GET_FIRST(trx->signals);
-
-	if (type == TRX_SIG_COMMIT) {
-		while (sig != NULL) {
-
-			if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
-
-				return(FALSE);
-			}
-
-			sig = UT_LIST_GET_NEXT(signals, sig);
-		}
-
-		return(TRUE);
-
-	} else if (type == TRX_SIG_TOTAL_ROLLBACK) {
-		while (sig != NULL) {
-
-			if (sig->type == TRX_SIG_COMMIT) {
-
-				return(FALSE);
-			}
-
-			sig = UT_LIST_GET_NEXT(signals, sig);
-		}
-
-		return(TRUE);
-
-	} else if (type == TRX_SIG_BREAK_EXECUTION) {
-
-		return(TRUE);
-	} else {
-		ut_error;
-
-		return(FALSE);
-	}
-}
-
-/****************************************************************//**
-Sends a signal to a trx object. */
-UNIV_INTERN
-void
-trx_sig_send(
-/*=========*/
-	trx_t*		trx,		/*!< in: trx handle */
-	ulint		type,		/*!< in: signal type */
-	ulint		sender,		/*!< in: TRX_SIG_SELF or
-					TRX_SIG_OTHER_SESS */
-	que_thr_t*	receiver_thr,	/*!< in: query thread which wants the
-					reply, or NULL; if type is
-					TRX_SIG_END_WAIT, this must be NULL */
-	trx_savept_t*	savept,		/*!< in: possible rollback savepoint, or
-					NULL */
-	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread; if the parameter
-					is NULL, it is ignored */
-{
-	trx_sig_t*	sig;
-	trx_t*		receiver_trx;
-
-	ut_ad(trx);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (!trx_sig_is_compatible(trx, type, sender)) {
-		/* The signal is not compatible with the other signals in
-		the queue: die */
-
-		ut_error;
-	}
-
-	/* Queue the signal object */
-
-	if (UT_LIST_GET_LEN(trx->signals) == 0) {
-
-		/* The signal list is empty: the 'sig' slot must be unused
-		(we improve performance a bit by avoiding mem_alloc) */
-		sig = &(trx->sig);
-	} else {
-		/* It might be that the 'sig' slot is unused also in this
-		case, but we choose the easy way of using mem_alloc */
-
-		sig = mem_alloc(sizeof(trx_sig_t));
-	}
-
-	UT_LIST_ADD_LAST(signals, trx->signals, sig);
-
-	sig->type = type;
-	sig->sender = sender;
-	sig->receiver = receiver_thr;
-
-	if (savept) {
-		sig->savept = *savept;
-	}
-
-	if (receiver_thr) {
-		receiver_trx = thr_get_trx(receiver_thr);
-
-		UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals,
-				 sig);
-	}
-
-	if (trx->sess->state == SESS_ERROR) {
-
-		trx_sig_reply_wait_to_suspended(trx);
-	}
-
-	if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) {
-		ut_error;
-	}
-
-	/* If there were no other signals ahead in the queue, try to start
-	handling of the signal */
-
-	if (UT_LIST_GET_FIRST(trx->signals) == sig) {
-
-		trx_sig_start_handle(trx, next_thr);
-	}
-}
-
-/****************************************************************//**
-Ends signal handling. If the session is in the error state, and
-trx->graph_before_signal_handling != NULL, then returns control to the error
-handling routine of the graph (currently just returns the control to the
-graph root which then will send an error message to the client). */
-UNIV_INTERN
-void
-trx_end_signal_handling(
-/*====================*/
-	trx_t*	trx)	/*!< in: trx */
-{
-	ut_ad(mutex_own(&kernel_mutex));
-	ut_ad(trx->handling_signals == TRUE);
-
-	trx->handling_signals = FALSE;
-
-	trx->graph = trx->graph_before_signal_handling;
-
-	if (trx->graph && (trx->sess->state == SESS_ERROR)) {
-
-		que_fork_error_handle(trx, trx->graph);
-	}
-}
-
-/****************************************************************//**
-Starts handling of a trx signal. */
-UNIV_INTERN
-void
-trx_sig_start_handle(
-/*=================*/
-	trx_t*		trx,		/*!< in: trx handle */
-	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread; if the parameter
-					is NULL, it is ignored */
-{
-	trx_sig_t*	sig;
-	ulint		type;
-loop:
-	/* We loop in this function body as long as there are queued signals
-	we can process immediately */
-
-	ut_ad(trx);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) {
-
-		trx_end_signal_handling(trx);
-
-		return;
-	}
-
-	if (trx->conc_state == TRX_NOT_STARTED) {
-
-		trx_start_low(trx, ULINT_UNDEFINED);
-	}
-
-	/* If the trx is in a lock wait state, moves the waiting query threads
-	to the suspended state */
-
-	if (trx->que_state == TRX_QUE_LOCK_WAIT) {
-
-		trx_lock_wait_to_suspended(trx);
-	}
-
-	/* If the session is in the error state and this trx has threads
-	waiting for reply from signals, moves these threads to the suspended
-	state, canceling wait reservations; note that if the transaction has
-	sent a commit or rollback signal to itself, and its session is not in
-	the error state, then nothing is done here. */
-
-	if (trx->sess->state == SESS_ERROR) {
-		trx_sig_reply_wait_to_suspended(trx);
-	}
-
-	/* If there are no running query threads, we can start processing of a
-	signal, otherwise we have to wait until all query threads of this
-	transaction are aware of the arrival of the signal. */
-
-	if (trx->n_active_thrs > 0) {
-
-		return;
-	}
-
-	if (trx->handling_signals == FALSE) {
-		trx->graph_before_signal_handling = trx->graph;
-
-		trx->handling_signals = TRUE;
-	}
-
-	sig = UT_LIST_GET_FIRST(trx->signals);
-	type = sig->type;
-
-	if (type == TRX_SIG_COMMIT) {
-
-		trx_handle_commit_sig_off_kernel(trx, next_thr);
-
-	} else if ((type == TRX_SIG_TOTAL_ROLLBACK)
-		   || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) {
-
-		trx_rollback(trx, sig, next_thr);
-
-		/* No further signals can be handled until the rollback
-		completes, therefore we return */
-
-		return;
-
-	} else if (type == TRX_SIG_ERROR_OCCURRED) {
-
-		trx_rollback(trx, sig, next_thr);
-
-		/* No further signals can be handled until the rollback
-		completes, therefore we return */
-
-		return;
-
-	} else if (type == TRX_SIG_BREAK_EXECUTION) {
-
-		trx_sig_reply(sig, next_thr);
-		trx_sig_remove(trx, sig);
-	} else {
-		ut_error;
-	}
-
-	goto loop;
-}
-
-/****************************************************************//**
-Send the reply message when a signal in the queue of the trx has been
-handled. */
-UNIV_INTERN
-void
-trx_sig_reply(
-/*==========*/
-	trx_sig_t*	sig,		/*!< in: signal */
-	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
-					if the value which is passed in is
-					a pointer to a NULL pointer, then the
-					calling function can start running
-					a new query thread */
-{
-	trx_t*	receiver_trx;
-
-	ut_ad(sig);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	if (sig->receiver != NULL) {
-		ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT);
-
-		receiver_trx = thr_get_trx(sig->receiver);
-
-		UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals,
-			       sig);
-		ut_ad(receiver_trx->sess->state != SESS_ERROR);
-
-		que_thr_end_wait(sig->receiver, next_thr);
-
-		sig->receiver = NULL;
-
-	}
-}
-
-/****************************************************************//**
-Removes a signal object from the trx signal queue. */
-UNIV_INTERN
-void
-trx_sig_remove(
-/*===========*/
-	trx_t*		trx,	/*!< in: trx handle */
-	trx_sig_t*	sig)	/*!< in, own: signal */
-{
-	ut_ad(trx && sig);
-	ut_ad(mutex_own(&kernel_mutex));
-
-	ut_ad(sig->receiver == NULL);
-
-	UT_LIST_REMOVE(signals, trx->signals, sig);
-	sig->type = 0;	/* reset the field to catch possible bugs */
-
-	if (sig != &(trx->sig)) {
-		mem_free(sig);
-	}
-}
-
-/*********************************************************************//**
-Creates a commit command node struct.
-@return	own: commit node struct */
-UNIV_INTERN
-commit_node_t*
-commit_node_create(
-/*===============*/
-	mem_heap_t*	heap)	/*!< in: mem heap where created */
-{
-	commit_node_t*	node;
-
-	node = mem_heap_alloc(heap, sizeof(commit_node_t));
-	node->common.type  = QUE_NODE_COMMIT;
-	node->state = COMMIT_NODE_SEND;
-
-	return(node);
-}
-
-/***********************************************************//**
-Performs an execution step for a commit type node in a query graph.
-@return	query thread to run next, or NULL */
-UNIV_INTERN
-que_thr_t*
-trx_commit_step(
-/*============*/
-	que_thr_t*	thr)	/*!< in: query thread */
-{
-	commit_node_t*	node;
-	que_thr_t*	next_thr;
-
-	node = thr->run_node;
-
-	ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
-
-	if (thr->prev_node == que_node_get_parent(node)) {
-		node->state = COMMIT_NODE_SEND;
-	}
-
-	if (node->state == COMMIT_NODE_SEND) {
-		mutex_enter(&kernel_mutex);
-
-		node->state = COMMIT_NODE_WAIT;
-
-		next_thr = NULL;
-
-		thr->state = QUE_THR_SIG_REPLY_WAIT;
-
-		/* Send the commit signal to the transaction */
-
-		trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF,
-			     thr, NULL, &next_thr);
-
-		mutex_exit(&kernel_mutex);
-
-		return(next_thr);
-	}
-
-	ut_ad(node->state == COMMIT_NODE_WAIT);
-
-	node->state = COMMIT_NODE_SEND;
-
-	thr->run_node = que_node_get_parent(node);
-
-	return(thr);
-}
-
-/**********************************************************************//**
-Does the transaction commit for MySQL.
-@return	DB_SUCCESS or error number */
-UNIV_INTERN
-ulint
-trx_commit_for_mysql(
-/*=================*/
-	trx_t*	trx)	/*!< in: trx handle */
-{
-	/* Because we do not do the commit by sending an Innobase
-	sig to the transaction, we must here make sure that trx has been
-	started. */
-
-	ut_a(trx);
-
-	trx_start_if_not_started(trx);
-
-	trx->op_info = "committing";
-
-	mutex_enter(&kernel_mutex);
-
-	trx_commit_off_kernel(trx);
-
-	mutex_exit(&kernel_mutex);
-
-	trx->op_info = "";
-
-	return(DB_SUCCESS);
-}
-
-/**********************************************************************//**
-If required, flushes the log to disk if we called trx_commit_for_mysql()
-with trx->flush_log_later == TRUE.
-@return	0 or error number */
-UNIV_INTERN
-ulint
-trx_commit_complete_for_mysql(
-/*==========================*/
-	trx_t*	trx)	/*!< in: trx handle */
-{
-	ib_uint64_t	lsn	= trx->commit_lsn;
-
-	ut_a(trx);
-
-	trx->op_info = "flushing log";
-
-	if (!trx->must_flush_log_later) {
-		/* Do nothing */
-	} else if (srv_flush_log_at_trx_commit == 0) {
-		/* Do nothing */
-	} else if (srv_flush_log_at_trx_commit == 1) {
-		if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
-			/* Write the log but do not flush it to disk */
-
-			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
-		} else {
-			/* Write the log to the log files AND flush them to
-			disk */
-
-			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
-		}
-	} else if (srv_flush_log_at_trx_commit == 2) {
-
-		/* Write the log but do not flush it to disk */
-
-		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
-	} else {
-		ut_error;
-	}
-
-	trx->must_flush_log_later = FALSE;
-
-	trx->op_info = "";
-
-	return(0);
-}
-
-/**********************************************************************//**
-Marks the latest SQL statement ended. */
-UNIV_INTERN
-void
-trx_mark_sql_stat_end(
-/*==================*/
-	trx_t*	trx)	/*!< in: trx handle */
-{
-	ut_a(trx);
-
-	if (trx->conc_state == TRX_NOT_STARTED) {
-		trx->undo_no = 0;
-	}
-
-	trx->last_sql_stat_start.least_undo_no = trx->undo_no;
-}
-
-/**********************************************************************//**
-Prints info about a transaction to the given file. The caller must own the
-kernel mutex. */
-UNIV_INTERN
-void
-trx_print(
-/*======*/
-	FILE*	f,		/*!< in: output stream */
-	trx_t*	trx,		/*!< in: transaction */
-	ulint	max_query_len)	/*!< in: max query length to print, or 0 to
-				   use the default max length */
-{
-	ibool	newline;
-
-	fprintf(f, "TRANSACTION " TRX_ID_FMT, (ullint) trx->id);
-
-	switch (trx->conc_state) {
-	case TRX_NOT_STARTED:
-		fputs(", not started", f);
-		break;
-	case TRX_ACTIVE:
-		fprintf(f, ", ACTIVE %lu sec",
-			(ulong)difftime(time(NULL), trx->start_time));
-		break;
-	case TRX_PREPARED:
-		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
-			(ulong)difftime(time(NULL), trx->start_time));
-		break;
-	case TRX_COMMITTED_IN_MEMORY:
-		fputs(", COMMITTED IN MEMORY", f);
-		break;
-	default:
-		fprintf(f, " state %lu", (ulong) trx->conc_state);
-	}
-
-	if (*trx->op_info) {
-		putc(' ', f);
-		fputs(trx->op_info, f);
-	}
-
-	if (trx->is_recovered) {
-		fputs(" recovered trx", f);
-	}
-
-	if (trx->is_purge) {
-		fputs(" purge trx", f);
-	}
-
-	if (trx->declared_to_be_inside_innodb) {
-		fprintf(f, ", thread declared inside InnoDB %lu",
-			(ulong) trx->n_tickets_to_enter_innodb);
-	}
-
-	putc('\n', f);
-
-	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
-		fprintf(f, "mysql tables in use %lu, locked %lu\n",
-			(ulong) trx->n_mysql_tables_in_use,
-			(ulong) trx->mysql_n_tables_locked);
-	}
-
-	newline = TRUE;
-
-	switch (trx->que_state) {
-	case TRX_QUE_RUNNING:
-		newline = FALSE; break;
-	case TRX_QUE_LOCK_WAIT:
-		fputs("LOCK WAIT ", f); break;
-	case TRX_QUE_ROLLING_BACK:
-		fputs("ROLLING BACK ", f); break;
-	case TRX_QUE_COMMITTING:
-		fputs("COMMITTING ", f); break;
-	default:
-		fprintf(f, "que state %lu ", (ulong) trx->que_state);
-	}
-
-	if (0 < UT_LIST_GET_LEN(trx->trx_locks)
-	    || mem_heap_get_size(trx->lock_heap) > 400) {
-		newline = TRUE;
-
-		fprintf(f, "%lu lock struct(s), heap size %lu,"
-			" %lu row lock(s)",
-			(ulong) UT_LIST_GET_LEN(trx->trx_locks),
-			(ulong) mem_heap_get_size(trx->lock_heap),
-			(ulong) lock_number_of_rows_locked(trx));
-	}
-
-	if (trx->has_search_latch) {
-		newline = TRUE;
-		fputs(", holds adaptive hash latch", f);
-	}
-
-	if (trx->undo_no != 0) {
-		newline = TRUE;
-		fprintf(f, ", undo log entries %llu",
-			(ullint) trx->undo_no);
-	}
-
-	if (newline) {
-		putc('\n', f);
-	}
-
-	if (trx->mysql_thd != NULL) {
-		innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
-	}
-}
-
-/*******************************************************************//**
-Compares the "weight" (or size) of two transactions. Transactions that
-have edited non-transactional tables are considered heavier than ones
-that have not.
-@return	TRUE if weight(a) >= weight(b) */
-UNIV_INTERN
-ibool
-trx_weight_ge(
-/*==========*/
-	const trx_t*	a,	/*!< in: the first transaction to be compared */
-	const trx_t*	b)	/*!< in: the second transaction to be compared */
-{
-	ibool	a_notrans_edit;
-	ibool	b_notrans_edit;
-
-	/* If mysql_thd is NULL for a transaction we assume that it has
-	not edited non-transactional tables. */
-
-	a_notrans_edit = a->mysql_thd != NULL
-		&& thd_has_edited_nontrans_tables(a->mysql_thd);
-
-	b_notrans_edit = b->mysql_thd != NULL
-		&& thd_has_edited_nontrans_tables(b->mysql_thd);
-
-	if (a_notrans_edit != b_notrans_edit) {
-
-		return(a_notrans_edit);
-	}
-
-	/* Either both had edited non-transactional tables or both had
-	not, we fall back to comparing the number of altered/locked
-	rows. */
-
-#if 0
-	fprintf(stderr,
-		"%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
-		__func__,
-		a->undo_no, UT_LIST_GET_LEN(a->trx_locks),
-		b->undo_no, UT_LIST_GET_LEN(b->trx_locks));
-#endif
-
-	return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
-}
-
-/****************************************************************//**
-Prepares a transaction. */
-UNIV_INTERN
-void
-trx_prepare_off_kernel(
-/*===================*/
-	trx_t*	trx)	/*!< in: transaction */
-{
-	trx_rseg_t*	rseg;
-	ib_uint64_t	lsn		= 0;
-	mtr_t		mtr;
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	rseg = trx->rseg;
-
-	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
-
-		mutex_exit(&kernel_mutex);
-
-		mtr_start(&mtr);
-
-		/* Change the undo log segment states from TRX_UNDO_ACTIVE
-		to TRX_UNDO_PREPARED: these modifications to the file data
-		structure define the transaction as prepared in the
-		file-based world, at the serialization point of lsn. */
-
-		mutex_enter(&(rseg->mutex));
-
-		if (trx->insert_undo != NULL) {
-
-			/* It is not necessary to obtain trx->undo_mutex here
-			because only a single OS thread is allowed to do the
-			transaction prepare for this transaction. */
-
-			trx_undo_set_state_at_prepare(trx, trx->insert_undo,
-						      &mtr);
-		}
-
-		if (trx->update_undo) {
-			trx_undo_set_state_at_prepare(
-				trx, trx->update_undo, &mtr);
-		}
-
-		mutex_exit(&(rseg->mutex));
-
-		/*--------------*/
-		mtr_commit(&mtr);	/* This mtr commit makes the
-					transaction prepared in the file-based
-					world */
-		/*--------------*/
-		lsn = mtr.end_lsn;
-
-		mutex_enter(&kernel_mutex);
-	}
-
-	ut_ad(mutex_own(&kernel_mutex));
-
-	/*--------------------------------------*/
-	trx->conc_state = TRX_PREPARED;
-	trx_n_prepared++;
-	/*--------------------------------------*/
-
-	if (lsn) {
-		/* Depending on the my.cnf options, we may now write the log
-		buffer to the log files, making the prepared state of the
-		transaction durable if the OS does not crash. We may also
-		flush the log files to disk, making the prepared state of the
-		transaction durable also at an OS crash or a power outage.
-
-		The idea in InnoDB's group prepare is that a group of
-		transactions gather behind a trx doing a physical disk write
-		to log files, and when that physical write has been completed,
-		one of those transactions does a write which prepares the whole
-		group. Note that this group prepare will only bring benefit if
-		there are > 2 users in the database. Then at least 2 users can
-		gather behind one doing the physical log write to disk.
-
-		TODO: find out if MySQL holds some mutex when calling this.
-		That would spoil our group prepare algorithm. */
-
-		mutex_exit(&kernel_mutex);
-
-		if (srv_flush_log_at_trx_commit == 0) {
-			/* Do nothing */
-		} else if (srv_flush_log_at_trx_commit == 1) {
-			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
-				/* Write the log but do not flush it to disk */
-
-				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
-						FALSE);
-			} else {
-				/* Write the log to the log files AND flush
-				them to disk */
-
-				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
-			}
-		} else if (srv_flush_log_at_trx_commit == 2) {
-
-			/* Write the log but do not flush it to disk */
-
-			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
-		} else {
-			ut_error;
-		}
-
-		mutex_enter(&kernel_mutex);
-	}
-}
-
-/**********************************************************************//**
-Does the transaction prepare for MySQL.
-@return	0 or error number */
-UNIV_INTERN
-ulint
-trx_prepare_for_mysql(
-/*==================*/
-	trx_t*	trx)	/*!< in: trx handle */
-{
-	/* Because we do not do the prepare by sending an Innobase
-	sig to the transaction, we must here make sure that trx has been
-	started. */
-
-	ut_a(trx);
-
-	trx->op_info = "preparing";
-
-	trx_start_if_not_started(trx);
-
-	mutex_enter(&kernel_mutex);
-
-	trx_prepare_off_kernel(trx);
-
-	mutex_exit(&kernel_mutex);
-
-	trx->op_info = "";
-
-	return(0);
-}
-
-/**********************************************************************//**
-This function is used to find number of prepared transactions and
-their transaction objects for a recovery.
-@return	number of prepared transactions stored in xid_list */
-UNIV_INTERN
-int
-trx_recover_for_mysql(
-/*==================*/
-	XID*	xid_list,	/*!< in/out: prepared transactions */
-	ulint	len)		/*!< in: number of slots in xid_list */
-{
-	trx_t*	trx;
-	ulint	count = 0;
-
-	ut_ad(xid_list);
-	ut_ad(len);
-
-	/* We should set those transactions which are in the prepared state
-	to the xid_list */
-
-	mutex_enter(&kernel_mutex);
-
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
-	while (trx) {
-		if (trx->conc_state == TRX_PREPARED) {
-			xid_list[count] = trx->xid;
-
-			if (count == 0) {
-				ut_print_timestamp(stderr);
-				fprintf(stderr,
-					"  InnoDB: Starting recovery for"
-					" XA transactions...\n");
-			}
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Transaction " TRX_ID_FMT " in"
-				" prepared state after recovery\n",
-				(ullint) trx->id);
-
-			ut_print_timestamp(stderr);
-			fprintf(stderr,
-				"  InnoDB: Transaction contains changes"
-				" to %llu rows\n",
-				(ullint) trx->undo_no);
-
-			count++;
-
-			if (count == len) {
-				break;
-			}
-		}
-
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	if (count > 0){
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: %lu transactions in prepared state"
-			" after recovery\n",
-			(ulong) count);
-	}
-
-	return ((int) count);
-}
-
-/*******************************************************************//**
-This function is used to find one X/Open XA distributed transaction
-which is in the prepared state
-@return	trx or NULL; on match, the trx->xid will be invalidated */
-UNIV_INTERN
-trx_t*
-trx_get_trx_by_xid(
-/*===============*/
-	const XID*	xid)	/*!< in: X/Open XA transaction identifier */
-{
-	trx_t*	trx;
-
-	if (xid == NULL) {
-
-		return(NULL);
-	}
-
-	mutex_enter(&kernel_mutex);
-
-	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
-
-	while (trx) {
-		/* Compare two X/Open XA transaction id's: their
-		length should be the same and binary comparison
-		of gtrid_length+bqual_length bytes should be
-		the same */
-
-		if (trx->is_recovered
-		    && trx->conc_state == TRX_PREPARED
-		    && xid->gtrid_length == trx->xid.gtrid_length
-		    && xid->bqual_length == trx->xid.bqual_length
-		    && memcmp(xid->data, trx->xid.data,
-			      xid->gtrid_length + xid->bqual_length) == 0) {
-
-			/* Invalidate the XID, so that subsequent calls
-			will not find it. */
-			memset(&trx->xid, 0, sizeof(trx->xid));
-			trx->xid.formatID = -1;
-			break;
-		}
-
-		trx = UT_LIST_GET_NEXT(trx_list, trx);
-	}
-
-	mutex_exit(&kernel_mutex);
-
-	return(trx);
-}
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
new file mode 100644
index 00000000000..c3a53a18d90
--- /dev/null
+++ b/storage/innobase/trx/trx0trx.cc
@@ -0,0 +1,2071 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.cc
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+
+#ifdef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+
+#include "trx0undo.h"
+#include "trx0rseg.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "trx0roll.h"
+#include "usr0sess.h"
+#include "read0read.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "btr0sea.h"
+#include "os0proc.h"
+#include "trx0xa.h"
+#include "trx0purge.h"
+#include "ha_prototypes.h"
+#include "srv0mon.h"
+#include "ut0vec.h"
+
+/** Dummy session used currently in MySQL interface */
+UNIV_INTERN sess_t*		trx_dummy_sess = NULL;
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	trx_mutex_key;
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	trx_undo_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/*************************************************************//**
+Set detailed error message for the transaction. */
+UNIV_INTERN
+void
+trx_set_detailed_error(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction struct */
+	const char*	msg)	/*!< in: detailed error message */
+{
+	ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
+}
+
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewinded before reading from it. */
+UNIV_INTERN
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+	trx_t*	trx,	/*!< in: transaction struct */
+	FILE*	file)	/*!< in: file to read message from */
+{
+	os_file_read_string(file, trx->detailed_error,
+			    sizeof(trx->detailed_error));
+}
+
+/****************************************************************//**
+Creates and initializes a transaction object. It must be explicitly
+started with trx_start_if_not_started() before using it. The default
+isolation level is TRX_ISO_REPEATABLE_READ.
+@return transaction instance, should never be NULL */
+static
+trx_t*
+trx_create(void)
+/*============*/
+{
+	trx_t*		trx;
+	mem_heap_t*	heap;
+	ib_alloc_t*	heap_alloc;
+
+	trx = static_cast<trx_t*>(mem_zalloc(sizeof(*trx)));
+
+	mutex_create(trx_mutex_key, &trx->mutex, SYNC_TRX);
+
+	trx->magic_n = TRX_MAGIC_N;
+
+	trx->state = TRX_STATE_NOT_STARTED;
+
+	trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+	trx->no = IB_ULONGLONG_MAX;
+
+	trx->support_xa = TRUE;
+
+	trx->check_foreigns = TRUE;
+	trx->check_unique_secondary = TRUE;
+
+	trx->dict_operation = TRX_DICT_OP_NONE;
+
+	mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);
+
+	trx->error_state = DB_SUCCESS;
+
+	trx->lock.que_state = TRX_QUE_RUNNING;
+
+	trx->lock.lock_heap = mem_heap_create_typed(
+		256, MEM_HEAP_FOR_LOCK_HEAP);
+
+	trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+
+	trx->global_read_view_heap = mem_heap_create(256);
+
+	trx->xid.formatID = -1;
+
+	trx->op_info = "";
+
+	heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8);
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	/* Remember to free the vector explicitly in trx_free(). */
+	trx->autoinc_locks = ib_vector_create(heap_alloc, sizeof(void**), 4);
+
+	/* Remember to free the vector explicitly in trx_free(). */
+	heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 128);
+	heap_alloc = ib_heap_allocator_create(heap);
+
+	trx->lock.table_locks = ib_vector_create(
+		heap_alloc, sizeof(void**), 32);
+
+	/* For non-locking selects we avoid calling ut_time() too frequently.
+	Set the time here for new transactions. */
+	trx->start_time = ut_time();
+
+	return(trx);
+}
+
+/********************************************************************//**
+Creates a transaction object for background operations by the master thread.
+@return	own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_background(void)
+/*=============================*/
+{
+	trx_t*	trx;
+
+	trx = trx_create();
+
+	trx->sess = trx_dummy_sess;
+
+	return(trx);
+}
+
+/********************************************************************//**
+Creates a transaction object for MySQL.
+@return	own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_mysql(void)
+/*========================*/
+{
+	trx_t*	trx;
+
+	trx = trx_allocate_for_background();
+
+	mutex_enter(&trx_sys->mutex);
+
+	trx_sys->n_mysql_trx++;
+
+	ut_d(trx->in_mysql_trx_list = TRUE);
+	UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+
+	mutex_exit(&trx_sys->mutex);
+
+	return(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object. */
+static
+void
+trx_free(
+/*=====*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+	ut_a(trx->magic_n == TRX_MAGIC_N);
+	ut_ad(!trx->in_ro_trx_list);
+	ut_ad(!trx->in_rw_trx_list);
+
+	mutex_free(&trx->undo_mutex);
+
+	if (trx->undo_no_arr != NULL) {
+		trx_undo_arr_free(trx->undo_no_arr);
+	}
+
+	ut_a(trx->lock.wait_lock == NULL);
+	ut_a(trx->lock.wait_thr == NULL);
+
+	ut_a(!trx->has_search_latch);
+
+	ut_a(trx->dict_operation_lock_mode == 0);
+
+	if (trx->lock.lock_heap) {
+		mem_heap_free(trx->lock.lock_heap);
+	}
+
+	ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+	if (trx->global_read_view_heap) {
+		mem_heap_free(trx->global_read_view_heap);
+	}
+
+	ut_a(ib_vector_is_empty(trx->autoinc_locks));
+	/* We allocated a dedicated heap for the vector. */
+	ib_vector_free(trx->autoinc_locks);
+
+	/* We allocated a dedicated heap for the vector. */
+	ib_vector_free(trx->lock.table_locks);
+
+	mutex_free(&trx->mutex);
+
+	mem_free(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object of a background operation of the master thread. */
+UNIV_INTERN
+void
+trx_free_for_background(
+/*====================*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+	if (UNIV_UNLIKELY(trx->declared_to_be_inside_innodb)) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: Freeing a trx which is declared"
+		      " to be processing\n"
+		      "InnoDB: inside InnoDB.\n", stderr);
+		trx_print(stderr, trx, 600);
+		putc('\n', stderr);
+
+		/* This is an error but not a fatal error. We must keep
+		the counters like srv_conc_n_threads accurate. */
+		srv_conc_force_exit_innodb(trx);
+	}
+
+	if (UNIV_UNLIKELY(trx->n_mysql_tables_in_use != 0
+			  || trx->mysql_n_tables_locked != 0)) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: MySQL is freeing a thd\n"
+			"InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
+			"InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
+			(ulong) trx->n_mysql_tables_in_use,
+			(ulong) trx->mysql_n_tables_locked);
+		trx_print(stderr, trx, 600);
+		ut_print_buf(stderr, trx, sizeof(trx_t));
+		putc('\n', stderr);
+	}
+
+	ut_a(trx->state == TRX_STATE_NOT_STARTED);
+	ut_a(trx->insert_undo == NULL);
+	ut_a(trx->update_undo == NULL);
+	ut_a(trx->read_view == NULL);
+
+	trx_free(trx);
+}
+
+/********************************************************************//**
+At shutdown, frees a transaction object that is in the PREPARED state. */
+UNIV_INTERN
+void
+trx_free_prepared(
+/*==============*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	ut_a(trx_state_eq(trx, TRX_STATE_PREPARED));
+	ut_a(trx->magic_n == TRX_MAGIC_N);
+
+	trx_undo_free_prepared(trx);
+
+	assert_trx_in_rw_list(trx);
+
+	ut_a(!trx->read_only);
+
+	UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
+	ut_d(trx->in_rw_trx_list = FALSE);
+
+	trx_free(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object for MySQL. */
+UNIV_INTERN
+void
+trx_free_for_mysql(
+/*===============*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+	mutex_enter(&trx_sys->mutex);
+
+	ut_ad(trx->in_mysql_trx_list);
+	ut_d(trx->in_mysql_trx_list = FALSE);
+	UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+
+	ut_ad(trx_sys_validate_trx_list());
+
+	trx_sys->n_mysql_trx--;
+
+	mutex_exit(&trx_sys->mutex);
+
+	trx_free_for_background(trx);
+}
+
+/****************************************************************//**
+Inserts the trx handle in the trx system trx list in the right position.
+The list is sorted on the trx id so that the biggest id is at the list
+start. This function is used at the database startup to insert incomplete
+transactions to the list. */
+static
+void
+trx_list_rw_insert_ordered(
+/*=======================*/
+	trx_t*	trx)	/*!< in: trx handle */
+{
+	trx_t*	trx2;
+
+	ut_ad(!trx->read_only);
+
+	ut_a(srv_is_being_started);
+	ut_ad(!trx->in_ro_trx_list);
+	ut_ad(!trx->in_rw_trx_list);
+	ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+	ut_ad(trx->is_recovered);
+
+	for (trx2 = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+	     trx2 != NULL;
+	     trx2 = UT_LIST_GET_NEXT(trx_list, trx2)) {
+
+		assert_trx_in_rw_list(trx2);
+
+		if (trx->id >= trx2->id) {
+
+			ut_ad(trx->id > trx2->id);
+			break;
+		}
+	}
+
+	if (trx2 != NULL) {
+		trx2 = UT_LIST_GET_PREV(trx_list, trx2);
+
+		if (trx2 == NULL) {
+			UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx);
+		} else {
+			UT_LIST_INSERT_AFTER(
+				trx_list, trx_sys->rw_trx_list, trx2, trx);
+		}
+	} else {
+		UT_LIST_ADD_LAST(trx_list, trx_sys->rw_trx_list, trx);
+	}
+
+	ut_ad(!trx->in_rw_trx_list);
+	ut_d(trx->in_rw_trx_list = TRUE);
+}
+
+/****************************************************************//**
+Resurrect the transactions that were doing inserts the time of the
+crash, they need to be undone.
+@return trx_t instance  */
+static
+trx_t*
+trx_resurrect_insert(
+/*=================*/
+	trx_undo_t*	undo,		/*!< in: entry to UNDO */
+	trx_rseg_t*	rseg)		/*!< in: rollback segment */
+{
+	trx_t*		trx;
+
+	trx = trx_allocate_for_background();
+
+	trx->rseg = rseg;
+	trx->xid = undo->xid;
+	trx->id = undo->trx_id;
+	trx->insert_undo = undo;
+	trx->is_recovered = TRUE;
+
+	/* This is single-threaded startup code, we do not need the
+	protection of trx->mutex or trx_sys->mutex here. */
+
+	if (undo->state != TRX_UNDO_ACTIVE) {
+
+		/* Prepared transactions are left in the prepared state
+		waiting for a commit or abort decision from MySQL */
+
+		if (undo->state == TRX_UNDO_PREPARED) {
+
+			fprintf(stderr,
+				"InnoDB: Transaction " TRX_ID_FMT " was in the"
+				" XA prepared state.\n", trx->id);
+
+			if (srv_force_recovery == 0) {
+
+				trx->state = TRX_STATE_PREPARED;
+				trx_sys->n_prepared_trx++;
+			} else {
+				fprintf(stderr,
+					"InnoDB: Since innodb_force_recovery"
+					" > 0, we will rollback it anyway.\n");
+
+				trx->state = TRX_STATE_ACTIVE;
+			}
+		} else {
+			trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
+		}
+
+		/* We give a dummy value for the trx no; this should have no
+		relevance since purge is not interested in committed
+		transaction numbers, unless they are in the history
+		list, in which case it looks the number from the disk based
+		undo log structure */
+
+		trx->no = trx->id;
+	} else {
+		trx->state = TRX_STATE_ACTIVE;
+
+		/* A running transaction always has the number
+		field inited to IB_ULONGLONG_MAX */
+
+		trx->no = IB_ULONGLONG_MAX;
+	}
+
+	if (undo->dict_operation) {
+		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+		trx->table_id = undo->table_id;
+	}
+
+	if (!undo->empty) {
+		trx->undo_no = undo->top_undo_no + 1;
+	}
+
+	return(trx);
+}
+
+/****************************************************************//**
+Prepared transactions are left in the prepared state waiting for a
+commit or abort decision from MySQL */
+static
+void
+trx_resurrect_update_in_prepared_state(
+/*===================================*/
+	trx_t*			trx,	/*!< in,out: transaction */
+	const trx_undo_t*	undo)	/*!< in: update UNDO record */
+{
+	/* This is single-threaded startup code, we do not need the
+	protection of trx->mutex or trx_sys->mutex here. */
+
+	if (undo->state == TRX_UNDO_PREPARED) {
+		fprintf(stderr,
+			"InnoDB: Transaction " TRX_ID_FMT
+			" was in the XA prepared state.\n", trx->id);
+
+		if (srv_force_recovery == 0) {
+			if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) {
+				trx_sys->n_prepared_trx++;
+			} else {
+				ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
+			}
+
+			trx->state = TRX_STATE_PREPARED;
+		} else {
+			fprintf(stderr,
+				"InnoDB: Since innodb_force_recovery"
+				" > 0, we will rollback it anyway.\n");
+
+			trx->state = TRX_STATE_ACTIVE;
+		}
+	} else {
+		trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
+	}
+}
+
+/****************************************************************//**
+Resurrect the transactions that were doing updates the time of the
+crash, they need to be undone. */
+static
+void
+trx_resurrect_update(
+/*=================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	trx_undo_t*	undo,	/*!< in/out: update UNDO record */
+	trx_rseg_t*	rseg)	/*!< in/out: rollback segment */
+{
+	trx->rseg = rseg;
+	trx->xid = undo->xid;
+	trx->id = undo->trx_id;
+	trx->update_undo = undo;
+	trx->is_recovered = TRUE;
+
+	/* This is single-threaded startup code, we do not need the
+	protection of trx->mutex or trx_sys->mutex here. */
+
+	if (undo->state != TRX_UNDO_ACTIVE) {
+		trx_resurrect_update_in_prepared_state(trx, undo);
+
+		/* We give a dummy value for the trx number */
+
+		trx->no = trx->id;
+
+	} else {
+		trx->state = TRX_STATE_ACTIVE;
+
+		/* A running transaction always has the number field inited to
+		IB_ULONGLONG_MAX */
+
+		trx->no = IB_ULONGLONG_MAX;
+	}
+
+	if (undo->dict_operation) {
+		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+		trx->table_id = undo->table_id;
+	}
+
+	if (!undo->empty && undo->top_undo_no >= trx->undo_no) {
+
+		trx->undo_no = undo->top_undo_no + 1;
+	}
+}
+
+/****************************************************************//**
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. */
+UNIV_INTERN
+void
+trx_lists_init_at_db_start(void)
+/*============================*/
+{
+	ulint		i;
+
+	ut_a(srv_is_being_started);
+
+	UT_LIST_INIT(trx_sys->ro_trx_list);
+	UT_LIST_INIT(trx_sys->rw_trx_list);
+
+	/* Look from the rollback segments if there exist undo logs for
+	transactions */
+
+	for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+		trx_undo_t*	undo;
+		trx_rseg_t*	rseg;
+
+		rseg = trx_sys->rseg_array[i];
+
+		if (rseg == NULL) {
+			continue;
+		}
+
+		/* Resurrect transactions that were doing inserts. */
+		for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
+		     undo != NULL;
+		     undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+			trx_t*	trx;
+
+			trx = trx_resurrect_insert(undo, rseg);
+
+			trx_list_rw_insert_ordered(trx);
+		}
+
+		/* Ressurrect transactions that were doing updates. */
+		for (undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
+		     undo != NULL;
+		     undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+			trx_t*	trx;
+			ibool	trx_created;
+
+			/* Check the trx_sys->rw_trx_list first. */
+			mutex_enter(&trx_sys->mutex);
+			trx = trx_get_rw_trx_by_id(undo->trx_id);
+			mutex_exit(&trx_sys->mutex);
+
+			if (trx == NULL) {
+				trx = trx_allocate_for_background();
+				trx_created = TRUE;
+			} else {
+				trx_created = FALSE;
+			}
+
+			trx_resurrect_update(trx, undo, rseg);
+
+			if (trx_created) {
+				trx_list_rw_insert_ordered(trx);
+			}
+		}
+	}
+}
+
+/******************************************************************//**
+Assigns a rollback segment to a transaction in a round-robin fashion.
+@return	assigned rollback segment instance */
+UNIV_INLINE
+trx_rseg_t*
+trx_assign_rseg(
+/*============*/
+	ulong	max_undo_logs,	/*!< in: maximum number of UNDO logs to use */
+	ulint	n_tablespaces)	/*!< in: number of rollback tablespaces */
+{
+	ulint		i;
+	trx_rseg_t*	rseg;
+	static ulint	latest_rseg = 0;
+
+	if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) {
+		ut_a(max_undo_logs == ULONG_UNDEFINED);
+		return(NULL);
+	}
+
+	/* This breaks true round robin but that should be OK. */
+
+	ut_a(max_undo_logs > 0 && max_undo_logs <= TRX_SYS_N_RSEGS);
+
+	i = latest_rseg++;
+        i %= max_undo_logs;
+
+	/* Note: The assumption here is that there can't be any gaps in
+	the array. Once we implement more flexible rollback segment
+	management this may not hold. The assertion checks for that case. */
+
+	ut_a(trx_sys->rseg_array[0] != NULL);
+
+	/* Skip the system tablespace if we have more than one tablespace
+	defined for rollback segments. We want all UNDO records to be in
+	the non-system tablespaces. */
+
+	do {
+		rseg = trx_sys->rseg_array[i];
+		ut_a(rseg == NULL || i == rseg->id);
+
+		i = (rseg == NULL) ? 0 : i + 1;
+
+	} while (rseg == NULL
+		 || (rseg->space == 0
+		     && n_tablespaces > 0
+		     && trx_sys->rseg_array[1] != NULL));
+
+	return(rseg);
+}
+
+/****************************************************************//**
+Starts a transaction. */
+static
+void
+trx_start_low(
+/*==========*/
+	trx_t*	trx)		/*!< in: transaction */
+{
+	static ulint	n_start_times;
+
+	ut_ad(trx->rseg == NULL);
+
+	ut_ad(!trx->is_recovered);
+	ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+	ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+	/* Check whether it is an AUTOCOMMIT SELECT */
+	trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd);
+
+	trx->read_only = thd_trx_is_read_only(trx->mysql_thd);
+
+	if (!trx->auto_commit) {
+		++trx->will_lock;
+	} else if (trx->will_lock == 0) {
+		trx->read_only = TRUE;
+	}
+
+	if (!trx->read_only) {
+		trx->rseg = trx_assign_rseg(
+			srv_undo_logs, srv_undo_tablespaces);
+	}
+
+	/* Avoid making an unnecessary system call, for non-locking
+	auto-commit selects we reuse the start_time for every 32  starts. */
+	if (!trx_is_autocommit_non_locking(trx) || !(n_start_times++ % 32)) {
+		trx->start_time = ut_time();
+	}
+
+	/* The initial value for trx->no: IB_ULONGLONG_MAX is used in
+	read_view_open_now: */
+
+	trx->no = IB_ULONGLONG_MAX;
+
+	ut_a(ib_vector_is_empty(trx->autoinc_locks));
+	ut_a(ib_vector_is_empty(trx->lock.table_locks));
+
+	mutex_enter(&trx_sys->mutex);
+
+	/* If this transaction came from trx_allocate_for_mysql(),
+	trx->in_mysql_trx_list would hold. In that case, the trx->state
+	change must be protected by the trx_sys->mutex, so that
+	lock_print_info_all_transactions() will have a consistent view. */
+
+	trx->state = TRX_STATE_ACTIVE;
+
+	trx->id = trx_sys_get_new_trx_id();
+
+	ut_ad(!trx->in_rw_trx_list);
+	ut_ad(!trx->in_ro_trx_list);
+
+	if (trx->read_only) {
+
+		/* Note: The trx_sys_t::ro_trx_list doesn't really need to
+		be ordered, we should exploit this using a list type that
+		doesn't need a list wide lock to increase concurrency. */
+
+		if (!trx_is_autocommit_non_locking(trx)) {
+			UT_LIST_ADD_FIRST(trx_list, trx_sys->ro_trx_list, trx);
+			ut_d(trx->in_ro_trx_list = TRUE);
+		}
+	} else {
+
+		ut_ad(trx->rseg != NULL
+		      || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
+
+		ut_ad(!trx_is_autocommit_non_locking(trx));
+		UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx);
+		ut_d(trx->in_rw_trx_list = TRUE);
+	}
+
+	ut_ad(trx_sys_validate_trx_list());
+
+	mutex_exit(&trx_sys->mutex);
+
+	MONITOR_INC(MONITOR_TRX_ACTIVE);
+}
+
+/****************************************************************//**
+Set the transaction serialisation number. */
+static
+void
+trx_serialisation_number_get(
+/*=========================*/
+	trx_t*		trx)	/*!< in: transaction */
+{
+	trx_rseg_t*	rseg;
+
+	rseg = trx->rseg;
+
+	ut_ad(mutex_own(&rseg->mutex));
+
+	mutex_enter(&trx_sys->mutex);
+
+	trx->no = trx_sys_get_new_trx_id();
+
+	/* If the rollack segment is not empty then the
+	new trx_t::no can't be less than any trx_t::no
+	already in the rollback segment. User threads only
+	produce events when a rollback segment is empty. */
+
+	if (rseg->last_page_no == FIL_NULL) {
+		void*		ptr;
+		rseg_queue_t	rseg_queue;
+
+		rseg_queue.rseg = rseg;
+		rseg_queue.trx_no = trx->no;
+
+		mutex_enter(&purge_sys->bh_mutex);
+
+		/* This is to reduce the pressure on the trx_sys_t::mutex
+		though in reality it should make very little (read no)
+		difference because this code path is only taken when the
+		rbs is empty. */
+
+		mutex_exit(&trx_sys->mutex);
+
+		ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
+		ut_a(ptr);
+
+		mutex_exit(&purge_sys->bh_mutex);
+	} else {
+		mutex_exit(&trx_sys->mutex);
+	}
+}
+
+/****************************************************************//**
+Assign the transaction its history serialisation number and write the
+update UNDO log record to the assigned rollback segment.
+@return the LSN of the UNDO log write. */
+static
+lsn_t
+trx_write_serialisation_history(
+/*============================*/
+	trx_t*		trx)	/*!< in: transaction */
+{
+
+	mtr_t		mtr;
+	trx_rseg_t*	rseg;
+
+	rseg = trx->rseg;
+
+	mtr_start(&mtr);
+
+	/* Change the undo log segment states from TRX_UNDO_ACTIVE
+	to some other state: these modifications to the file data
+	structure define the transaction as committed in the file
+	based domain, at the serialization point of the log sequence
+	number lsn obtained below. */
+
+	if (trx->update_undo != NULL) {
+		page_t*		undo_hdr_page;
+		trx_undo_t*	undo = trx->update_undo;
+
+		/* We have to hold the rseg mutex because update
+		log headers have to be put to the history list in the
+		(serialisation) order of the UNDO trx number. This is
+		required for the purge in-memory data structures too. */
+
+		mutex_enter(&rseg->mutex);
+
+		/* Assign the transaction serialisation number and also
+		update the purge min binary heap if this is the first
+		UNDO log being written to the assigned rollback segment. */
+
+		trx_serialisation_number_get(trx);
+
+		/* It is not necessary to obtain trx->undo_mutex here
+		because only a single OS thread is allowed to do the
+		transaction commit for this transaction. */
+
+		undo_hdr_page = trx_undo_set_state_at_finish(undo, &mtr);
+
+		trx_undo_update_cleanup(trx, undo_hdr_page, &mtr);
+	} else {
+		mutex_enter(&rseg->mutex);
+	}
+
+	if (trx->insert_undo != NULL) {
+		trx_undo_set_state_at_finish(trx->insert_undo, &mtr);
+	}
+
+	mutex_exit(&rseg->mutex);
+
+	MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
+
+	/* Update the latest MySQL binlog name and offset info
+	in trx sys header if MySQL binlogging is on or the database
+	server is a MySQL replication slave */
+
+	if (trx->mysql_log_file_name
+	    && trx->mysql_log_file_name[0] != '\0') {
+
+		trx_sys_update_mysql_binlog_offset(
+			trx->mysql_log_file_name,
+			trx->mysql_log_offset,
+			TRX_SYS_MYSQL_LOG_INFO, &mtr);
+
+		trx->mysql_log_file_name = NULL;
+	}
+
+	/* The following call commits the mini-transaction, making the
+	whole transaction committed in the file-based world, at this
+	log sequence number. The transaction becomes 'durable' when
+	we write the log to disk, but in the logical sense the commit
+	in the file-based data structures (undo logs etc.) happens
+	here.
+
+	NOTE that transaction numbers, which are assigned only to
+	transactions with an update undo log, do not necessarily come
+	in exactly the same order as commit lsn's, if the transactions
+	have different rollback segments. To get exactly the same
+	order we should hold the kernel mutex up to this point,
+	adding to the contention of the kernel mutex. However, if
+	a transaction T2 is able to see modifications made by
+	a transaction T1, T2 will always get a bigger transaction
+	number and a bigger commit lsn than T1. */
+
+	/*--------------*/
+	mtr_commit(&mtr);
+	/*--------------*/
+
+	return(mtr.end_lsn);
+}
+
+/********************************************************************
+Finalize a transaction containing updates for a FTS table. */
+static
+void
+trx_finalize_for_fts_table(
+/*=======================*/
+        fts_trx_table_t*        ftt)            /* in: FTS trx table */
+{
+	fts_t*                  fts = ftt->table->fts;
+	fts_doc_ids_t*          doc_ids = ftt->added_doc_ids;
+
+	mutex_enter(&fts->bg_threads_mutex);
+
+	if (fts->fts_status & BG_THREAD_STOP) {
+		/* The table is about to be dropped, no use
+		adding anything to its work queue. */
+
+		mutex_exit(&fts->bg_threads_mutex);
+	} else {
+		mem_heap_t*     heap;
+		mutex_exit(&fts->bg_threads_mutex);
+
+		ut_a(fts->add_wq);
+
+		heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg);
+
+		ib_wqueue_add(fts->add_wq, doc_ids, heap);
+
+		/* fts_trx_table_t no longer owns the list. */
+		ftt->added_doc_ids = NULL;
+	}
+}
+
+/********************************************************************
+Finalize a transaction containing updates to FTS tables. */
+static
+void
+trx_finalize_for_fts(
+/*=================*/
+        trx_t*  trx,            /* in: transaction */
+        ibool   is_commit)      /* in: TRUE if the transaction was
+                                committed, FALSE if it was rolled back. */
+{
+	if (is_commit) {
+		const ib_rbt_node_t*    node;
+		ib_rbt_t*               tables;
+		fts_savepoint_t*        savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_last(trx->fts_trx->savepoints));
+
+		tables = savepoint->tables;
+
+		for (node = rbt_first(tables);
+		     node;
+		     node = rbt_next(tables, node)) {
+			fts_trx_table_t**        ftt;
+
+			ftt = rbt_value(fts_trx_table_t*, node);
+
+			if ((*ftt)->added_doc_ids) {
+				trx_finalize_for_fts_table(*ftt);
+			}
+		}
+	}
+
+	fts_trx_free(trx->fts_trx);
+	trx->fts_trx = NULL;
+}
+
+/****************************************************************//**
+Commits a transaction. */
+UNIV_INTERN
+void
+trx_commit(
+/*=======*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	trx_named_savept_t*	savep;
+	ib_uint64_t		lsn = 0;
+	ibool			doing_fts_commit = FALSE;
+
+	assert_trx_nonlocking_or_in_list(trx);
+	ut_ad(!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+
+	/* undo_no is non-zero if we're doing the final commit. */
+	if (trx->fts_trx && (trx->undo_no != 0)) {
+		ulint   error;
+
+		ut_a(!trx_is_autocommit_non_locking(trx));
+
+		doing_fts_commit = TRUE;
+
+		error = fts_commit(trx);
+
+		/* FTS-FIXME: Temparorily tolerate DB_DUPLICATE_KEY
+		instead of dying. This is a possible scenario if there
+		is a crash between insert to DELETED table committing
+		and transaction committing. The fix would be able to
+		return error from this function */
+		if (error != DB_SUCCESS && error != DB_DUPLICATE_KEY) {
+			/* FTS-FIXME: once we can return values from this
+			function, we should do so and signal an error
+			instead of just dying. */
+
+			ut_error;
+		}
+	}
+
+	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+		lsn = trx_write_serialisation_history(trx);
+	} else {
+		lsn = 0;
+	}
+
+	trx->must_flush_log_later = FALSE;
+
+	if (trx_is_autocommit_non_locking(trx)) {
+		ut_ad(trx->read_only);
+		ut_a(!trx->is_recovered);
+		ut_ad(trx->rseg == NULL);
+		ut_ad(!trx->in_ro_trx_list);
+		ut_ad(!trx->in_rw_trx_list);
+
+		/* Note: We are asserting without holding the lock mutex. But
+		that is OK because this transaction is not waiting and cannot
+		be rolled back and no new locks can (or should not) be added
+		becuase it is flagged as a non-locking read-only transaction. */
+
+		ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+		/* This state change is not protected by any mutex, therefore
+		there is an inherent race here around state transition during
+		printouts. We ignore this race for the sake of efficiency.
+		However, the trx_sys_t::mutex will protect the trx_t instance
+		and it cannot be removed from the mysql_trx_list and freed
+		without first acquiring the trx_sys_t::mutex. */
+
+		ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+
+		trx->state = TRX_STATE_NOT_STARTED;
+
+		MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
+	} else {
+		lock_trx_release_locks(trx);
+
+		/* Remove the transaction from the list of active
+		transactions now that it no longer holds any user locks. */
+
+		ut_ad(trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+
+		mutex_enter(&trx_sys->mutex);
+
+		assert_trx_in_list(trx);
+
+		if (trx->read_only) {
+			ut_ad(trx->rseg == NULL);
+			UT_LIST_REMOVE(trx_list, trx_sys->ro_trx_list, trx);
+			ut_d(trx->in_ro_trx_list = FALSE);
+			MONITOR_INC(MONITOR_TRX_RO_COMMIT);
+		} else {
+			UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
+			ut_d(trx->in_rw_trx_list = FALSE);
+			MONITOR_INC(MONITOR_TRX_RW_COMMIT);
+		}
+
+		/* If this transaction came from trx_allocate_for_mysql(),
+		trx->in_mysql_trx_list would hold. In that case, the
+		trx->state change must be protected by trx_sys->mutex, so that
+		lock_print_info_all_transactions() will have a consistent
+		view. */
+
+		trx->state = TRX_STATE_NOT_STARTED;
+
+		ut_ad(trx_sys_validate_trx_list());
+
+		mutex_exit(&trx_sys->mutex);
+	}
+
+	if (trx->global_read_view != NULL) {
+		read_view_remove(trx->global_read_view);
+
+		mem_heap_empty(trx->global_read_view_heap);
+
+		trx->global_read_view = NULL;
+	}
+
+	trx->read_view = NULL;
+
+	if (lsn) {
+		if (trx->insert_undo != NULL) {
+
+			trx_undo_insert_cleanup(trx);
+		}
+
+		/* NOTE that we could possibly make a group commit more
+		efficient here: call os_thread_yield here to allow also other
+		trxs to come to commit! */
+
+		/*-------------------------------------*/
+
+		/* Depending on the my.cnf options, we may now write the log
+		buffer to the log files, making the transaction durable if
+		the OS does not crash. We may also flush the log files to
+		disk, making the transaction durable also at an OS crash or a
+		power outage.
+
+		The idea in InnoDB's group commit is that a group of
+		transactions gather behind a trx doing a physical disk write
+		to log files, and when that physical write has been completed,
+		one of those transactions does a write which commits the whole
+		group. Note that this group commit will only bring benefit if
+		there are > 2 users in the database. Then at least 2 users can
+		gather behind one doing the physical log write to disk.
+
+		If we are calling trx_commit() under prepare_commit_mutex, we
+		will delay possible log write and flush to a separate function
+		trx_commit_complete_for_mysql(), which is only called when the
+		thread has released the mutex. This is to make the
+		group commit algorithm to work. Otherwise, the prepare_commit
+		mutex would serialize all commits and prevent a group of
+		transactions from gathering. */
+
+		if (trx->flush_log_later) {
+			/* Do nothing yet */
+			trx->must_flush_log_later = TRUE;
+		} else if (srv_flush_log_at_trx_commit == 0) {
+			/* Do nothing */
+		} else if (srv_flush_log_at_trx_commit == 1) {
+			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+				/* Write the log but do not flush it to disk */
+
+				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+						FALSE);
+			} else {
+				/* Write the log to the log files AND flush
+				them to disk */
+
+				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+			}
+		} else if (srv_flush_log_at_trx_commit == 2) {
+
+			/* Write the log but do not flush it to disk */
+
+			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+		} else {
+			ut_error;
+		}
+
+		trx->commit_lsn = lsn;
+	}
+
+	/* Free all savepoints, starting from the first. */
+	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	trx_roll_savepoints_free(trx, savep);
+
+	trx->rseg = NULL;
+	trx->undo_no = 0;
+	trx->last_sql_stat_start.least_undo_no = 0;
+
+	trx->will_lock = 0;
+	trx->read_only = FALSE;
+	trx->auto_commit = FALSE;
+
+        if (trx->fts_trx) {
+                trx_finalize_for_fts(trx, doing_fts_commit);
+        }
+
+	ut_ad(trx->lock.wait_thr == NULL);
+	ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+	ut_ad(!trx->in_ro_trx_list);
+	ut_ad(!trx->in_rw_trx_list);
+
+	trx->error_state = DB_SUCCESS;
+
+	/* trx->in_mysql_trx_list would hold between
+	trx_allocate_for_mysql() and trx_free_for_mysql(). It does not
+	hold for recovered transactions or system transactions. */
+}
+
+/****************************************************************//**
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. */
+UNIV_INTERN
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	ut_ad(trx->is_recovered);
+
+	if (trx->insert_undo != NULL) {
+
+		trx_undo_insert_cleanup(trx);
+	}
+
+	trx->rseg = NULL;
+	trx->undo_no = 0;
+	trx->last_sql_stat_start.least_undo_no = 0;
+
+	mutex_enter(&trx_sys->mutex);
+
+	ut_a(!trx->read_only);
+
+	UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
+
+	assert_trx_in_rw_list(trx);
+	ut_d(trx->in_rw_trx_list = FALSE);
+
+	mutex_exit(&trx_sys->mutex);
+
+	/* Change the transaction state without mutex protection, now
+	that it no longer is in the trx_list. Recovered transactions
+	are never placed in the mysql_trx_list. */
+	ut_ad(trx->is_recovered);
+	ut_ad(!trx->in_ro_trx_list);
+	ut_ad(!trx->in_rw_trx_list);
+	ut_ad(!trx->in_mysql_trx_list);
+	trx->state = TRX_STATE_NOT_STARTED;
+}
+
+/********************************************************************//**
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a new started transaction.
+@return	consistent read view */
+UNIV_INTERN
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+	trx_t*	trx)	/*!< in: active transaction */
+{
+	ut_ad(trx->state == TRX_STATE_ACTIVE);
+
+	if (trx->read_view != NULL) {
+		return(trx->read_view);
+	}
+
+	if (!trx->read_view) {
+
+		trx->read_view = read_view_open_now(
+			trx->id, trx->global_read_view_heap);
+
+		trx->global_read_view = trx->read_view;
+	}
+
+	return(trx->read_view);
+}
+
+/****************************************************************//**
+Prepares a transaction for commit/rollback. */
+UNIV_INTERN
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+	trx_t*	trx)		/*!< in/out: transaction */
+{
+	/* We are reading trx->state without holding trx_sys->mutex
+	here, because the commit or rollback should be invoked for a
+	running (or recovered prepared) transaction that is associated
+	with the current thread. */
+
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+		trx_start_low(trx);
+		/* fall through */
+	case TRX_STATE_ACTIVE:
+	case TRX_STATE_PREPARED:
+		/* If the trx is in a lock wait state, moves the waiting
+		query thread to the suspended state */
+
+		if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+			ut_a(trx->lock.wait_thr != NULL);
+			trx->lock.wait_thr->state = QUE_THR_SUSPENDED;
+			trx->lock.wait_thr = NULL;
+
+			trx->lock.que_state = TRX_QUE_RUNNING;
+		}
+
+		ut_a(trx->lock.n_active_thrs == 1);
+		return;
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		break;
+	}
+
+	ut_error;
+}
+
+/*********************************************************************//**
+Creates a commit command node struct.
+@return	own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+	mem_heap_t*	heap)	/*!< in: mem heap where created */
+{
+	commit_node_t*	node;
+
+	node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node)));
+	node->common.type  = QUE_NODE_COMMIT;
+	node->state = COMMIT_NODE_SEND;
+
+	return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return	query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_commit_step(
+/*============*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	commit_node_t*	node;
+
+	node = static_cast<commit_node_t*>(thr->run_node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
+
+	if (thr->prev_node == que_node_get_parent(node)) {
+		node->state = COMMIT_NODE_SEND;
+	}
+
+	if (node->state == COMMIT_NODE_SEND) {
+		trx_t*	trx;
+
+		node->state = COMMIT_NODE_WAIT;
+
+		trx = thr_get_trx(thr);
+
+		ut_a(trx->lock.wait_thr == NULL);
+		ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);
+
+		trx_commit_or_rollback_prepare(trx);
+
+		trx->lock.que_state = TRX_QUE_COMMITTING;
+
+		trx_commit(trx);
+
+		ut_ad(trx->lock.wait_thr == NULL);
+
+		trx->lock.que_state = TRX_QUE_RUNNING;
+
+		thr = NULL;
+	} else {
+		ut_ad(node->state == COMMIT_NODE_WAIT);
+
+		node->state = COMMIT_NODE_SEND;
+
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+trx_commit_for_mysql(
+/*=================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	/* Because we do not do the commit by sending an Innobase
+	sig to the transaction, we must here make sure that trx has been
+	started. */
+
+	ut_a(trx);
+
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+		/* Update the info whether we should skip XA steps that eat
+		CPU time.
+
+		For the duration of the transaction trx->support_xa is
+		not reread from thd so any changes in the value take
+		effect in the next transaction. This is to avoid a
+		scenario where some undo log records generated by a
+		transaction contain XA information and other undo log
+		records, generated by the same transaction do not. */
+		trx->support_xa = thd_supports_xa(trx->mysql_thd);
+
+		trx_start_low(trx);
+		/* fall through */
+	case TRX_STATE_ACTIVE:
+	case TRX_STATE_PREPARED:
+		trx->op_info = "committing";
+		trx_commit(trx);
+		MONITOR_DEC(MONITOR_TRX_ACTIVE);
+		trx->op_info = "";
+		return(DB_SUCCESS);
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		break;
+	}
+	ut_error;
+	return(DB_CORRUPTION);
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE.
+@return	0 or error number */
+UNIV_INTERN
+ulint
+trx_commit_complete_for_mysql(
+/*==========================*/
+	trx_t*	trx)	/*!< in: trx handle */
+{
+	lsn_t	lsn	= trx->commit_lsn;
+
+	ut_a(trx);
+
+	trx->op_info = "flushing log";
+
+	if (!trx->must_flush_log_later) {
+		/* Do nothing */
+	} else if (srv_flush_log_at_trx_commit == 0) {
+		/* Do nothing */
+	} else if (srv_flush_log_at_trx_commit == 1) {
+		if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+			/* Write the log but do not flush it to disk */
+
+			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+		} else {
+			/* Write the log to the log files AND flush them to
+			disk */
+
+			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+		}
+	} else if (srv_flush_log_at_trx_commit == 2) {
+
+		/* Write the log but do not flush it to disk */
+
+		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+	} else {
+		ut_error;
+	}
+
+	trx->must_flush_log_later = FALSE;
+
+	trx->op_info = "";
+
+	return(0);
+}
+
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+UNIV_INTERN
+void
+trx_mark_sql_stat_end(
+/*==================*/
+	trx_t*	trx)	/*!< in: trx handle */
+{
+	ut_a(trx);
+
+	switch (trx->state) {
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		break;
+	case TRX_STATE_NOT_STARTED:
+		trx->undo_no = 0;
+		/* fall through */
+	case TRX_STATE_ACTIVE:
+		trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+
+		if (trx->fts_trx) {
+			fts_savepoint_laststmt_refresh(trx);
+		}
+
+		return;
+	}
+
+	ut_error;
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+Caller must hold trx_sys->mutex. */
+UNIV_INTERN
+void
+trx_print_low(
+/*==========*/
+	FILE*		f,
+			/*!< in: output stream */
+	const trx_t*	trx,
+			/*!< in: transaction */
+	ulint		max_query_len,
+			/*!< in: max query length to print,
+			or 0 to use the default max length */
+	ulint		n_lock_rec,
+			/*!< in: lock_number_of_rows_locked(&trx->lock) */
+	ulint		n_lock_struct,
+			/*!< in: length of trx->lock.trx_locks */
+	ulint		heap_size)
+			/*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+{
+	ibool		newline;
+	const char*	op_info;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id);
+
+	/* trx->state cannot change from or to NOT_STARTED while we
+	are holding the trx_sys->mutex. It may change from ACTIVE to
+	PREPARED or COMMITTED. */
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+		fputs(", not started", f);
+		goto state_ok;
+	case TRX_STATE_ACTIVE:
+		fprintf(f, ", ACTIVE %lu sec",
+			(ulong) difftime(time(NULL), trx->start_time));
+		goto state_ok;
+	case TRX_STATE_PREPARED:
+		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+			(ulong) difftime(time(NULL), trx->start_time));
+		goto state_ok;
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		fputs(", COMMITTED IN MEMORY", f);
+		goto state_ok;
+	}
+	fprintf(f, ", state %lu", (ulong) trx->state);
+	ut_ad(0);
+state_ok:
+
+	/* prevent a race condition */
+	op_info = trx->op_info;
+
+	if (*op_info) {
+		putc(' ', f);
+		fputs(op_info, f);
+	}
+
+	if (trx->is_recovered) {
+		fputs(" recovered trx", f);
+	}
+
+	if (trx->declared_to_be_inside_innodb) {
+		fprintf(f, ", thread declared inside InnoDB %lu",
+			(ulong) trx->n_tickets_to_enter_innodb);
+	}
+
+	putc('\n', f);
+
+	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+		fprintf(f, "mysql tables in use %lu, locked %lu\n",
+			(ulong) trx->n_mysql_tables_in_use,
+			(ulong) trx->mysql_n_tables_locked);
+	}
+
+	newline = TRUE;
+
+	/* trx->lock.que_state of an ACTIVE transaction may change
+	while we are not holding trx->mutex. We perform a dirty read
+	for performance reasons. */
+
+	switch (trx->lock.que_state) {
+	case TRX_QUE_RUNNING:
+		newline = FALSE; break;
+	case TRX_QUE_LOCK_WAIT:
+		fputs("LOCK WAIT ", f); break;
+	case TRX_QUE_ROLLING_BACK:
+		fputs("ROLLING BACK ", f); break;
+	case TRX_QUE_COMMITTING:
+		fputs("COMMITTING ", f); break;
+	default:
+		fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
+	}
+
+	if (n_lock_struct > 0 || heap_size > 400) {
+		newline = TRUE;
+
+		fprintf(f, "%lu lock struct(s), heap size %lu,"
+			" %lu row lock(s)",
+			(ulong) n_lock_struct,
+			(ulong) heap_size,
+			(ulong) n_lock_rec);
+	}
+
+	if (trx->has_search_latch) {
+		newline = TRUE;
+		fputs(", holds adaptive hash latch", f);
+	}
+
+	if (trx->undo_no != 0) {
+		newline = TRUE;
+		fprintf(f, ", undo log entries "TRX_ID_FMT, trx->undo_no);
+	}
+
+	if (newline) {
+		putc('\n', f);
+	}
+
+	if (trx->mysql_thd != NULL) {
+		innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
+	}
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+The caller must hold lock_sys->mutex and trx_sys->mutex.
+When possible, use trx_print() instead. */
+UNIV_INTERN
+void
+trx_print_latched(
+/*==============*/
+	FILE*		f,		/*!< in: output stream */
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len)	/*!< in: max query length to print,
+					or 0 to use the default max length */
+{
+	ut_ad(lock_mutex_own());
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	trx_print_low(f, trx, max_query_len,
+		      lock_number_of_rows_locked(&trx->lock),
+		      UT_LIST_GET_LEN(trx->lock.trx_locks),
+		      mem_heap_get_size(trx->lock.lock_heap));
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys->mutex and trx_sys->mutex. */
+UNIV_INTERN
+void
+trx_print(
+/*======*/
+	FILE*		f,		/*!< in: output stream */
+	const trx_t*	trx,		/*!< in: transaction */
+	ulint		max_query_len)	/*!< in: max query length to print,
+					or 0 to use the default max length */
+{
+	ulint	n_lock_rec;
+	ulint	n_lock_struct;
+	ulint	heap_size;
+
+	lock_mutex_enter();
+	n_lock_rec = lock_number_of_rows_locked(&trx->lock);
+	n_lock_struct = UT_LIST_GET_LEN(trx->lock.trx_locks);
+	heap_size = mem_heap_get_size(trx->lock.lock_heap);
+	lock_mutex_exit();
+
+	mutex_enter(&trx_sys->mutex);
+	trx_print_low(f, trx, max_query_len,
+		      n_lock_rec, n_lock_struct, heap_size);
+	mutex_exit(&trx_sys->mutex);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Asserts that a transaction has been started.
+The caller must hold trx_sys->mutex.
+@return TRUE if started */
+UNIV_INTERN
+ibool
+trx_assert_started(
+/*===============*/
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	/* Non-locking autocommits should not hold any locks and this
+	function is only called from the locking code. */
+	assert_trx_in_list(trx);
+
+	/* trx->state can change from or to NOT_STARTED while we are holding
+	trx_sys->mutex for non-locking autocommit selects but not for other
+	types of transactions. It may change from ACTIVE to PREPARED. Unless
+	we are holding lock_sys->mutex, it may also change to COMMITTED. */
+
+	switch (trx->state) {
+	case TRX_STATE_PREPARED:
+		assert_trx_in_rw_list(trx);
+		return(TRUE);
+
+	case TRX_STATE_ACTIVE:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		return(TRUE);
+
+	case TRX_STATE_NOT_STARTED:
+		break;
+	}
+
+	ut_error;
+	return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return	TRUE if weight(a) >= weight(b) */
+UNIV_INTERN
+ibool
+trx_weight_ge(
+/*==========*/
+	const trx_t*	a,	/*!< in: the first transaction to be compared */
+	const trx_t*	b)	/*!< in: the second transaction to be compared */
+{
+	ibool	a_notrans_edit;
+	ibool	b_notrans_edit;
+
+	/* If mysql_thd is NULL for a transaction we assume that it has
+	not edited non-transactional tables. */
+
+	a_notrans_edit = a->mysql_thd != NULL
+		&& thd_has_edited_nontrans_tables(a->mysql_thd);
+
+	b_notrans_edit = b->mysql_thd != NULL
+		&& thd_has_edited_nontrans_tables(b->mysql_thd);
+
+	if (a_notrans_edit != b_notrans_edit) {
+
+		return(a_notrans_edit);
+	}
+
+	/* Either both had edited non-transactional tables or both had
+	not, we fall back to comparing the number of altered/locked
+	rows. */
+
+#if 0
+	fprintf(stderr,
+		"%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
+		__func__,
+		a->undo_no, UT_LIST_GET_LEN(a->lock.trx_locks),
+		b->undo_no, UT_LIST_GET_LEN(b->lock.trx_locks));
+#endif
+
+	return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
+}
+
+/****************************************************************//**
+Prepares a transaction. */
+static
+void
+trx_prepare(
+/*========*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	trx_rseg_t*	rseg;
+	lsn_t		lsn;
+	mtr_t		mtr;
+
+	rseg = trx->rseg;
+	/* Only fresh user transactions can be prepared.
+	Recovered transactions cannot. */
+	ut_a(!trx->is_recovered);
+
+	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+
+		mtr_start(&mtr);
+
+		/* Change the undo log segment states from TRX_UNDO_ACTIVE
+		to TRX_UNDO_PREPARED: these modifications to the file data
+		structure define the transaction as prepared in the
+		file-based world, at the serialization point of lsn. */
+
+		mutex_enter(&rseg->mutex);
+
+		if (trx->insert_undo != NULL) {
+
+			/* It is not necessary to obtain trx->undo_mutex here
+			because only a single OS thread is allowed to do the
+			transaction prepare for this transaction. */
+
+			trx_undo_set_state_at_prepare(trx, trx->insert_undo,
+						      &mtr);
+		}
+
+		if (trx->update_undo) {
+			trx_undo_set_state_at_prepare(
+				trx, trx->update_undo, &mtr);
+		}
+
+		mutex_exit(&rseg->mutex);
+
+		/*--------------*/
+		mtr_commit(&mtr);	/* This mtr commit makes the
+					transaction prepared in the file-based
+					world */
+		/*--------------*/
+		lsn = mtr.end_lsn;
+		ut_ad(lsn);
+	} else {
+		lsn = 0;
+	}
+
+	/*--------------------------------------*/
+	ut_a(trx->state == TRX_STATE_ACTIVE);
+	mutex_enter(&trx_sys->mutex);
+	trx->state = TRX_STATE_PREPARED;
+	trx_sys->n_prepared_trx++;
+	mutex_exit(&trx_sys->mutex);
+	/*--------------------------------------*/
+
+	if (lsn) {
+		/* Depending on the my.cnf options, we may now write the log
+		buffer to the log files, making the prepared state of the
+		transaction durable if the OS does not crash. We may also
+		flush the log files to disk, making the prepared state of the
+		transaction durable also at an OS crash or a power outage.
+
+		The idea in InnoDB's group prepare is that a group of
+		transactions gather behind a trx doing a physical disk write
+		to log files, and when that physical write has been completed,
+		one of those transactions does a write which prepares the whole
+		group. Note that this group prepare will only bring benefit if
+		there are > 2 users in the database. Then at least 2 users can
+		gather behind one doing the physical log write to disk.
+
+		TODO: find out if MySQL holds some mutex when calling this.
+		That would spoil our group prepare algorithm. */
+
+		if (srv_flush_log_at_trx_commit == 0) {
+			/* Do nothing */
+		} else if (srv_flush_log_at_trx_commit == 1) {
+			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+				/* Write the log but do not flush it to disk */
+
+				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+						FALSE);
+			} else {
+				/* Write the log to the log files AND flush
+				them to disk */
+
+				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+			}
+		} else if (srv_flush_log_at_trx_commit == 2) {
+
+			/* Write the log but do not flush it to disk */
+
+			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+		} else {
+			ut_error;
+		}
+	}
+}
+
+/**********************************************************************//**
+Does the transaction prepare for MySQL. */
+UNIV_INTERN
+void
+trx_prepare_for_mysql(
+/*==================*/
+	trx_t*	trx)	/*!< in/out: trx handle */
+{
+	trx_start_if_not_started_xa(trx);
+
+	trx->op_info = "preparing";
+
+	trx_prepare(trx);
+
+	trx->op_info = "";
+}
+
+/**********************************************************************//**
+This function is used to find number of prepared transactions and
+their transaction objects for a recovery.
+@return	number of prepared transactions stored in xid_list */
+UNIV_INTERN
+int
+trx_recover_for_mysql(
+/*==================*/
+	XID*	xid_list,	/*!< in/out: prepared transactions */
+	ulint	len)		/*!< in: number of slots in xid_list */
+{
+	const trx_t*	trx;
+	ulint		count = 0;
+
+	ut_ad(xid_list);
+	ut_ad(len);
+
+	/* We should set those transactions which are in the prepared state
+	to the xid_list */
+
+	mutex_enter(&trx_sys->mutex);
+
+	for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		assert_trx_in_rw_list(trx);
+
+		/* The state of a read-write transaction cannot change
+		from or to NOT_STARTED while we are holding the
+		trx_sys->mutex. It may change to PREPARED, but not if
+		trx->is_recovered. It may also change to COMMITTED. */
+		if (trx_state_eq(trx, TRX_STATE_PREPARED)) {
+			xid_list[count] = trx->xid;
+
+			if (count == 0) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: Starting recovery for"
+					" XA transactions...\n");
+			}
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Transaction " TRX_ID_FMT " in"
+				" prepared state after recovery\n",
+				trx->id);
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Transaction contains changes"
+				" to "TRX_ID_FMT" rows\n",
+				trx->undo_no);
+
+			count++;
+
+			if (count == len) {
+				break;
+			}
+		}
+	}
+
+	mutex_exit(&trx_sys->mutex);
+
+	if (count > 0){
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: %lu transactions in prepared state"
+			" after recovery\n",
+			(ulong) count);
+	}
+
+	return ((int) count);
+}
+
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state
+@return	trx on match, the trx->xid will be invalidated;
+note that the trx may have been committed, unless the caller is
+holding lock_sys->mutex */
+static __attribute__((nonnull, warn_unused_result))
+trx_t*
+trx_get_trx_by_xid_low(
+/*===================*/
+	const XID*	xid)		/*!< in: X/Open XA transaction
+					identifier */
+{
+	trx_t*		trx;
+
+	ut_ad(mutex_own(&trx_sys->mutex));
+
+	for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		assert_trx_in_rw_list(trx);
+
+		/* Compare two X/Open XA transaction id's: their
+		length should be the same and binary comparison
+		of gtrid_length+bqual_length bytes should be
+		the same */
+
+		if (trx->is_recovered
+		    && trx_state_eq(trx, TRX_STATE_PREPARED)
+		    && xid->gtrid_length == trx->xid.gtrid_length
+		    && xid->bqual_length == trx->xid.bqual_length
+		    && memcmp(xid->data, trx->xid.data,
+			      xid->gtrid_length + xid->bqual_length) == 0) {
+
+			/* Invalidate the XID, so that subsequent calls
+			will not find it. */
+			memset(&trx->xid, 0, sizeof(trx->xid));
+			trx->xid.formatID = -1;
+			break;
+		}
+	}
+
+	return(trx);
+}
+
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state
+@return	trx or NULL; on match, the trx->xid will be invalidated;
+note that the trx may have been committed, unless the caller is
+holding lock_sys->mutex */
+UNIV_INTERN
+trx_t*
+trx_get_trx_by_xid(
+/*===============*/
+	const XID*	xid)	/*!< in: X/Open XA transaction identifier */
+{
+	trx_t*	trx;
+
+	if (xid == NULL) {
+
+		return(NULL);
+	}
+
+	mutex_enter(&trx_sys->mutex);
+
+	/* Recovered/Resurrected transactions are always only on the
+	trx_sys_t::rw_trx_list. */
+	trx = trx_get_trx_by_xid_low(xid);
+
+	mutex_exit(&trx_sys->mutex);
+
+	return(trx);
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+UNIV_INTERN
+void
+trx_start_if_not_started_xa(
+/*========================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+
+		/* Update the info whether we should skip XA steps
+		that eat CPU time.
+
+		For the duration of the transaction trx->support_xa is
+		not reread from thd so any changes in the value take
+		effect in the next transaction. This is to avoid a
+		scenario where some undo generated by a transaction,
+		has XA stuff, and other undo, generated by the same
+		transaction, doesn't. */
+		trx->support_xa = thd_supports_xa(trx->mysql_thd);
+
+		trx_start_low(trx);
+		/* fall through */
+	case TRX_STATE_ACTIVE:
+		return;
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		break;
+	}
+
+	ut_error;
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+UNIV_INTERN
+void
+trx_start_if_not_started(
+/*=====================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	switch (trx->state) {
+	case TRX_STATE_NOT_STARTED:
+		trx_start_low(trx);
+		/* fall through */
+	case TRX_STATE_ACTIVE:
+		return;
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		break;
+	}
+
+	ut_error;
+}
diff --git a/storage/innobase/trx/trx0undo.c b/storage/innobase/trx/trx0undo.cc
index 3d794c69c8b..13ad2bb3755 100644
--- a/storage/innobase/trx/trx0undo.c
+++ b/storage/innobase/trx/trx0undo.cc
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /**************************************************//**
-@file trx/trx0undo.c
+@file trx/trx0undo.cc
 Transaction undo log
 
 Created 3/26/1996 Heikki Tuuri
@@ -39,6 +39,7 @@ Created 3/26/1996 Heikki Tuuri
 #include "srv0start.h"
 #include "trx0rec.h"
 #include "trx0purge.h"
+#include "srv0mon.h"
 
 /* How should the old versions in the history list be managed?
    ----------------------------------------------------------
@@ -79,7 +80,7 @@ can still remove old versions from the bottom of the stack. */
    -------------------------------------------------------------------
 latches?
 -------
-The contention of the kernel mutex should be minimized. When a transaction
+The contention of the trx_sys_t::mutex should be minimized. When a transaction
 does its first insert or modify in an index, an undo log is assigned for it.
 Then we must have an x-latch to the rollback segment header.
 	When the transaction does more modifys or rolls back, the undo log is
@@ -501,6 +502,8 @@ trx_undo_seg_create(
 			       page_get_page_no(*undo_page), mtr);
 	*id = slot_no;
 
+	MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+
 	return(err);
 }
 
@@ -607,13 +610,13 @@ trx_undo_write_xid(
 	mtr_t*		mtr)	/*!< in: mtr */
 {
 	mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT,
-			 (ulint)xid->formatID, MLOG_4BYTES, mtr);
+			 (ulint) xid->formatID, MLOG_4BYTES, mtr);
 
 	mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN,
-			 (ulint)xid->gtrid_length, MLOG_4BYTES, mtr);
+			 (ulint) xid->gtrid_length, MLOG_4BYTES, mtr);
 
 	mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN,
-			 (ulint)xid->bqual_length, MLOG_4BYTES, mtr);
+			 (ulint) xid->bqual_length, MLOG_4BYTES, mtr);
 
 	mlog_write_string(log_hdr + TRX_UNDO_XA_XID, (const byte*) xid->data,
 			  XIDDATASIZE, mtr);
@@ -628,7 +631,7 @@ trx_undo_read_xid(
 	trx_ulogf_t*	log_hdr,/*!< in: undo log header */
 	XID*		xid)	/*!< out: X/Open XA Transaction Identification */
 {
-	xid->formatID = (long)mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT);
+	xid->formatID = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT);
 
 	xid->gtrid_length
 		= (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN);
@@ -894,7 +897,6 @@ trx_undo_add_page(
 	ulint		n_reserved;
 
 	ut_ad(mutex_own(&(trx->undo_mutex)));
-	ut_ad(!mutex_own(&kernel_mutex));
 	ut_ad(mutex_own(&(trx->rseg->mutex)));
 
 	rseg = trx->rseg;
@@ -969,7 +971,6 @@ trx_undo_free_page(
 	ulint		zip_size;
 
 	ut_a(hdr_page_no != page_no);
-	ut_ad(!mutex_own(&kernel_mutex));
 	ut_ad(mutex_own(&(rseg->mutex)));
 
 	zip_size = rseg->zip_size;
@@ -1218,8 +1219,6 @@ trx_undo_seg_free(
 
 		mtr_start(&mtr);
 
-		ut_ad(!mutex_own(&kernel_mutex));
-
 		mutex_enter(&(rseg->mutex));
 
 		seg_header = trx_undo_page_get(undo->space, undo->zip_size,
@@ -1237,6 +1236,8 @@ trx_undo_seg_free(
 				&mtr);
 			trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL,
 					       &mtr);
+
+			MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
 		}
 
 		mutex_exit(&(rseg->mutex));
@@ -1355,6 +1356,7 @@ add_to_list:
 		} else {
 			UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached,
 					 undo);
+			MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
 		}
 	} else {
 		ut_ad(type == TRX_UNDO_UPDATE);
@@ -1364,6 +1366,7 @@ add_to_list:
 		} else {
 			UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached,
 					 undo);
+			MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
 		}
 	}
 
@@ -1381,8 +1384,6 @@ trx_undo_lists_init(
 /*================*/
 	trx_rseg_t*	rseg)	/*!< in: rollback segment memory object */
 {
-	ulint		page_no;
-	trx_undo_t*	undo;
 	ulint		size	= 0;
 	trx_rsegf_t*	rseg_header;
 	ulint		i;
@@ -1395,10 +1396,12 @@ trx_undo_lists_init(
 
 	mtr_start(&mtr);
 
-	rseg_header = trx_rsegf_get_new(rseg->space, rseg->zip_size,
-					rseg->page_no, &mtr);
+	rseg_header = trx_rsegf_get_new(
+		rseg->space, rseg->zip_size, rseg->page_no, &mtr);
 
 	for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+		ulint	page_no;
+
 		page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr);
 
 		/* In forced recovery: try to avoid operations which look
@@ -1409,8 +1412,11 @@ trx_undo_lists_init(
 		if (page_no != FIL_NULL
 		    && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
 
-			undo = trx_undo_mem_create_at_db_start(rseg, i,
-							       page_no, &mtr);
+			trx_undo_t*	undo;
+
+			undo = trx_undo_mem_create_at_db_start(
+				rseg, i, page_no, &mtr);
+
 			size += undo->size;
 
 			mtr_commit(&mtr);
@@ -1420,6 +1426,9 @@ trx_undo_lists_init(
 			rseg_header = trx_rsegf_get(
 				rseg->space, rseg->zip_size, rseg->page_no,
 				&mtr);
+
+			/* Found a used slot */
+			MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
 		}
 	}
 
@@ -1455,7 +1464,7 @@ trx_undo_mem_create(
 		ut_error;
 	}
 
-	undo = mem_alloc(sizeof(trx_undo_t));
+	undo = static_cast<trx_undo_t*>(mem_alloc(sizeof(*undo)));
 
 	if (undo == NULL) {
 
@@ -1639,6 +1648,8 @@ trx_undo_reuse_cached(
 		}
 
 		UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo);
+
+		MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
 	} else {
 		ut_ad(type == TRX_UNDO_UPDATE);
 
@@ -1649,6 +1660,8 @@ trx_undo_reuse_cached(
 		}
 
 		UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo);
+
+		MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
 	}
 
 	ut_ad(undo->size == 1);
@@ -1730,7 +1743,7 @@ trx_undo_mark_as_dict_operation(
 Assigns an undo log for a transaction. A new undo log is created or a cached
 undo log reused.
 @return DB_SUCCESS if undo log assign successful, possible error codes
-are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE
+are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_READ_ONLY
 DB_OUT_OF_MEMORY */
 UNIV_INTERN
 ulint
@@ -1745,7 +1758,10 @@ trx_undo_assign_undo(
 	ulint		err = DB_SUCCESS;
 
 	ut_ad(trx);
-	ut_ad(trx->rseg);
+
+	if (trx->rseg == NULL) {
+		return(DB_READ_ONLY);
+	}
 
 	rseg = trx->rseg;
 
@@ -1753,9 +1769,7 @@ trx_undo_assign_undo(
 
 	mtr_start(&mtr);
 
-	ut_ad(!mutex_own(&kernel_mutex));
-
-	mutex_enter(&(rseg->mutex));
+	mutex_enter(&rseg->mutex);
 
 	undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid,
 				     &mtr);
@@ -1804,9 +1818,6 @@ trx_undo_set_state_at_finish(
 	page_t*		undo_page;
 	ulint		state;
 
-	ut_ad(undo);
-	ut_ad(mtr);
-
 	if (undo->id >= TRX_RSEG_N_SLOTS) {
 		fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
 			(ulong) undo->id);
@@ -1919,9 +1930,10 @@ trx_undo_update_cleanup(
 	if (undo->state == TRX_UNDO_CACHED) {
 
 		UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo);
+
+		MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
 	} else {
-		ut_ad(undo->state == TRX_UNDO_TO_PURGE
-		      || undo->state == TRX_UNDO_TO_FREE);
+		ut_ad(undo->state == TRX_UNDO_TO_PURGE);
 
 		trx_undo_mem_free(undo);
 	}
@@ -1953,6 +1965,8 @@ trx_undo_insert_cleanup(
 	if (undo->state == TRX_UNDO_CACHED) {
 
 		UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached, undo);
+
+		MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
 	} else {
 		ut_ad(undo->state == TRX_UNDO_TO_FREE);
 
diff --git a/storage/innobase/usr/usr0sess.c b/storage/innobase/usr/usr0sess.cc
index 8087dcb4170..ab7ba6bea09 100644
--- a/storage/innobase/usr/usr0sess.c
+++ b/storage/innobase/usr/usr0sess.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /**************************************************//**
-@file usr/usr0sess.c
+@file usr/usr0sess.cc
 Sessions
 
 Created 6/25/1996 Heikki Tuuri
@@ -41,13 +41,12 @@ sess_open(void)
 {
 	sess_t*	sess;
 
-	ut_ad(mutex_own(&kernel_mutex));
-
-	sess = mem_alloc(sizeof(sess_t));
+	sess = static_cast<sess_t*>(mem_zalloc(sizeof(*sess)));
 
 	sess->state = SESS_ACTIVE;
 
-	sess->trx = trx_create(sess);
+	sess->trx = trx_allocate_for_background();
+	sess->trx->sess = sess;
 
 	UT_LIST_INIT(sess->graphs);
 
@@ -62,8 +61,6 @@ sess_close(
 /*=======*/
 	sess_t*	sess)	/*!< in, own: session object */
 {
-	ut_ad(!mutex_own(&kernel_mutex));
-
 	ut_a(UT_LIST_GET_LEN(sess->graphs) == 0);
 
 	trx_free_for_background(sess->trx);
diff --git a/storage/innobase/ut/ut0bh.c b/storage/innobase/ut/ut0bh.cc
index ae0b1aff207..1a3038a0d71 100644
--- a/storage/innobase/ut/ut0bh.c
+++ b/storage/innobase/ut/ut0bh.cc
@@ -1,11 +1,6 @@
 /***************************************************************************//**
-Copyright (c) 2010, 2011, Oracle Corpn. All Rights Reserved.
 
-Portions of this file contain modifications contributed and copyrighted by
-Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
-are described briefly in the InnoDB documentation. The contributions by
-Sun Microsystems are incorporated with their permission, and subject to the
-conditions contained in the file COPYING.Sun_Microsystems.
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -16,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /******************************************************************//**
-@file ut/ut0bh.c
+@file ut/ut0bh.cc
 Binary min-heap implementation.
 
 Created 2010-05-28 by Sunny Bains
diff --git a/storage/innobase/ut/ut0byte.c b/storage/innobase/ut/ut0byte.cc
index 535f74b8907..bc592edc6bf 100644
--- a/storage/innobase/ut/ut0byte.c
+++ b/storage/innobase/ut/ut0byte.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /***************************************************************//**
-@file ut/ut0byte.c
+@file ut/ut0byte.cc
 Byte utilities
 
 Created 5/11/1994 Heikki Tuuri
diff --git a/storage/innobase/ut/ut0crc32.cc b/storage/innobase/ut/ut0crc32.cc
new file mode 100644
index 00000000000..538879dd9e2
--- /dev/null
+++ b/storage/innobase/ut/ut0crc32.cc
@@ -0,0 +1,322 @@
+/*****************************************************************************
+
+Copyright (C) 2009, 2010 Facebook, Inc. All Rights Reserved.
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0crc32.cc
+CRC32 implementation from Facebook, based on the zlib implementation.
+
+Created Aug 8, 2011, Vasil Dimov, based on mysys/my_crc32.c and
+mysys/my_perf.c, contributed by Facebook under the following license.
+********************************************************************/
+
+/* Copyright (C) 2009-2010 Facebook, Inc.  All Rights Reserved.
+
+   Dual licensed under BSD license and GPLv2.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+   1. Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+   2. Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY FACEBOOK, INC. ``AS IS'' AND ANY EXPRESS OR
+   IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+   MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+   EVENT SHALL FACEBOOK, INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+   OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the Free
+   Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   You should have received a copy of the GNU General Public License along with
+   this program; if not, write to the Free Software Foundation, Inc.,
+   51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/* The below CRC32 implementation is based on the implementation included with
+ * zlib with modifications to process 8 bytes at a time and using SSE 4.2
+ * extentions when available.  The polynomial constant has been changed to
+ * match the one used by SSE 4.2 and does not return the same value as the
+ * version used by zlib.  This implementation only supports 64-bit
+ * little-endian processors.  The original zlib copyright notice follows. */
+
+/* crc32.c -- compute the CRC-32 of a buf stream
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of buf at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors.  This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+#include <string.h> /* memcmp() */
+
+#include "univ.i"
+#include "ut0crc32.h"
+
+ib_ut_crc32_t	ut_crc32;
+
+/* Precalculated table used to generate the CRC32 if the CPU does not
+have support for it */
+static ib_uint32_t	ut_crc32_slice8_table[8][256];
+static ibool		ut_crc32_slice8_table_initialized = FALSE;
+
+/* Flag that tells whether the CPU supports CRC32 or not */
+static ibool		ut_crc32_sse2_enabled = FALSE;
+
+/********************************************************************//**
+Initializes the table that is used to generate the CRC32 if the CPU does
+not have support for it. */
+static
+void
+ut_crc32_slice8_table_init()
+/*========================*/
+{
+	/* bit-reversed poly 0x1EDC6F41 (from SSE42 crc32 instruction) */
+	static const ib_uint32_t	poly = 0x82f63b78;
+	ib_uint32_t			n;
+	ib_uint32_t			k;
+	ib_uint32_t			c;
+
+	for (n = 0; n < 256; n++) {
+		c = n;
+		for (k = 0; k < 8; k++) {
+			c = (c & 1) ? (poly ^ (c >> 1)) : (c >> 1);
+		}
+		ut_crc32_slice8_table[0][n] = c;
+	}
+
+	for (n = 0; n < 256; n++) {
+		c = ut_crc32_slice8_table[0][n];
+		for (k = 1; k < 8; k++) {
+			c = ut_crc32_slice8_table[0][c & 0xFF] ^ (c >> 8);
+			ut_crc32_slice8_table[k][n] = c;
+		}
+	}
+
+	ut_crc32_slice8_table_initialized = TRUE;
+}
+
+#if defined(__GNUC__) && defined(__x86_64__)
+/********************************************************************//**
+Fetches CPU info */
+static
+void
+ut_cpuid(
+/*=====*/
+	ib_uint32_t	vend[3],	/*!< out: CPU vendor */
+	ib_uint32_t*	model,		/*!< out: CPU model */
+	ib_uint32_t*	family,		/*!< out: CPU family */
+	ib_uint32_t*	stepping,	/*!< out: CPU stepping */
+	ib_uint32_t*	features_ecx,	/*!< out: CPU features ecx */
+	ib_uint32_t*	features_edx)	/*!< out: CPU features edx */
+{
+	ib_uint32_t	sig;
+	asm("cpuid" : "=b" (vend[0]), "=c" (vend[2]), "=d" (vend[1]) : "a" (0));
+	asm("cpuid" : "=a" (sig), "=c" (*features_ecx), "=d" (*features_edx)
+	    : "a" (1)
+	    : "ebx");
+
+	*model = ((sig >> 4) & 0xF);
+	*family = ((sig >> 8) & 0xF);
+	*stepping = (sig & 0xF);
+
+	if (memcmp(vend, "GenuineIntel", 12) == 0
+	    || (memcmp(vend, "AuthenticAMD", 12) == 0 && *family == 0xF)) {
+
+		*model += (((sig >> 16) & 0xF) << 4);
+		*family += ((sig >> 20) & 0xFF);
+	}
+}
+
+/* opcodes taken from objdump of "crc32b (%%rdx), %%rcx"
+for RHEL4 support (GCC 3 doesn't support this instruction) */
+#define ut_crc32_sse42_byte \
+	asm(".byte 0xf2, 0x48, 0x0f, 0x38, 0xf0, 0x0a" \
+	    : "=c"(crc) : "c"(crc), "d"(buf)); \
+	len--, buf++
+
+/* opcodes taken from objdump of "crc32q (%%rdx), %%rcx"
+for RHEL4 support (GCC 3 doesn't support this instruction) */
+#define ut_crc32_sse42_quadword \
+	asm(".byte 0xf2, 0x48, 0x0f, 0x38, 0xf1, 0x0a" \
+	    : "=c"(crc) : "c"(crc), "d"(buf)); \
+	len -= 8, buf += 8
+#endif /* defined(__GNUC__) && defined(__x86_64__) */
+
+/********************************************************************//**
+Calculates CRC32 using CPU instructions.
+@return CRC-32C (polynomial 0x11EDC6F41) */
+UNIV_INLINE
+ib_uint32_t
+ut_crc32_sse42(
+/*===========*/
+	const byte*	buf,	/*!< in: data over which to calculate CRC32 */
+	ulint		len)	/*!< in: data length */
+{
+#if defined(__GNUC__) && defined(__x86_64__)
+	ib_uint64_t	crc = (ib_uint32_t) (-1);
+
+	ut_a(ut_crc32_sse2_enabled);
+
+	while (len && ((ulint) buf & 7)) {
+		ut_crc32_sse42_byte;
+	}
+
+	while (len >= 32) {
+		ut_crc32_sse42_quadword;
+		ut_crc32_sse42_quadword;
+		ut_crc32_sse42_quadword;
+		ut_crc32_sse42_quadword;
+	}
+
+	while (len >= 8) {
+		ut_crc32_sse42_quadword;
+	}
+
+	while (len) {
+		ut_crc32_sse42_byte;
+	}
+
+	return((ib_uint32_t) ((~crc) & 0xFFFFFFFF));
+#else
+	ut_error;
+	/* silence compiler warning about unused parameters */
+	return((ib_uint32_t) buf[len]);
+#endif /* defined(__GNUC__) && defined(__x86_64__) */
+}
+
+#define ut_crc32_slice8_byte \
+	crc = (crc >> 8) ^ ut_crc32_slice8_table[0][(crc ^ *buf++) & 0xFF]; \
+	len--
+
+#define ut_crc32_slice8_quadword \
+	crc ^= *(ib_uint64_t*) buf; \
+	crc = ut_crc32_slice8_table[7][(crc      ) & 0xFF] ^ \
+	      ut_crc32_slice8_table[6][(crc >>  8) & 0xFF] ^ \
+	      ut_crc32_slice8_table[5][(crc >> 16) & 0xFF] ^ \
+	      ut_crc32_slice8_table[4][(crc >> 24) & 0xFF] ^ \
+	      ut_crc32_slice8_table[3][(crc >> 32) & 0xFF] ^ \
+	      ut_crc32_slice8_table[2][(crc >> 40) & 0xFF] ^ \
+	      ut_crc32_slice8_table[1][(crc >> 48) & 0xFF] ^ \
+	      ut_crc32_slice8_table[0][(crc >> 56)]; \
+	len -= 8, buf += 8
+
+/********************************************************************//**
+Calculates CRC32 manually.
+@return CRC-32C (polynomial 0x11EDC6F41) */
+UNIV_INLINE
+ib_uint32_t
+ut_crc32_slice8(
+/*============*/
+	const byte*	buf,	/*!< in: data over which to calculate CRC32 */
+	ulint		len)	/*!< in: data length */
+{
+	ib_uint64_t	crc = (ib_uint32_t) (-1);
+
+	ut_a(ut_crc32_slice8_table_initialized);
+
+	while (len && ((ulint) buf & 7)) {
+		ut_crc32_slice8_byte;
+	}
+
+	while (len >= 32) {
+		ut_crc32_slice8_quadword;
+		ut_crc32_slice8_quadword;
+		ut_crc32_slice8_quadword;
+		ut_crc32_slice8_quadword;
+	}
+
+	while (len >= 8) {
+		ut_crc32_slice8_quadword;
+	}
+
+	while (len) {
+		ut_crc32_slice8_byte;
+	}
+
+	return((ib_uint32_t) ((~crc) & 0xFFFFFFFF));
+}
+
+/********************************************************************//**
+Initializes the data structures used by ut_crc32(). Does not do any
+allocations, would not hurt if called twice, but would be pointless. */
+UNIV_INTERN
+void
+ut_crc32_init()
+/*===========*/
+{
+#if defined(__GNUC__) && defined(__x86_64__)
+	ib_uint32_t	vend[3];
+	ib_uint32_t	model;
+	ib_uint32_t	family;
+	ib_uint32_t	stepping;
+	ib_uint32_t	features_ecx;
+	ib_uint32_t	features_edx;
+
+	ut_cpuid(vend, &model, &family, &stepping,
+		 &features_ecx, &features_edx);
+
+	/* Valgrind does not understand the CRC32 instructions:
+
+	vex amd64->IR: unhandled instruction bytes: 0xF2 0x48 0xF 0x38 0xF0 0xA
+	valgrind: Unrecognised instruction at address 0xad3db5.
+	Your program just tried to execute an instruction that Valgrind
+	did not recognise.  There are two possible reasons for this.
+	1. Your program has a bug and erroneously jumped to a non-code
+	   location.  If you are running Memcheck and you just saw a
+	   warning about a bad jump, it's probably your program's fault.
+	2. The instruction is legitimate but Valgrind doesn't handle it,
+	   i.e. it's Valgrind's fault.  If you think this is the case or
+	   you are not sure, please let us know and we'll try to fix it.
+	Either way, Valgrind will now raise a SIGILL signal which will
+	probably kill your program.
+
+	*/
+#ifndef UNIV_DEBUG_VALGRIND
+	ut_crc32_sse2_enabled = (features_ecx >> 20) & 1;
+#endif /* UNIV_DEBUG_VALGRIND */
+
+#endif /* defined(__GNUC__) && defined(__x86_64__) */
+
+	if (ut_crc32_sse2_enabled) {
+		ut_crc32 = ut_crc32_sse42;
+	} else {
+		ut_crc32_slice8_table_init();
+		ut_crc32 = ut_crc32_slice8;
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, " InnoDB: CPU %s crc32 instructions\n",
+		ut_crc32_sse2_enabled ? "supports" : "does not support");
+}
diff --git a/storage/innobase/ut/ut0dbg.c b/storage/innobase/ut/ut0dbg.cc
index 64fadd76d1c..37b709785b4 100644
--- a/storage/innobase/ut/ut0dbg.c
+++ b/storage/innobase/ut/ut0dbg.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /*****************************************************************//**
-@file ut/ut0dbg.c
+@file ut/ut0dbg.cc
 Debug utilities for Innobase.
 
 Created 1/30/1994 Heikki Tuuri
@@ -25,7 +25,9 @@ Created 1/30/1994 Heikki Tuuri
 
 #include "univ.i"
 #include "ut0dbg.h"
-#include "ha_prototypes.h"
+#ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h"
+#endif /* !UNIV_HOTBACKUP */
 
 #if defined(__GNUC__) && (__GNUC__ > 2)
 #else
@@ -56,7 +58,7 @@ ut_dbg_assertion_failed(
 	ut_print_timestamp(stderr);
 #ifdef UNIV_HOTBACKUP
 	fprintf(stderr, "  InnoDB: Assertion failure in file %s line %lu\n",
-		innobase_basename(file), line);
+		file, line);
 #else /* UNIV_HOTBACKUP */
 	fprintf(stderr,
 		"  InnoDB: Assertion failure in thread %lu"
diff --git a/storage/innobase/ut/ut0list.c b/storage/innobase/ut/ut0list.cc
index 895a575c535..f906061d185 100644
--- a/storage/innobase/ut/ut0list.c
+++ b/storage/innobase/ut/ut0list.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /*******************************************************************//**
-@file ut/ut0list.c
+@file ut/ut0list.cc
 A double-linked list
 
 Created 4/26/2006 Osku Salerma
@@ -36,7 +36,9 @@ ib_list_t*
 ib_list_create(void)
 /*=================*/
 {
-	ib_list_t*	list = mem_alloc(sizeof(ib_list_t));
+	ib_list_t*	list;
+
+	list = static_cast<ib_list_t*>(mem_alloc(sizeof(*list)));
 
 	list->first = NULL;
 	list->last = NULL;
@@ -55,7 +57,9 @@ ib_list_create_heap(
 /*================*/
 	mem_heap_t*	heap)	/*!< in: memory heap to use */
 {
-	ib_list_t*	list = mem_heap_alloc(heap, sizeof(ib_list_t));
+	ib_list_t*	list;
+
+	list = static_cast<ib_list_t*>(mem_heap_alloc(heap, sizeof(*list)));
 
 	list->first = NULL;
 	list->last = NULL;
@@ -122,7 +126,10 @@ ib_list_add_after(
 	void*		data,		/*!< in: data */
 	mem_heap_t*	heap)		/*!< in: memory heap to use */
 {
-	ib_list_node_t*	node = mem_heap_alloc(heap, sizeof(ib_list_node_t));
+	ib_list_node_t*	node;
+
+	node = static_cast<ib_list_node_t*>(
+		mem_heap_alloc(heap, sizeof(*node)));
 
 	node->data = data;
 
@@ -191,4 +198,6 @@ ib_list_remove(
 
 		list->last = node->prev;
 	}
+
+	node->prev = node->next = NULL;
 }
diff --git a/storage/innobase/ut/ut0mem.c b/storage/innobase/ut/ut0mem.cc
index cb6b050beca..42ad180d373 100644
--- a/storage/innobase/ut/ut0mem.c
+++ b/storage/innobase/ut/ut0mem.cc
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /********************************************************************//**
-@file ut/ut0mem.c
+@file ut/ut0mem.cc
 Memory primitives
 
 Created 5/11/1994 Heikki Tuuri
@@ -46,6 +46,11 @@ UNIV_INTERN ulint		ut_total_allocated_memory	= 0;
 /** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */
 UNIV_INTERN os_fast_mutex_t	ut_list_mutex;
 
+#ifdef UNIV_PFS_MUTEX
+/* Key to register server_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t	ut_list_mutex_key;
+#endif
+
 /** Dynamically allocated memory block */
 struct ut_mem_block_struct{
 	UT_LIST_NODE_T(ut_mem_block_t) mem_block_list;
@@ -77,7 +82,7 @@ ut_mem_init(void)
 /*=============*/
 {
 	ut_a(!ut_mem_block_list_inited);
-	os_fast_mutex_init(&ut_list_mutex);
+	os_fast_mutex_init(ut_list_mutex_key, &ut_list_mutex);
 	UT_LIST_INIT(ut_mem_block_list);
 	ut_mem_block_list_inited = TRUE;
 }
@@ -185,16 +190,16 @@ retry:
 
 	UNIV_MEM_ALLOC(ret, n + sizeof(ut_mem_block_t));
 
-	((ut_mem_block_t*)ret)->size = n + sizeof(ut_mem_block_t);
-	((ut_mem_block_t*)ret)->magic_n = UT_MEM_MAGIC_N;
+	((ut_mem_block_t*) ret)->size = n + sizeof(ut_mem_block_t);
+	((ut_mem_block_t*) ret)->magic_n = UT_MEM_MAGIC_N;
 
 	ut_total_allocated_memory += n + sizeof(ut_mem_block_t);
 
 	UT_LIST_ADD_FIRST(mem_block_list, ut_mem_block_list,
-			  ((ut_mem_block_t*)ret));
+			  ((ut_mem_block_t*) ret));
 	os_fast_mutex_unlock(&ut_list_mutex);
 
-	return((void*)((byte*)ret + sizeof(ut_mem_block_t)));
+	return((void*)((byte*) ret + sizeof(ut_mem_block_t)));
 #else /* !UNIV_HOTBACKUP */
 	void*	ret = malloc(n);
 	ut_a(ret || !assert_on_error);
@@ -222,7 +227,7 @@ ut_free(
 		return;
 	}
 
-	block = (ut_mem_block_t*)((byte*)ptr - sizeof(ut_mem_block_t));
+	block = (ut_mem_block_t*)((byte*) ptr - sizeof(ut_mem_block_t));
 
 	os_fast_mutex_lock(&ut_list_mutex);
 
@@ -242,7 +247,7 @@ ut_free(
 
 #ifndef UNIV_HOTBACKUP
 /**********************************************************************//**
-Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not
+Implements realloc. This is needed by /pars/lexyy.cc. Otherwise, you should not
 use this function because the allocation functions in mem0mem.h are the
 recommended ones in InnoDB.
 
@@ -293,7 +298,7 @@ ut_realloc(
 		return(NULL);
 	}
 
-	block = (ut_mem_block_t*)((byte*)ptr - sizeof(ut_mem_block_t));
+	block = (ut_mem_block_t*)((byte*) ptr - sizeof(ut_mem_block_t));
 
 	ut_a(block->magic_n == UT_MEM_MAGIC_N);
 
@@ -438,6 +443,33 @@ ut_strcount(
 	return(count);
 }
 
+/********************************************************************
+Concatenate 3 strings.*/
+
+char*
+ut_str3cat(
+/*=======*/
+				/* out, own: concatenated string, must be
+				freed with mem_free() */
+	const char*	s1,	/* in: string 1 */
+	const char*	s2,	/* in: string 2 */
+	const char*	s3)	/* in: string 3 */
+{
+	char*	s;
+	ulint	s1_len = strlen(s1);
+	ulint	s2_len = strlen(s2);
+	ulint	s3_len = strlen(s3);
+
+	s = static_cast<char*>(mem_alloc(s1_len + s2_len + s3_len + 1));
+
+	memcpy(s, s1, s1_len);
+	memcpy(s + s1_len, s2, s2_len);
+	memcpy(s + s1_len + s2_len, s3, s3_len);
+
+	s[s1_len + s2_len + s3_len] = '\0';
+
+	return(s);
+}
 /**********************************************************************//**
 Replace every occurrence of s1 in str with s2. Overlapping instances of s1
 are only replaced once.
@@ -457,7 +489,7 @@ ut_strreplace(
 	ulint		s1_len = strlen(s1);
 	ulint		s2_len = strlen(s2);
 	ulint		count = 0;
-	int		len_delta = (int)s2_len - (int)s1_len;
+	int		len_delta = (int) s2_len - (int) s1_len;
 
 	str_end = str + str_len;
 
@@ -467,7 +499,9 @@ ut_strreplace(
 		count = ut_strcount(str, s1);
 	}
 
-	new_str = mem_alloc(str_len + count * len_delta + 1);
+	new_str = static_cast<char*>(
+		mem_alloc(str_len + count * len_delta + 1));
+
 	ptr = new_str;
 
 	while (str) {
diff --git a/storage/innobase/ut/ut0rbt.c b/storage/innobase/ut/ut0rbt.cc
index 3d7cfa7636f..b21543a679d 100644
--- a/storage/innobase/ut/ut0rbt.c
+++ b/storage/innobase/ut/ut0rbt.cc
@@ -1,12 +1,6 @@
 /***************************************************************************//**
 
-Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved.
-
-Portions of this file contain modifications contributed and copyrighted by
-Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
-are described briefly in the InnoDB documentation. The contributions by
-Sun Microsystems are incorporated with their permission, and subject to the
-conditions contained in the file COPYING.Sun_Microsystems.
+Copyright (c) 2007, 2010, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,8 +11,8 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 /********************************************************************//**
@@ -90,8 +84,21 @@ rbt_check_ordering(
 	/* Iterate over all the nodes, comparing each node with the prev */
 	for (node = rbt_first(tree); node; node = rbt_next(tree, prev)) {
 
-		if (prev && tree->compare(prev->value, node->value) >= 0) {
-			return(FALSE);
+		if (prev) {
+			int	result;
+
+			if (tree->cmp_arg) {
+				result = tree->compare_with_arg(
+					tree->cmp_arg, prev->value,
+					node->value);
+			} else {
+				result = tree->compare(
+					prev->value, node->value);
+			}
+
+			if (result >= 0) {
+				return(FALSE);
+			}
 		}
 
 		prev = node;
@@ -267,7 +274,13 @@ rbt_tree_insert(
 	while (current != tree->nil) {
 
 		parent.last = current;
-		parent.result = tree->compare(key, current->value);
+
+		if (tree->cmp_arg) {
+			parent.result = tree->compare_with_arg(
+				tree->cmp_arg, key, current->value);
+		} else {
+			parent.result = tree->compare(key, current->value);
+		}
 
 		if (parent.result < 0) {
 			current = current->left;
@@ -440,7 +453,7 @@ rbt_find_predecessor(
 		ib_rbt_node_t*	parent = current->parent;
 
 		/* Cast away the const. */
-		prev = (ib_rbt_node_t*)current;
+		prev = (ib_rbt_node_t*) current;
 
 		while (parent != tree->root && prev == parent->left) {
 			prev = parent;
@@ -750,6 +763,30 @@ rbt_free(
 }
 
 /**********************************************************************//**
+Create an instance of a red black tree, whose comparison function takes
+an argument
+@return	an empty rb tree */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create_arg_cmp(
+/*===============*/
+	size_t		sizeof_value,		/*!< in: sizeof data item */
+	ib_rbt_arg_compare
+			compare,		/*!< in: fn to compare items */
+	const void*	cmp_arg)		/*!< in: compare fn arg */
+{
+	ib_rbt_t*       tree;
+
+	ut_a(cmp_arg);
+
+	tree = rbt_create(sizeof_value, NULL);
+	tree->cmp_arg = cmp_arg;
+	tree->compare_with_arg = compare;
+
+	return(tree);
+}
+
+/**********************************************************************//**
 Create an instance of a red black tree.
 @return	an empty rb tree */
 UNIV_INTERN
@@ -868,7 +905,14 @@ rbt_lookup(
 
 	/* Regular binary search. */
 	while (current != tree->nil) {
-		int	result = tree->compare(key, current->value);
+		int	result;
+
+		if (tree->cmp_arg) {
+			result = tree->compare_with_arg(
+				tree->cmp_arg, key, current->value);
+		} else {
+			result = tree->compare(key, current->value);
+		}
 
 		if (result < 0) {
 			current = current->left;
@@ -943,7 +987,14 @@ rbt_lower_bound(
 	ib_rbt_node_t*	current = ROOT(tree);
 
 	while (current != tree->nil) {
-		int result = tree->compare(key, current->value);
+		int	result;
+
+		if (tree->cmp_arg) {
+			result = tree->compare_with_arg(
+				tree->cmp_arg, key, current->value);
+		} else {
+			result = tree->compare(key, current->value);
+		}
 
 		if (result > 0) {
 
@@ -977,7 +1028,14 @@ rbt_upper_bound(
 	ib_rbt_node_t*	current = ROOT(tree);
 
 	while (current != tree->nil) {
-		int result = tree->compare(key, current->value);
+		int	result;
+
+		if (tree->cmp_arg) {
+			result = tree->compare_with_arg(
+				tree->cmp_arg, key, current->value);
+		} else {
+			result = tree->compare(key, current->value);
+		}
 
 		if (result > 0) {
 
@@ -1017,7 +1075,13 @@ rbt_search(
 	while (current != tree->nil) {
 
 		parent->last = current;
-		parent->result = tree->compare(key, current->value);
+
+		if (tree->cmp_arg) {
+			parent->result = tree->compare_with_arg(
+				tree->cmp_arg, key, current->value);
+		} else {
+			parent->result = tree->compare(key, current->value);
+		}
 
 		if (parent->result > 0) {
 			current = current->right;
@@ -1042,7 +1106,10 @@ rbt_search_cmp(
 	const ib_rbt_t*	tree,			/*!< in: rb tree */
 	ib_rbt_bound_t*	parent,			/*!< in: search bounds */
 	const void*	key,			/*!< in: key to search */
-	ib_rbt_compare	compare)		/*!< in: fn to compare items */
+	ib_rbt_compare	compare,		/*!< in: fn to compare items */
+	ib_rbt_arg_compare
+			arg_compare)		/*!< in: fn to compare items
+						with argument */
 {
 	ib_rbt_node_t*	current = ROOT(tree);
 
@@ -1053,7 +1120,14 @@ rbt_search_cmp(
 	while (current != tree->nil) {
 
 		parent->last = current;
-		parent->result = compare(key, current->value);
+
+		if (arg_compare) {
+			ut_ad(tree->cmp_arg);
+			parent->result = arg_compare(
+				tree->cmp_arg, key, current->value);
+		} else {
+			parent->result = compare(key, current->value);
+		}
 
 		if (parent->result > 0) {
 			current = current->right;
@@ -1199,7 +1273,7 @@ rbt_merge_uniq_destructive(
 	for (src_node = (ib_rbt_node_t*) rbt_first(src); src_node; /* */) {
 		ib_rbt_node_t*	prev = src_node;
 
-		src_node = (ib_rbt_node_t*)rbt_next(src, prev);
+		src_node = (ib_rbt_node_t*) rbt_next(src, prev);
 
 		/* Skip duplicates. */
 		if (rbt_search(dst, &parent, prev->value) != 0) {
diff --git a/storage/innobase/ut/ut0rnd.c b/storage/innobase/ut/ut0rnd.cc
index cefd0990ecc..3b4d7381181 100644
--- a/storage/innobase/ut/ut0rnd.c
+++ b/storage/innobase/ut/ut0rnd.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /***************************************************************//**
-@file ut/ut0rnd.c
+@file ut/ut0rnd.cc
 Random numbers and hashing
 
 Created 5/11/1994 Heikki Tuuri
@@ -59,14 +59,14 @@ ut_find_prime(
 		pow2 = 2 * pow2;
 	}
 
-	if ((double)n < 1.05 * (double)pow2) {
-		n = (ulint) ((double)n * UT_RANDOM_1);
+	if ((double) n < 1.05 * (double) pow2) {
+		n = (ulint) ((double) n * UT_RANDOM_1);
 	}
 
 	pow2 = 2 * pow2;
 
-	if ((double)n > 0.95 * (double)pow2) {
-		n = (ulint) ((double)n * UT_RANDOM_2);
+	if ((double) n > 0.95 * (double) pow2) {
+		n = (ulint) ((double) n * UT_RANDOM_2);
 	}
 
 	if (n > pow2 - 20) {
@@ -77,7 +77,7 @@ ut_find_prime(
 	n more random (especially, if it was not near
 	a power of 2), we then multiply it by a random number. */
 
-	n = (ulint) ((double)n * UT_RANDOM_3);
+	n = (ulint) ((double) n * UT_RANDOM_3);
 
 	for (;; n++) {
 		i = 2;
diff --git a/storage/innobase/ut/ut0ut.c b/storage/innobase/ut/ut0ut.cc
index 117a777cb98..2268cfd2493 100644
--- a/storage/innobase/ut/ut0ut.c
+++ b/storage/innobase/ut/ut0ut.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +17,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 *****************************************************************************/
 
 /***************************************************************//**
-@file ut/ut0ut.c
+@file ut/ut0ut.cc
 Various utilities for Innobase.
 
 Created 5/11/1994 Heikki Tuuri
@@ -25,6 +25,10 @@ Created 5/11/1994 Heikki Tuuri
 
 #include "ut0ut.h"
 
+#ifndef UNIV_INNOCHECKSUM
+
+#include "ut0sort.h"
+
 #ifdef UNIV_NONINL
 #include "ut0ut.ic"
 #endif
@@ -93,26 +97,6 @@ reimplement this function. */
 #define	ut_gettimeofday		gettimeofday
 #endif
 
-/********************************************************//**
-Gets the high 32 bits in a ulint. That is makes a shift >> 32,
-but since there seem to be compiler bugs in both gcc and Visual C++,
-we do this by a special conversion.
-@return	a >> 32 */
-UNIV_INTERN
-ulint
-ut_get_high32(
-/*==========*/
-	ulint	a)	/*!< in: ulint */
-{
-	ib_int64_t	i;
-
-	i = (ib_int64_t)a;
-
-	i = i >> 32;
-
-	return((ulint)i);
-}
-
 /**********************************************************//**
 Returns system time. We do not specify the format of the time returned:
 the only way to manipulate it is to use the function ut_difftime.
@@ -224,6 +208,8 @@ ut_difftime(
 	return(difftime(time2, time1));
 }
 
+#endif /* !UNIV_INNOCHECKSUM */
+
 /**********************************************************//**
 Prints a timestamp to a file. */
 UNIV_INTERN
@@ -238,23 +224,23 @@ ut_print_timestamp(
 	GetLocalTime(&cal_tm);
 
 	fprintf(file,"%02d%02d%02d %2d:%02d:%02d",
-		(int)cal_tm.wYear % 100,
-		(int)cal_tm.wMonth,
-		(int)cal_tm.wDay,
-		(int)cal_tm.wHour,
-		(int)cal_tm.wMinute,
-		(int)cal_tm.wSecond);
+		(int) cal_tm.wYear % 100,
+		(int) cal_tm.wMonth,
+		(int) cal_tm.wDay,
+		(int) cal_tm.wHour,
+		(int) cal_tm.wMinute,
+		(int) cal_tm.wSecond);
 #else
-	struct tm  cal_tm;
 	struct tm* cal_tm_ptr;
 	time_t	   tm;
 
-	time(&tm);
-
 #ifdef HAVE_LOCALTIME_R
+	struct tm  cal_tm;
+	time(&tm);
 	localtime_r(&tm, &cal_tm);
 	cal_tm_ptr = &cal_tm;
 #else
+	time(&tm);
 	cal_tm_ptr = localtime(&tm);
 #endif
 	fprintf(file,"%02d%02d%02d %2d:%02d:%02d",
@@ -267,6 +253,8 @@ ut_print_timestamp(
 #endif
 }
 
+#ifndef UNIV_INNOCHECKSUM
+
 /**********************************************************//**
 Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
 UNIV_INTERN
@@ -281,23 +269,23 @@ ut_sprintf_timestamp(
 	GetLocalTime(&cal_tm);
 
 	sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
-		(int)cal_tm.wYear % 100,
-		(int)cal_tm.wMonth,
-		(int)cal_tm.wDay,
-		(int)cal_tm.wHour,
-		(int)cal_tm.wMinute,
-		(int)cal_tm.wSecond);
+		(int) cal_tm.wYear % 100,
+		(int) cal_tm.wMonth,
+		(int) cal_tm.wDay,
+		(int) cal_tm.wHour,
+		(int) cal_tm.wMinute,
+		(int) cal_tm.wSecond);
 #else
-	struct tm  cal_tm;
 	struct tm* cal_tm_ptr;
 	time_t	   tm;
 
-	time(&tm);
-
 #ifdef HAVE_LOCALTIME_R
+	struct tm  cal_tm;
+	time(&tm);
 	localtime_r(&tm, &cal_tm);
 	cal_tm_ptr = &cal_tm;
 #else
+	time(&tm);
 	cal_tm_ptr = localtime(&tm);
 #endif
 	sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
@@ -326,23 +314,23 @@ ut_sprintf_timestamp_without_extra_chars(
 	GetLocalTime(&cal_tm);
 
 	sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d",
-		(int)cal_tm.wYear % 100,
-		(int)cal_tm.wMonth,
-		(int)cal_tm.wDay,
-		(int)cal_tm.wHour,
-		(int)cal_tm.wMinute,
-		(int)cal_tm.wSecond);
+		(int) cal_tm.wYear % 100,
+		(int) cal_tm.wMonth,
+		(int) cal_tm.wDay,
+		(int) cal_tm.wHour,
+		(int) cal_tm.wMinute,
+		(int) cal_tm.wSecond);
 #else
-	struct tm  cal_tm;
 	struct tm* cal_tm_ptr;
 	time_t	   tm;
 
-	time(&tm);
-
 #ifdef HAVE_LOCALTIME_R
+	struct tm  cal_tm;
+	time(&tm);
 	localtime_r(&tm, &cal_tm);
 	cal_tm_ptr = &cal_tm;
 #else
+	time(&tm);
 	cal_tm_ptr = localtime(&tm);
 #endif
 	sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d",
@@ -370,25 +358,25 @@ ut_get_year_month_day(
 
 	GetLocalTime(&cal_tm);
 
-	*year = (ulint)cal_tm.wYear;
-	*month = (ulint)cal_tm.wMonth;
-	*day = (ulint)cal_tm.wDay;
+	*year = (ulint) cal_tm.wYear;
+	*month = (ulint) cal_tm.wMonth;
+	*day = (ulint) cal_tm.wDay;
 #else
-	struct tm  cal_tm;
 	struct tm* cal_tm_ptr;
 	time_t	   tm;
 
-	time(&tm);
-
 #ifdef HAVE_LOCALTIME_R
+	struct tm  cal_tm;
+	time(&tm);
 	localtime_r(&tm, &cal_tm);
 	cal_tm_ptr = &cal_tm;
 #else
+	time(&tm);
 	cal_tm_ptr = localtime(&tm);
 #endif
-	*year = (ulint)cal_tm_ptr->tm_year + 1900;
-	*month = (ulint)cal_tm_ptr->tm_mon + 1;
-	*day = (ulint)cal_tm_ptr->tm_mday;
+	*year = (ulint) cal_tm_ptr->tm_year + 1900;
+	*month = (ulint) cal_tm_ptr->tm_mon + 1;
+	*day = (ulint) cal_tm_ptr->tm_mday;
 #endif
 }
 #endif /* UNIV_HOTBACKUP */
@@ -438,13 +426,13 @@ ut_print_buf(
 
 	fprintf(file, " len %lu; hex ", len);
 
-	for (data = (const byte*)buf, i = 0; i < len; i++) {
+	for (data = (const byte*) buf, i = 0; i < len; i++) {
 		fprintf(file, "%02lx", (ulong)*data++);
 	}
 
 	fputs("; asc ", file);
 
-	data = (const byte*)buf;
+	data = (const byte*) buf;
 
 	for (i = 0; i < len; i++) {
 		int	c = (int) *data++;
@@ -454,6 +442,21 @@ ut_print_buf(
 	putc(';', file);
 }
 
+/**********************************************************************//**
+Sort function for ulint arrays. */
+UNIV_INTERN
+void
+ut_ulint_sort(
+/*==========*/
+	ulint*	arr,		/*!< in/out: array to sort */
+	ulint*	aux_arr,	/*!< in/out: aux array to use in sort */
+	ulint	low,		/*!< in: lower bound */
+	ulint	high)		/*!< in: upper bound */
+{
+	UT_SORT_FUNCTION_BODY(ut_ulint_sort, arr, aux_arr, low, high,
+			      ut_ulint_cmp);
+}
+
 /*************************************************************//**
 Calculates fast the number rounded up to the nearest power of 2.
 @return	first power of 2 which is >= n */
@@ -579,6 +582,26 @@ ut_copy_file(
 #ifdef __WIN__
 # include <stdarg.h>
 /**********************************************************************//**
+A substitute for vsnprintf(3), formatted output conversion into
+a limited buffer. Note: this function DOES NOT return the number of
+characters that would have been printed if the buffer was unlimited because
+VC's _vsnprintf() returns -1 in this case and we would need to call
+_vscprintf() in addition to estimate that but we would need another copy
+of "ap" for that and VC does not provide va_copy(). */
+UNIV_INTERN
+void
+ut_vsnprintf(
+/*=========*/
+	char*		str,	/*!< out: string */
+	size_t		size,	/*!< in: str size */
+	const char*	fmt,	/*!< in: format */
+	va_list		ap)	/*!< in: format values */
+{
+	_vsnprintf(str, size, fmt, ap);
+	str[size - 1] = '\0';
+}
+
+/**********************************************************************//**
 A substitute for snprintf(3), formatted output conversion into
 a limited buffer.
 @return number of characters that would have been printed if the size
@@ -634,6 +657,8 @@ ut_strerr(
 		return("Success, record lock created");
 	case DB_ERROR:
 		return("Generic error");
+	case DB_READ_ONLY:
+		return("Read only transaction");
 	case DB_INTERRUPTED:
 		return("Operation interrupted");
 	case DB_OUT_OF_MEMORY:
@@ -712,14 +737,14 @@ ut_strerr(
 		return("No index on referencing keys in referencing table");
 	case DB_PARENT_NO_INDEX:
 		return("No index on referenced keys in referenced table");
+	case DB_FTS_INVALID_DOCID:
+		return("FTS Doc ID cannot be zero");
 	case DB_INDEX_CORRUPT:
 		return("Index corrupted");
 	case DB_UNDO_RECORD_TOO_BIG:
 		return("Undo record too big");
 	case DB_END_OF_INDEX:
 		return("End of index");
-	case DB_TABLE_IN_FK_CHECK:
-		return("Table is being used in foreign key check");
 	/* do not add default: in order to produce a warning if new code
 	is added to the enum but not added here */
 	}
@@ -732,3 +757,4 @@ ut_strerr(
 	/* NOT REACHED */
 	return("Unknown error");
 }
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/ut/ut0vec.c b/storage/innobase/ut/ut0vec.cc
index 45f2bc9771f..8ac5d9dc5d3 100644
--- a/storage/innobase/ut/ut0vec.c
+++ b/storage/innobase/ut/ut0vec.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,13 +11,13 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 /*******************************************************************//**
-@file ut/ut0vec.c
+@file ut/ut0vec.cc
 A vector of pointers to data items
 
 Created 4/6/2006 Osku Salerma
@@ -27,53 +27,50 @@ Created 4/6/2006 Osku Salerma
 #ifdef UNIV_NONINL
 #include "ut0vec.ic"
 #endif
-#include <string.h>
+#include "mem0mem.h"
 
-/****************************************************************//**
-Create a new vector with the given initial size.
-@return	vector */
+/********************************************************************
+Create a new vector with the given initial size. */
 UNIV_INTERN
 ib_vector_t*
 ib_vector_create(
 /*=============*/
-	mem_heap_t*	heap,	/*!< in: heap */
-	ulint		size)	/*!< in: initial size */
+					/* out: vector */
+	ib_alloc_t*	allocator,	/* in: vector allocator */
+	ulint		sizeof_value,	/* in: size of data item */
+	ulint		size)		/* in: initial size */
 {
 	ib_vector_t*	vec;
 
 	ut_a(size > 0);
 
-	vec = mem_heap_alloc(heap, sizeof(*vec));
+	vec = static_cast<ib_vector_t*>(allocator->mem_malloc(allocator, sizeof(*vec)));
 
-	vec->heap = heap;
-	vec->data = mem_heap_alloc(heap, sizeof(void*) * size);
 	vec->used = 0;
 	vec->total = size;
+	vec->allocator = allocator;
+	vec->sizeof_value = sizeof_value;
+	vec->data = static_cast<void*>(
+		allocator->mem_malloc(allocator, vec->sizeof_value * size));
 
 	return(vec);
 }
 
-/****************************************************************//**
-Push a new element to the vector, increasing its size if necessary. */
+/********************************************************************
+Resize the vector, currently the vector can only grow and we
+expand the number of elements it can hold by 2 times. */
 UNIV_INTERN
 void
-ib_vector_push(
-/*===========*/
-	ib_vector_t*	vec,	/*!< in: vector */
-	void*		elem)	/*!< in: data element */
+ib_vector_resize(
+/*=============*/
+	ib_vector_t*	vec)		/* in: vector */
 {
-	if (vec->used >= vec->total) {
-		void**	new_data;
-		ulint	new_total = vec->total * 2;
-
-		new_data = mem_heap_alloc(vec->heap,
-					  sizeof(void*) * new_total);
-		memcpy(new_data, vec->data, sizeof(void*) * vec->total);
+	ulint		new_total = vec->total * 2;
+	ulint		old_size = vec->used * vec->sizeof_value;
+	ulint		new_size = new_total * vec->sizeof_value;
 
-		vec->data = new_data;
-		vec->total = new_total;
-	}
+	vec->data = static_cast<void*>(vec->allocator->mem_resize(
+		vec->allocator, vec->data, old_size, new_size));
 
-	vec->data[vec->used] = elem;
-	vec->used++;
+	vec->total = new_total;
 }
diff --git a/storage/innobase/ut/ut0wqueue.c b/storage/innobase/ut/ut0wqueue.cc
index d32086bdfc4..6d410524fe7 100644
--- a/storage/innobase/ut/ut0wqueue.c
+++ b/storage/innobase/ut/ut0wqueue.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -11,15 +11,15 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 
 *****************************************************************************/
 
 #include "ut0wqueue.h"
 
 /*******************************************************************//**
-@file ut/ut0wqueue.c
+@file ut/ut0wqueue.cc
 A work queue
 
 Created 4/26/2006 Osku Salerma
@@ -33,7 +33,7 @@ ib_wqueue_t*
 ib_wqueue_create(void)
 /*===================*/
 {
-	ib_wqueue_t*	wq = mem_alloc(sizeof(ib_wqueue_t));
+	ib_wqueue_t*	wq = static_cast<ib_wqueue_t*>(mem_alloc(sizeof(*wq)));
 
 	/* Function ib_wqueue_create() has not been used anywhere,
 	not necessary to instrument this mutex */
@@ -53,8 +53,6 @@ ib_wqueue_free(
 /*===========*/
 	ib_wqueue_t*	wq)	/*!< in: work queue */
 {
-	ut_a(!ib_list_get_first(wq->items));
-
 	mutex_free(&wq->mutex);
 	ib_list_free(wq->items);
 	os_event_free(wq->event);
@@ -118,3 +116,60 @@ ib_wqueue_wait(
 
 	return(node->data);
 }
+
+
+/********************************************************************
+Wait for a work item to appear in the queue for specified time. */
+
+void*
+ib_wqueue_timedwait(
+/*================*/
+					/* out: work item or NULL on timeout*/
+	ib_wqueue_t*	wq,		/* in: work queue */
+	ib_time_t	wait_in_usecs)	/* in: wait time in micro seconds */
+{
+	ib_list_node_t*	node = NULL;
+
+	for (;;) {
+		ulint		error;
+		ib_int64_t	sig_count;
+
+		mutex_enter(&wq->mutex);
+
+		node = ib_list_get_first(wq->items);
+
+		if (node) {
+			ib_list_remove(wq->items, node);
+
+			mutex_exit(&wq->mutex);
+			break;
+		}
+
+		sig_count = os_event_reset(wq->event);
+
+		mutex_exit(&wq->mutex);
+
+		error = os_event_wait_time_low(wq->event,
+					       (ulint) wait_in_usecs,
+					       sig_count);
+
+		if (error == OS_SYNC_TIME_EXCEEDED) {
+			break;
+		}
+	}
+
+	return(node ? node->data : NULL);
+}
+
+/********************************************************************
+Check if queue is empty. */
+
+ibool
+ib_wqueue_is_empty(
+/*===============*/
+					/* out: TRUE if queue empty
+					else FALSE */
+	const ib_wqueue_t*	wq)	/* in: work queue */
+{
+	return(ib_list_is_empty(wq->items));
+}
diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
index 2b67f29f9f8..1f0c7644035 100644
--- a/storage/maria/ha_maria.cc
+++ b/storage/maria/ha_maria.cc
@@ -3746,7 +3746,7 @@ int ha_maria::multi_range_read_next(range_id_t *range_info)
 ha_rows ha_maria::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                                void *seq_init_param, 
                                                uint n_ranges, uint *bufsz,
-                                               uint *flags, COST_VECT *cost)
+                                               uint *flags, Cost_estimate *cost)
 {
   /*
     This call is here because there is no location where this->table would
@@ -3760,7 +3760,7 @@ ha_rows ha_maria::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 
 ha_rows ha_maria::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
                                        uint key_parts, uint *bufsz, 
-                                       uint *flags, COST_VECT *cost)
+                                       uint *flags, Cost_estimate *cost)
 {
   ds_mrr.init(this, table);
   return ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, flags, cost);
diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h
index 35c98cc52d7..aa8c0f17d48 100644
--- a/storage/maria/ha_maria.h
+++ b/storage/maria/ha_maria.h
@@ -191,10 +191,10 @@ public:
   ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                       void *seq_init_param, 
                                       uint n_ranges, uint *bufsz,
-                                      uint *flags, COST_VECT *cost);
+                                      uint *flags, Cost_estimate *cost);
   ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
                                 uint key_parts, uint *bufsz, 
-                                uint *flags, COST_VECT *cost);
+                                uint *flags, Cost_estimate *cost);
   int multi_range_read_explain_info(uint mrr_mode, char *str, size_t size);
   
   /* Index condition pushdown implementation */
diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc
index 4fbd94a1a3b..bf341d29d0b 100644
--- a/storage/myisam/ha_myisam.cc
+++ b/storage/myisam/ha_myisam.cc
@@ -2213,7 +2213,7 @@ int ha_myisam::multi_range_read_next(range_id_t *range_info)
 ha_rows ha_myisam::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                                void *seq_init_param, 
                                                uint n_ranges, uint *bufsz,
-                                               uint *flags, COST_VECT *cost)
+                                               uint *flags, Cost_estimate *cost)
 {
   /*
     This call is here because there is no location where this->table would
@@ -2227,7 +2227,7 @@ ha_rows ha_myisam::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 
 ha_rows ha_myisam::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
                                          uint key_parts, uint *bufsz, 
-                                         uint *flags, COST_VECT *cost)
+                                         uint *flags, Cost_estimate *cost)
 {
   ds_mrr.init(this, table);
   return ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, flags, cost);
diff --git a/storage/myisam/ha_myisam.h b/storage/myisam/ha_myisam.h
index 579ef061af4..7b0e9e8a1d7 100644
--- a/storage/myisam/ha_myisam.h
+++ b/storage/myisam/ha_myisam.h
@@ -170,10 +170,10 @@ public:
   ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                       void *seq_init_param, 
                                       uint n_ranges, uint *bufsz,
-                                      uint *flags, COST_VECT *cost);
+                                      uint *flags, Cost_estimate *cost);
   ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
                                 uint key_parts, uint *bufsz, 
-                                uint *flags, COST_VECT *cost);
+                                uint *flags, Cost_estimate *cost);
   int multi_range_read_explain_info(uint mrr_mode, char *str, size_t size);
 
   /* Index condition pushdown implementation */
diff --git a/storage/perfschema/CMakeLists.txt b/storage/perfschema/CMakeLists.txt
index 7702b7365af..16087740da3 100644
--- a/storage/perfschema/CMakeLists.txt
+++ b/storage/perfschema/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2009, 2011, Oracle and/or its affiliates. All rights reserved.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -21,56 +21,174 @@ INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}
 
 ADD_DEFINITIONS(-DMYSQL_SERVER)
 
-SET(PERFSCHEMA_SOURCES ha_perfschema.h
-  pfs_column_types.h
-  pfs_column_values.h
-  pfs_events_waits.h
-  pfs_global.h
-  pfs.h
-  pfs_instr.h
-  pfs_instr_class.h
-  pfs_lock.h
-  pfs_atomic.h
-  pfs_server.h
-  pfs_stat.h
-  pfs_engine_table.h
-  pfs_timer.h
-  table_all_instr.h
-  table_events_waits.h
-  table_events_waits_summary.h
-  table_ews_global_by_event_name.h
-  table_file_instances.h
-  table_file_summary.h
-  table_performance_timers.h
-  table_setup_consumers.h
-  table_setup_instruments.h
-  table_setup_timers.h
-  table_sync_instances.h
-  table_threads.h
-  ha_perfschema.cc
-  pfs.cc
-  pfs_column_values.cc
-  pfs_events_waits.cc
-  pfs_global.cc
-  pfs_instr.cc
-  pfs_instr_class.cc
-  pfs_server.cc
-  pfs_engine_table.cc
-  pfs_timer.cc
-  table_all_instr.cc
-  table_events_waits.cc
-  table_events_waits_summary.cc
-  table_ews_global_by_event_name.cc
-  table_file_instances.cc
-  table_file_summary.cc
-  table_performance_timers.cc
-  table_setup_consumers.cc
-  table_setup_instruments.cc
-  table_setup_timers.cc
-  table_sync_instances.cc
-  table_threads.cc
-  pfs_atomic.cc
-  pfs_check.cc
+# Gen_pfs_lex_token
+ADD_EXECUTABLE(gen_pfs_lex_token gen_pfs_lex_token.cc)
+# gen_pfs_lex_token itself depends on ${CMAKE_CURRENT_BINARY_DIR}/sql/sql_yacc.h
+ADD_DEPENDENCIES(gen_pfs_lex_token GenServerSource)
+
+ADD_CUSTOM_COMMAND(
+  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/pfs_lex_token.h
+  COMMAND gen_pfs_lex_token > pfs_lex_token.h
+  DEPENDS gen_pfs_lex_token
+)
+
+SET(PFS_GEN_SOURCES
+  ${CMAKE_CURRENT_BINARY_DIR}/pfs_lex_token.h
+)
+
+SET_SOURCE_FILES_PROPERTIES(${PFS_GEN_SOURCES} PROPERTIES GENERATED 1)
+
+#
+# Maintainer: keep this list sorted, to avoid merge collisions.
+# Tip: ls -1 *.h, ls -1 *.cc
+#
+SET(PERFSCHEMA_SOURCES
+${PFS_GEN_SOURCES}
+ha_perfschema.h
+cursor_by_account.h
+cursor_by_host.h
+cursor_by_thread.h
+cursor_by_user.h
+pfs.h
+pfs_account.h
+pfs_atomic.h
+pfs_column_types.h
+pfs_column_values.h
+pfs_con_slice.h
+pfs_defaults.h
+pfs_digest.h
+pfs_engine_table.h
+pfs_events.h
+pfs_events_stages.h
+pfs_events_statements.h
+pfs_events_waits.h
+pfs_global.h
+pfs_host.h
+pfs_instr.h
+pfs_instr_class.h
+pfs_lock.h
+pfs_server.h
+pfs_setup_actor.h
+pfs_setup_object.h
+pfs_stat.h
+pfs_timer.h
+pfs_user.h
+pfs_visitor.h
+table_accounts.h
+table_all_instr.h
+table_esgs_by_account_by_event_name.h
+table_esgs_by_host_by_event_name.h
+table_esgs_by_thread_by_event_name.h
+table_esgs_by_user_by_event_name.h
+table_esgs_global_by_event_name.h
+table_esms_by_account_by_event_name.h
+table_esms_by_host_by_event_name.h
+table_esms_by_digest.h
+table_esms_by_thread_by_event_name.h
+table_esms_by_user_by_event_name.h
+table_esms_global_by_event_name.h
+table_events_stages.h
+table_events_statements.h
+table_events_waits.h
+table_events_waits_summary.h
+table_ews_by_account_by_event_name.h
+table_ews_by_host_by_event_name.h
+table_ews_by_thread_by_event_name.h
+table_ews_by_user_by_event_name.h
+table_ews_global_by_event_name.h
+table_file_instances.h
+table_file_summary_by_instance.h
+table_file_summary_by_event_name.h
+table_socket_instances.h
+table_socket_summary_by_instance.h
+table_socket_summary_by_event_name.h
+table_helper.h
+table_host_cache.h
+table_hosts.h
+table_os_global_by_type.h
+table_performance_timers.h
+table_setup_actors.h
+table_setup_consumers.h
+table_setup_instruments.h
+table_setup_objects.h
+table_setup_timers.h
+table_sync_instances.h
+table_threads.h
+table_tiws_by_index_usage.h
+table_tiws_by_table.h
+table_tlws_by_table.h
+table_users.h
+cursor_by_account.cc
+cursor_by_host.cc
+cursor_by_thread.cc
+cursor_by_user.cc
+ha_perfschema.cc
+pfs.cc
+pfs_account.cc
+pfs_atomic.cc
+pfs_check.cc
+pfs_column_values.cc
+pfs_con_slice.cc
+pfs_defaults.cc
+pfs_digest.cc
+pfs_engine_table.cc
+pfs_events_stages.cc
+pfs_events_statements.cc
+pfs_events_waits.cc
+pfs_global.cc
+pfs_host.cc
+pfs_instr.cc
+pfs_instr_class.cc
+pfs_server.cc
+pfs_setup_actor.cc
+pfs_setup_object.cc
+pfs_timer.cc
+pfs_user.cc
+pfs_visitor.cc
+table_accounts.cc
+table_all_instr.cc
+table_esgs_by_account_by_event_name.cc
+table_esgs_by_host_by_event_name.cc
+table_esgs_by_thread_by_event_name.cc
+table_esgs_by_user_by_event_name.cc
+table_esgs_global_by_event_name.cc
+table_esms_by_account_by_event_name.cc
+table_esms_by_host_by_event_name.cc
+table_esms_by_digest.cc
+table_esms_by_thread_by_event_name.cc
+table_esms_by_user_by_event_name.cc
+table_esms_global_by_event_name.cc
+table_events_stages.cc
+table_events_statements.cc
+table_events_waits.cc
+table_events_waits_summary.cc
+table_ews_by_account_by_event_name.cc
+table_ews_by_host_by_event_name.cc
+table_ews_by_thread_by_event_name.cc
+table_ews_by_user_by_event_name.cc
+table_ews_global_by_event_name.cc
+table_file_instances.cc
+table_file_summary_by_instance.cc
+table_file_summary_by_event_name.cc
+table_socket_instances.cc
+table_socket_summary_by_instance.cc
+table_socket_summary_by_event_name.cc
+table_helper.cc
+table_host_cache.cc
+table_hosts.cc
+table_os_global_by_type.cc
+table_performance_timers.cc
+table_setup_actors.cc
+table_setup_consumers.cc
+table_setup_instruments.cc
+table_setup_objects.cc
+table_setup_timers.cc
+table_sync_instances.cc
+table_threads.cc
+table_tiws_by_index_usage.cc
+table_tiws_by_table.cc
+table_tlws_by_table.cc
+table_users.cc
 )
 
 MYSQL_ADD_PLUGIN(perfschema ${PERFSCHEMA_SOURCES} STORAGE_ENGINE DEFAULT STATIC_ONLY)
diff --git a/storage/perfschema/cursor_by_account.cc b/storage/perfschema/cursor_by_account.cc
new file mode 100644
index 00000000000..91e9e3c6e54
--- /dev/null
+++ b/storage/perfschema/cursor_by_account.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/cursor_by_account.cc
+  Cursor CURSOR_BY_ACCOUNT (implementation).
+*/
+
+#include "my_global.h"
+#include "cursor_by_account.h"
+#include "pfs_user.h"
+
+cursor_by_account::cursor_by_account(const PFS_engine_table_share *share)
+  : PFS_engine_table(share, &m_pos),
+    m_pos(0), m_next_pos(0)
+{}
+
+void cursor_by_account::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int cursor_by_account::rnd_next(void)
+{
+  PFS_account *pfs;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < account_max;
+       m_pos.next())
+  {
+    pfs= &account_array[m_pos.m_index];
+    if (pfs->m_lock.is_populated())
+    {
+      make_row(pfs);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+cursor_by_account::rnd_pos(const void *pos)
+{
+  PFS_account *pfs;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index < account_max);
+  pfs= &account_array[m_pos.m_index];
+  if (pfs->m_lock.is_populated())
+  {
+    make_row(pfs);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
diff --git a/storage/perfschema/cursor_by_account.h b/storage/perfschema/cursor_by_account.h
new file mode 100644
index 00000000000..98321df5751
--- /dev/null
+++ b/storage/perfschema/cursor_by_account.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef CURSOR_BY_ACCOUNT_H
+#define CURSOR_BY_ACCOUNT_H
+
+/**
+  @file storage/perfschema/cursor_by_account.h
+  Cursor CURSOR_BY_ACCOUNT (declarations).
+*/
+
+#include "pfs_engine_table.h"
+#include "pfs_account.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** Cursor CURSOR_BY_ACCOUNT. */
+class cursor_by_account : public PFS_engine_table
+{
+public:
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  cursor_by_account(const PFS_engine_table_share *share);
+
+public:
+  ~cursor_by_account()
+  {}
+
+protected:
+  virtual void make_row(PFS_account *account)= 0;
+
+private:
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/cursor_by_host.cc b/storage/perfschema/cursor_by_host.cc
new file mode 100644
index 00000000000..f62005511bf
--- /dev/null
+++ b/storage/perfschema/cursor_by_host.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/cursor_by_host.cc
+  Cursor CURSOR_BY_HOST (implementation).
+*/
+
+#include "my_global.h"
+#include "cursor_by_host.h"
+#include "pfs_host.h"
+
+cursor_by_host::cursor_by_host(const PFS_engine_table_share *share)
+  : PFS_engine_table(share, &m_pos),
+    m_pos(0), m_next_pos(0)
+{}
+
+void cursor_by_host::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int cursor_by_host::rnd_next(void)
+{
+  PFS_host *host;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < host_max;
+       m_pos.next())
+  {
+    host= & host_array[m_pos.m_index];
+    if (host->m_lock.is_populated())
+    {
+      make_row(host);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+cursor_by_host::rnd_pos(const void *pos)
+{
+  PFS_host *pfs;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index < host_max);
+  pfs= &host_array[m_pos.m_index];
+  if (pfs->m_lock.is_populated())
+  {
+    make_row(pfs);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
diff --git a/storage/perfschema/cursor_by_host.h b/storage/perfschema/cursor_by_host.h
new file mode 100644
index 00000000000..3fbd09e3018
--- /dev/null
+++ b/storage/perfschema/cursor_by_host.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef CURSOR_BY_HOST_H
+#define CURSOR_BY_HOST_H
+
+/**
+  @file storage/perfschema/cursor_by_host.h
+  Cursor CURSOR_BY_HOST (declarations).
+*/
+
+#include "pfs_engine_table.h"
+#include "pfs_host.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** Cursor CURSOR_BY_HOST. */
+class cursor_by_host : public PFS_engine_table
+{
+public:
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  cursor_by_host(const PFS_engine_table_share *share);
+
+public:
+  ~cursor_by_host()
+  {}
+
+protected:
+  virtual void make_row(PFS_host *host)= 0;
+
+private:
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/cursor_by_thread.cc b/storage/perfschema/cursor_by_thread.cc
new file mode 100644
index 00000000000..06ee2f6cbef
--- /dev/null
+++ b/storage/perfschema/cursor_by_thread.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/cursor_by_thread.cc
+  Cursor CURSOR_BY_THREAD (implementation).
+*/
+
+#include "my_global.h"
+#include "cursor_by_thread.h"
+#include "pfs_instr.h"
+
+cursor_by_thread::cursor_by_thread(const PFS_engine_table_share *share)
+  : PFS_engine_table(share, &m_pos),
+    m_pos(0), m_next_pos(0)
+{}
+
+void cursor_by_thread::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int cursor_by_thread::rnd_next(void)
+{
+  PFS_thread *pfs;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < thread_max;
+       m_pos.next())
+  {
+    pfs= &thread_array[m_pos.m_index];
+    if (pfs->m_lock.is_populated())
+    {
+      make_row(pfs);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+cursor_by_thread::rnd_pos(const void *pos)
+{
+  PFS_thread *pfs;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index < thread_max);
+  pfs= &thread_array[m_pos.m_index];
+  if (pfs->m_lock.is_populated())
+  {
+    make_row(pfs);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
diff --git a/storage/perfschema/cursor_by_thread.h b/storage/perfschema/cursor_by_thread.h
new file mode 100644
index 00000000000..8f2edef6b7e
--- /dev/null
+++ b/storage/perfschema/cursor_by_thread.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef CURSOR_BY_THREAD_H
+#define CURSOR_BY_THREAD_H
+
+/**
+  @file storage/perfschema/cursor_by_thread.h
+  Cursor CURSOR_BY_THREAD (declarations).
+*/
+
+#include "pfs_engine_table.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** Cursor CURSOR_BY_THREAD. */
+class cursor_by_thread : public PFS_engine_table
+{
+public:
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  cursor_by_thread(const PFS_engine_table_share *share);
+
+public:
+  ~cursor_by_thread()
+  {}
+
+protected:
+  virtual void make_row(PFS_thread *thread)= 0;
+
+private:
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/cursor_by_user.cc b/storage/perfschema/cursor_by_user.cc
new file mode 100644
index 00000000000..8f8fe99f513
--- /dev/null
+++ b/storage/perfschema/cursor_by_user.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/cursor_by_user.cc
+  Cursor CURSOR_BY_USER (implementation).
+*/
+
+#include "my_global.h"
+#include "cursor_by_user.h"
+#include "pfs_user.h"
+
+cursor_by_user::cursor_by_user(const PFS_engine_table_share *share)
+  : PFS_engine_table(share, &m_pos),
+    m_pos(0), m_next_pos(0)
+{}
+
+void cursor_by_user::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int cursor_by_user::rnd_next(void)
+{
+  PFS_user *pfs;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < user_max;
+       m_pos.next())
+  {
+    pfs= &user_array[m_pos.m_index];
+    if (pfs->m_lock.is_populated())
+    {
+      make_row(pfs);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+cursor_by_user::rnd_pos(const void *pos)
+{
+  PFS_user *pfs;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index < user_max);
+  pfs= &user_array[m_pos.m_index];
+  if (pfs->m_lock.is_populated())
+  {
+    make_row(pfs);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
diff --git a/storage/perfschema/cursor_by_user.h b/storage/perfschema/cursor_by_user.h
new file mode 100644
index 00000000000..c4f9cabc2dd
--- /dev/null
+++ b/storage/perfschema/cursor_by_user.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef CURSOR_BY_USER_H
+#define CURSOR_BY_USER_H
+
+/**
+  @file storage/perfschema/cursor_by_user.h
+  Cursor CURSOR_BY_USER (declarations).
+*/
+
+#include "pfs_engine_table.h"
+#include "pfs_user.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** Cursor CURSOR_BY_USER. */
+class cursor_by_user : public PFS_engine_table
+{
+public:
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  cursor_by_user(const PFS_engine_table_share *share);
+
+public:
+  ~cursor_by_user()
+  {}
+
+protected:
+  virtual void make_row(PFS_user *user)= 0;
+
+private:
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/gen_pfs_lex_token b/storage/perfschema/gen_pfs_lex_token
new file mode 100755
index 00000000000..0b4116982bb
--- /dev/null
+++ b/storage/perfschema/gen_pfs_lex_token
diff --git a/storage/perfschema/gen_pfs_lex_token.cc b/storage/perfschema/gen_pfs_lex_token.cc
new file mode 100644
index 00000000000..b7470061de1
--- /dev/null
+++ b/storage/perfschema/gen_pfs_lex_token.cc
@@ -0,0 +1,265 @@
+/*
+   Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include <my_global.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/* We only need the tokens here */
+#define YYSTYPE_IS_DECLARED
+#include <../sql/sql_yacc.h>
+#include <lex.h>
+
+#include <welcome_copyright_notice.h> /* ORACLE_WELCOME_COPYRIGHT_NOTICE */
+
+/*
+  This is a tool used during build only,
+  so MY_MAX_TOKEN does not need to be exact,
+  only big enough to hold:
+  - 256 character terminal tokens
+  - YYNTOKENS named terminal tokens
+  from bison.
+  See also YYMAXUTOK.
+*/
+#define MY_MAX_TOKEN 1000
+struct gen_lex_token_string
+{
+  const char *m_token_string;
+  int m_token_length;
+};
+
+gen_lex_token_string compiled_token_array[MY_MAX_TOKEN];
+int max_token_seen= 0;
+
+char char_tokens[256];
+
+int tok_pfs_generic_value= 0;
+int tok_pfs_generic_value_list= 0;
+int tok_pfs_row_single_value= 0;
+int tok_pfs_row_single_value_list= 0;
+int tok_pfs_row_multiple_value= 0;
+int tok_pfs_row_multiple_value_list= 0;
+int tok_pfs_unused= 0;
+
+void set_token(int tok, const char *str)
+{
+  if (tok <= 0)
+  {
+    fprintf(stderr, "Bad token found\n");
+    exit(1);
+  }
+
+  if (tok > max_token_seen)
+  {
+    max_token_seen= tok;
+  }
+
+  if (max_token_seen >= MY_MAX_TOKEN)
+  {
+    fprintf(stderr, "Added that many new keywords ? Increase MY_MAX_TOKEN\n");
+    exit(1);
+  }
+
+  compiled_token_array[tok].m_token_string= str;
+  compiled_token_array[tok].m_token_length= strlen(str);
+}
+
+void compute_tokens()
+{
+  int tok;
+  unsigned int i;
+  char *str;
+
+  /*
+    Default value.
+  */
+  for (tok= 0; tok < MY_MAX_TOKEN; tok++)
+  {
+    compiled_token_array[tok].m_token_string= "(unknown)";
+    compiled_token_array[tok].m_token_length= 9;
+  }
+
+  /*
+    Tokens made of just one terminal character
+  */
+  for (tok=0; tok < 256; tok++)
+  {
+    str= & char_tokens[tok];
+    str[0]= (char) tok;
+    compiled_token_array[tok].m_token_string= str;
+    compiled_token_array[tok].m_token_length= 1;
+  }
+
+  max_token_seen= 255;
+
+  /*
+    String terminal tokens, used in sql_yacc.yy
+  */
+  set_token(NEG, "~");
+  set_token(TABLE_REF_PRIORITY, "TABLE_REF_PRIORITY");
+
+  /*
+    Tokens hard coded in sql_lex.cc
+  */
+
+  set_token(WITH_CUBE_SYM, "WITH CUBE");
+  set_token(WITH_ROLLUP_SYM, "WITH ROLLUP");
+  set_token(NOT2_SYM, "!");
+  set_token(OR2_SYM, "|");
+  set_token(PARAM_MARKER, "?");
+  set_token(SET_VAR, ":=");
+  set_token(UNDERSCORE_CHARSET, "(_charset)");
+  set_token(END_OF_INPUT, "");
+
+  /*
+    Values.
+    These tokens are all normalized later,
+    so this strings will never be displayed.
+  */
+  set_token(BIN_NUM, "(bin)");
+  set_token(DECIMAL_NUM, "(decimal)");
+  set_token(FLOAT_NUM, "(float)");
+  set_token(HEX_NUM, "(hex)");
+  set_token(LEX_HOSTNAME, "(hostname)");
+  set_token(LONG_NUM, "(long)");
+  set_token(NUM, "(num)");
+  set_token(TEXT_STRING, "(text)");
+  set_token(NCHAR_STRING, "(nchar)");
+  set_token(ULONGLONG_NUM, "(ulonglong)");
+
+  /*
+    Identifiers.
+  */
+  set_token(IDENT, "(id)");
+  set_token(IDENT_QUOTED, "(id_quoted)");
+
+  /*
+    Unused tokens
+  */
+  set_token(LOCATOR_SYM, "LOCATOR");
+  set_token(SERVER_OPTIONS, "SERVER_OPTIONS");
+  set_token(UDF_RETURNS_SYM, "UDF_RETURNS");
+
+  /*
+    See symbols[] in sql/lex.h
+  */
+  for (i= 0; i< sizeof(symbols)/sizeof(symbols[0]); i++)
+  {
+    set_token(symbols[i].tok, symbols[i].name);
+  }
+
+  /*
+    See sql_functions[] in sql/lex.h
+  */
+  for (i= 0; i< sizeof(sql_functions)/sizeof(sql_functions[0]); i++)
+  {
+    set_token(sql_functions[i].tok, sql_functions[i].name);
+  }
+
+  /*
+    Additional FAKE tokens,
+    used internally to normalize a digest text.
+  */
+
+  max_token_seen++;
+  tok_pfs_generic_value= max_token_seen;
+  set_token(tok_pfs_generic_value, "?");
+
+  max_token_seen++;
+  tok_pfs_generic_value_list= max_token_seen;
+  set_token(tok_pfs_generic_value_list, "?, ...");
+
+  max_token_seen++;
+  tok_pfs_row_single_value= max_token_seen;
+  set_token(tok_pfs_row_single_value, "(?)");
+
+  max_token_seen++;
+  tok_pfs_row_single_value_list= max_token_seen;
+  set_token(tok_pfs_row_single_value_list, "(?) /* , ... */");
+
+  max_token_seen++;
+  tok_pfs_row_multiple_value= max_token_seen;
+  set_token(tok_pfs_row_multiple_value, "(...)");
+
+  max_token_seen++;
+  tok_pfs_row_multiple_value_list= max_token_seen;
+  set_token(tok_pfs_row_multiple_value_list, "(...) /* , ... */");
+
+  max_token_seen++;
+  tok_pfs_unused= max_token_seen;
+  set_token(tok_pfs_unused, "UNUSED");
+}
+
+void print_tokens()
+{
+  int tok;
+
+  printf("lex_token_string lex_token_array[]=\n");
+  printf("{\n");
+  printf("/* PART 1: character tokens. */\n");
+
+  for (tok= 0; tok<256; tok++)
+  {
+    printf("/* %03d */  { \"\\x%02x\", 1},\n", tok, tok);
+  }
+
+  printf("/* PART 2: named tokens. */\n");
+
+  for (tok= 256; tok<= max_token_seen; tok++)
+  {
+    printf("/* %03d */  { \"%s\", %d},\n",
+           tok,
+           compiled_token_array[tok].m_token_string,
+           compiled_token_array[tok].m_token_length);
+  }
+
+  printf("/* DUMMY */ { \"\", 0}\n");
+  printf("};\n");
+
+  printf("/* PFS specific tokens. */\n");
+  printf("#define TOK_PFS_GENERIC_VALUE %d\n", tok_pfs_generic_value);
+  printf("#define TOK_PFS_GENERIC_VALUE_LIST %d\n", tok_pfs_generic_value_list);
+  printf("#define TOK_PFS_ROW_SINGLE_VALUE %d\n", tok_pfs_row_single_value);
+  printf("#define TOK_PFS_ROW_SINGLE_VALUE_LIST %d\n", tok_pfs_row_single_value_list);
+  printf("#define TOK_PFS_ROW_MULTIPLE_VALUE %d\n", tok_pfs_row_multiple_value);
+  printf("#define TOK_PFS_ROW_MULTIPLE_VALUE_LIST %d\n", tok_pfs_row_multiple_value_list);
+  printf("#define TOK_PFS_UNUSED %d\n", tok_pfs_unused);
+}
+
+int main(int argc,char **argv)
+{
+  puts("/*");
+  puts(ORACLE_WELCOME_COPYRIGHT_NOTICE("2011, 2012"));
+  puts("*/");
+
+  printf("/*\n");
+  printf("  This file is generated, do not edit.\n");
+  printf("  See file storage/perfschema/gen_pfs_lex_token.cc.\n");
+  printf("*/\n");
+  printf("struct lex_token_string\n");
+  printf("{\n");
+  printf("  const char *m_token_string;\n");
+  printf("  int m_token_length;\n");
+  printf("};\n");
+  printf("typedef struct lex_token_string lex_token_string;\n");
+
+  compute_tokens();
+  print_tokens();
+
+  return 0;
+}
+
diff --git a/storage/perfschema/ha_perfschema.cc b/storage/perfschema/ha_perfschema.cc
index 0fb86cfe5cd..773d822af2b 100644
--- a/storage/perfschema/ha_perfschema.cc
+++ b/storage/perfschema/ha_perfschema.cc
@@ -28,6 +28,10 @@
 #include "pfs_column_values.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
+#include "pfs_account.h"
+#include "pfs_host.h"
+#include "pfs_user.h"
+#include "pfs_account.h"
 
 #ifdef MY_ATOMIC_MODE_DUMMY
 /*
@@ -80,7 +84,8 @@ static int pfs_init_func(void *p)
   pfs_hton->show_status= pfs_show_status;
   pfs_hton->flags= HTON_ALTER_NOT_SUPPORTED |
     HTON_TEMPORARY_NOT_SUPPORTED |
-    HTON_NO_PARTITION;
+    HTON_NO_PARTITION |
+    HTON_NO_BINLOG_ROW_OPT;
 
   /*
     As long as the server implementation keeps using legacy_db_type,
@@ -125,6 +130,8 @@ static struct st_mysql_show_var pfs_status_vars[]=
     (char*) &thread_class_lost, SHOW_LONG_NOFLUSH},
   {"Performance_schema_file_classes_lost",
     (char*) &file_class_lost, SHOW_LONG_NOFLUSH},
+  {"Performance_schema_socket_classes_lost",
+    (char*) &socket_class_lost, SHOW_LONG_NOFLUSH},
   {"Performance_schema_mutex_instances_lost",
     (char*) &mutex_lost, SHOW_LONG},
   {"Performance_schema_rwlock_instances_lost",
@@ -137,6 +144,8 @@ static struct st_mysql_show_var pfs_status_vars[]=
     (char*) &file_lost, SHOW_LONG},
   {"Performance_schema_file_handles_lost",
     (char*) &file_handle_lost, SHOW_LONG},
+  {"Performance_schema_socket_instances_lost",
+    (char*) &socket_lost, SHOW_LONG},
   {"Performance_schema_locker_lost",
     (char*) &locker_lost, SHOW_LONG},
   /* table shares, can be flushed */
@@ -145,6 +154,18 @@ static struct st_mysql_show_var pfs_status_vars[]=
   /* table handles, can be flushed */
   {"Performance_schema_table_handles_lost",
     (char*) &table_lost, SHOW_LONG},
+  {"Performance_schema_hosts_lost",
+    (char*) &host_lost, SHOW_LONG},
+  {"Performance_schema_users_lost",
+    (char*) &user_lost, SHOW_LONG},
+  {"Performance_schema_accounts_lost",
+    (char*) &account_lost, SHOW_LONG},
+  {"Performance_schema_stage_classes_lost",
+    (char*) &stage_class_lost, SHOW_LONG},
+  {"Performance_schema_statement_classes_lost",
+    (char*) &statement_class_lost, SHOW_LONG},
+  {"Performance_schema_digest_lost",
+    (char*) &digest_lost, SHOW_LONG},
   {NullS, NullS, SHOW_LONG}
 };
 
@@ -217,8 +238,6 @@ int ha_perfschema::open(const char *name, int mode, uint test_if_locked)
   thr_lock_data_init(m_table_share->m_thr_lock_ptr, &m_thr_lock, NULL);
   ref_length= m_table_share->m_ref_length;
 
-  psi_open();
-
   DBUG_RETURN(0);
 }
 
@@ -229,8 +248,6 @@ int ha_perfschema::close(void)
   delete m_table;
   m_table= NULL;
 
-  psi_close();
-
   DBUG_RETURN(0);
 }
 
@@ -243,12 +260,7 @@ int ha_perfschema::write_row(uchar *buf)
   ha_statistic_increment(&SSV::ha_write_count);
   DBUG_ASSERT(m_table_share);
 
-  if (m_table_share->m_write_row)
-    result= m_table_share->m_write_row(table, buf, table->field);
-  else
-  {
-    result= HA_ERR_WRONG_COMMAND;
-  }
+  result= m_table_share->write_row(table, buf, table->field);
 
   DBUG_RETURN(result);
 }
@@ -269,10 +281,21 @@ int ha_perfschema::update_row(const uchar *old_data, uchar *new_data)
   DBUG_ENTER("ha_perfschema::update_row");
 
   DBUG_ASSERT(m_table);
+  ha_statistic_increment(&SSV::ha_update_count);
   int result= m_table->update_row(table, old_data, new_data, table->field);
   DBUG_RETURN(result);
 }
 
+int ha_perfschema::delete_row(const uchar *buf)
+{
+  DBUG_ENTER("ha_perfschema::delete_row");
+
+  DBUG_ASSERT(m_table);
+  ha_statistic_increment(&SSV::ha_delete_count);
+  int result= m_table->delete_row(table, buf, table->field);
+  DBUG_RETURN(result);
+}
+
 int ha_perfschema::rnd_init(bool scan)
 {
   int result;
@@ -287,6 +310,9 @@ int ha_perfschema::rnd_init(bool scan)
   else
     m_table->reset_position();
 
+  if (m_table != NULL)
+    m_table->rnd_init(scan);
+
   result= m_table ? 0 : HA_ERR_OUT_OF_MEM;
   DBUG_RETURN(result);
 }
@@ -305,6 +331,8 @@ int ha_perfschema::rnd_next(uchar *buf)
   DBUG_ENTER("ha_perfschema::rnd_next");
 
   DBUG_ASSERT(m_table);
+  ha_statistic_increment(&SSV::ha_read_rnd_next_count);
+
   int result= m_table->rnd_next();
   if (result == 0)
   {
@@ -329,6 +357,7 @@ int ha_perfschema::rnd_pos(uchar *buf, uchar *pos)
   DBUG_ENTER("ha_perfschema::rnd_pos");
 
   DBUG_ASSERT(m_table);
+  ha_statistic_increment(&SSV::ha_read_rnd_count);
   int result= m_table->rnd_pos(pos);
   if (result == 0)
     result= m_table->read_row(table, buf, table->field);
@@ -340,7 +369,7 @@ int ha_perfschema::info(uint flag)
   DBUG_ENTER("ha_perfschema::info");
   DBUG_ASSERT(m_table_share);
   if (flag & HA_STATUS_VARIABLE)
-    stats.records= m_table_share->m_records;
+    stats.records= m_table_share->get_row_count();
   if (flag & HA_STATUS_CONST)
     ref_length= m_table_share->m_ref_length;
   DBUG_RETURN(0);
@@ -406,6 +435,8 @@ int ha_perfschema::create(const char *name, TABLE *table_arg,
     This is not a general purpose engine.
     Failure to CREATE TABLE is the expected result.
   */
+  DBUG_PRINT("error", ("unknown table: %s.%s", table_arg->s->db.str,
+                       table_arg->s->table_name.str));
   DBUG_RETURN(HA_ERR_WRONG_COMMAND);
 }
 
diff --git a/storage/perfschema/ha_perfschema.h b/storage/perfschema/ha_perfschema.h
index 17ab601e60f..91ca83c443e 100644
--- a/storage/perfschema/ha_perfschema.h
+++ b/storage/perfschema/ha_perfschema.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -18,10 +18,6 @@
 
 #include "handler.h"                            /* class handler */
 
-#ifdef USE_PRAGMA_INTERFACE
-#pragma interface /* gcc class implementation */
-#endif
-
 /**
   @file storage/perfschema/ha_perfschema.h
   Performance schema storage engine (declarations).
@@ -32,12 +28,18 @@
 */
 struct PFS_engine_table_share;
 class PFS_engine_table;
+/** Name of the performance schema engine. */
 extern const char *pfs_engine_name;
 
 /** A handler for a PERFORMANCE_SCHEMA table. */
 class ha_perfschema : public handler
 {
 public:
+  /**
+    Create a new performance schema table handle on a table.
+    @param hton storage engine handler singleton
+    @param share table share
+  */
   ha_perfschema(handlerton *hton, TABLE_SHARE *share);
 
   ~ha_perfschema();
@@ -70,13 +72,10 @@ public:
       Without HA_FAST_KEY_READ, the optimizer reads all columns and never
       calls ::rnd_pos(), so it is guaranteed to return only thread <n>
       records.
-      We use HA_HAS_OWN_BINLOGGING to stop changes to this table to
-      be logged to slaves (as enabled performance tracking on all slaves
-      is probably not what anyone wants)
     */
     return (HA_NO_TRANSACTIONS | HA_REC_NOT_IN_SEQ | HA_NO_AUTO_INCREMENT |
             HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
-            HA_HAS_OWN_BINLOGGING | HA_NO_BLOBS);
+            HA_PRIMARY_KEY_REQUIRED_FOR_DELETE | HA_HAS_OWN_BINLOGGING);
   }
 
   /**
@@ -104,24 +103,72 @@ public:
   double scan_time(void)
   { return 1.0; }
 
+  /**
+    Open a performance schema table.
+    @param name the table to open
+    @param mode unused
+    @param test_if_locked unused
+    @return 0 on success
+  */
   int open(const char *name, int mode, uint test_if_locked);
 
+  /**
+    Close a table handle.
+    @sa open.
+  */
   int close(void);
 
+  /**
+    Write a row.
+    @param buf the row to write
+    @return 0 on success
+  */
   int write_row(uchar *buf);
 
   void use_hidden_primary_key();
 
+  /**
+    Update a row.
+    @param old_data the row old values
+    @param new_data the row new values
+    @return 0 on success
+  */
   int update_row(const uchar *old_data, uchar *new_data);
 
+  /**
+    Delete a row.
+    @param buf the row to delete
+    @return 0 on success
+  */
+  int delete_row(const uchar *buf);
+
   int rnd_init(bool scan);
 
+  /**
+    Scan end.
+    @sa rnd_init.
+  */
   int rnd_end(void);
 
+  /**
+    Iterator, fetch the next row.
+    @param[out] buf the row fetched.
+    @return 0 on success
+  */
   int rnd_next(uchar *buf);
 
+  /**
+    Iterator, fetch the row at a given position.
+    @param[out] buf the row fetched.
+    @param pos the row position
+    @return 0 on success
+  */
   int rnd_pos(uchar *buf, uchar *pos);
 
+  /**
+    Read the row current position.
+    @param record the current row
+  */
   void position(const uchar *record);
 
   int info(uint);
diff --git a/storage/perfschema/pfs.cc b/storage/perfschema/pfs.cc
index 9a6ada2f814..ccb8c77d573 100644
--- a/storage/perfschema/pfs.cc
+++ b/storage/perfschema/pfs.cc
@@ -17,18 +17,29 @@
   @file storage/perfschema/pfs.cc
   The performance schema implementation of all instruments.
 */
-
 #include "my_global.h"
+#include "thr_lock.h"
+#include "mysql/psi/psi.h"
+#include "mysql/psi/mysql_thread.h"
+#include "my_pthread.h"
+#include "sql_const.h"
 #include "pfs.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
+#include "pfs_host.h"
+#include "pfs_user.h"
+#include "pfs_account.h"
 #include "pfs_global.h"
 #include "pfs_column_values.h"
 #include "pfs_timer.h"
 #include "pfs_events_waits.h"
-
-/* Pending WL#4895 PERFORMANCE_SCHEMA Instrumenting Table IO */
-#undef HAVE_TABLE_WAIT
+#include "pfs_events_stages.h"
+#include "pfs_events_statements.h"
+#include "pfs_setup_actor.h"
+#include "pfs_setup_object.h"
+#include "sql_error.h"
+#include "sp_head.h"
+#include "pfs_digest.h"
 
 /**
   @page PAGE_PERFORMANCE_SCHEMA The Performance Schema main page
@@ -300,8 +311,7 @@
 */
 
 /**
-  @page PAGE_INSTRUMENTATION_INTERFACE
-  Performance schema: instrumentation interface page.
+  @page PAGE_INSTRUMENTATION_INTERFACE Performance schema: instrumentation interface page.
   MySQL performance schema instrumentation interface.
 
   @section INTRO Introduction
@@ -355,26 +365,14 @@
 
   For a given instrumentation point in the API, the basic coding pattern
   used is:
-  - (a) If the performance schema is not initialized, do nothing
-  - (b) If the object acted upon is not instrumented, do nothing
-  - (c) otherwise, notify the performance schema of the operation
+  - (a) notify the performance schema of the operation
   about to be performed.
+  - (b) execute the instrumented code.
+  - (c) notify the performance schema that the operation
+  is completed.
 
-  The implementation of the instrumentation interface can:
-  - decide that it is not interested by the event, and return NULL.
-  In this context, 'interested' means whether the instrumentation for
-  this object + event is turned on in the performance schema configuration
-  (the SETUP_ tables).
-  - decide that this event is to be instrumented.
-  In this case, the instrumentation returns an opaque pointer,
-  that acts as a listener.
-
-  If a listener is returned, the instrumentation point then:
-  - (d) invokes the "start" event method
-  - (e) executes the instrumented code.
-  - (f) invokes the "end" event method.
-
-  If no listener is returned, only the instrumented code (e) is invoked.
+  An opaque "locker" pointer is returned by (a), that is given to (c).
+  This pointer helps the implementation to keep context, for performances.
 
   The following code fragment is annotated to show how in detail this pattern
   in implemented, when the instrumentation is compiled in:
@@ -384,25 +382,18 @@ static inline int mysql_mutex_lock(
   mysql_mutex_t *that, myf flags, const char *src_file, uint src_line)
 {
   int result;
+  struct PSI_mutex_locker_state state;
   struct PSI_mutex_locker *locker= NULL;
 
-  ...... (a) .......... (b)
-  if (PSI_server && that->m_psi)
+  ............... (a)
+  locker= PSI_server->start_mutex_wait(&state, that->p_psi,
+                                       PSI_MUTEX_LOCK, locker, src_file, src_line);
 
-  .......................... (c)
-    if ((locker= PSI_server->get_thread_mutex_locker(that->m_psi,
-                                                     PSI_MUTEX_LOCK)))
-
-  ............... (d)
-      PSI_server->start_mutex_wait(locker, src_file, src_line);
-
-  ........ (e)
+  ............... (b)
   result= pthread_mutex_lock(&that->m_mutex);
 
-  if (locker)
-
-  ............. (f)
-    PSI_server->end_mutex_wait(locker, result);
+  ............... (c)
+  PSI_server->end_mutex_wait(locker, result);
 
   return result;
 }
@@ -416,7 +407,7 @@ static inline int mysql_mutex_lock(...)
 {
   int result;
 
-  ........ (e)
+  ............... (b)
   result= pthread_mutex_lock(&that->m_mutex);
 
   return result;
@@ -487,7 +478,7 @@ static inline int mysql_mutex_lock(...)
   Applying this function to our point P gives another point P':
 
   F_i (P):
-  P(x1, x2, ..., x{i-1}, x_i, x{i+1}, ..., x_N
+  P(x1, x2, ..., x{i-1}, x_i, x{i+1}, ..., x_N)
   --> P' (x1, x2, ..., x{i-1}, f_i(x_i), x{i+1}, ..., x_N)
 
   That function defines in fact an aggregate !
@@ -583,27 +574,17 @@ static inline int mysql_mutex_lock(...)
 
   What has all this to do with the code ?
 
-  Function composition such as F_2_to_3 o F_1_to_2 o F1 is implemented
-  as PFS_single_stat_chain, where each link in the chain represents
-  an individual F_{i}_to_{i+1} aggregation step.
-
-  A single call to aggregate_single_stat_chain() updates all the tables
-  described in the statistics chain.
-
-  @section STAT_CHAIN Statistics chains
+  Functions (or aggregates) such as F_3 are not implemented as is.
+  Instead, they are decomposed into F_2_to_3 o F_1_to_2 o F1,
+  and each intermediate aggregate is stored into an internal buffer.
+  This allows to support every F1, F2, F3 aggregates from shared
+  internal buffers, where computation already performed to compute F2
+  is reused when computing F3.
 
-  Statistics chains are only used for on the fly aggregates,
-  and are therefore all based initially on the '_CURRENT' base table that
-  contains the data recorded.
-  The following table aggregates are implemented with a statistics chain:
+  @section OBJECT_GRAPH Object graph
 
-  EVENTS_WAITS_CURRENT --> EVENTS_WAITS_SUMMARY_BY_INSTANCE
-  --> EVENTS_WAITS_SUMMARY_BY_EVENT_NAME
-
-  This relationship is between classes.
-
-  In terms of object instances, or records, this chain is implemented
-  as a flyweight.
+  In terms of object instances, or records, pointers between
+  different buffers define an object instance graph.
 
   For example, assuming the following scenario:
   - A mutex class "M" is instrumented, the instrument name
@@ -654,10 +635,413 @@ static inline int mysql_mutex_lock(...)
   This is necessary because the data the aggregate is based on is volatile,
   and can not be kept indefinitely.
 
+  With on the fly aggregates:
+  - the writer thread does all the computation
+  - the reader thread accesses the result directly
+
+  This model is to be avoided if possible, due to the overhead
+  caused when instrumenting code.
+
   @section HIGHER_LEVEL Higher level aggregates
 
-  Note: no higher level aggregate is implemented yet,
-  this section is a place holder.
+  'Higher level' aggregates are implemented on demand only.
+  The code executing a SELECT from the aggregate table is
+  collecting data from multiple internal buffers to produce the result.
+
+  With higher level aggregates:
+  - the reader thread does all the computation
+  - the writer thread has no overhead.
+
+  @section MIXED Mixed level aggregates
+
+  The 'Mixed' model is a compromise between 'On the fly' and 'Higher level'
+  aggregates, for internal buffers that are not permanent.
+
+  While an object is present in a buffer, the higher level model is used.
+  When an object is about to be destroyed, statistics are saved into
+  a 'parent' buffer with a longer life cycle, to follow the on the fly model.
+
+  With mixed aggregates:
+  - the reader thread does a lot of complex computation,
+  - the writer thread has minimal overhead, on destroy events.
+
+  @section IMPL_WAIT Implementation for waits aggregates
+
+  For waits, the tables that contains aggregated wait data are:
+  - EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME
+  - EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME
+  - EVENTS_WAITS_SUMMARY_BY_INSTANCE
+  - EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+  - EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME
+  - EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME
+  - FILE_SUMMARY_BY_EVENT_NAME
+  - FILE_SUMMARY_BY_INSTANCE
+  - SOCKET_SUMMARY_BY_INSTANCE
+  - SOCKET_SUMMARY_BY_EVENT_NAME
+  - OBJECTS_SUMMARY_GLOBAL_BY_TYPE
+
+  The instrumented code that generates waits events consist of:
+  - mutexes (mysql_mutex_t)
+  - rwlocks (mysql_rwlock_t)
+  - conditions (mysql_cond_t)
+  - file io (MYSQL_FILE)
+  - socket io (MYSQL_SOCKET)
+  - table io
+  - table lock
+
+  The flow of data between aggregates tables varies for each instrumentation.
+
+  @subsection IMPL_WAIT_MUTEX Mutex waits
+
+@verbatim
+  mutex_locker(T, M)
+   |
+   | [1]
+   |
+   |-> pfs_mutex(M)                           =====>> [B], [C]
+   |    |
+   |    | [2]
+   |    |
+   |    |-> pfs_mutex_class(M.class)          =====>> [C]
+   |
+   |-> pfs_thread(T).event_name(M)            =====>> [A], [D], [E], [F]
+        |
+        | [3]
+        |
+     3a |-> pfs_account(U, H).event_name(M)   =====>> [D], [E], [F]
+        .    |
+        .    | [4-RESET]
+        .    |
+     3b .....+-> pfs_user(U).event_name(M)    =====>> [E]
+        .    |
+     3c .....+-> pfs_host(H).event_name(M)    =====>> [F]
+@endverbatim
+
+  How to read this diagram:
+  - events that occur during the instrumented code execution are noted with numbers,
+  as in [1]. Code executed by these events has an impact on overhead.
+  - events that occur during TRUNCATE TABLE operations are noted with numbers,
+  followed by "-RESET", as in [4-RESET].
+  Code executed by these events has no impact on overhead,
+  since they are executed by independent monitoring sessions.
+  - events that occur when a reader extracts data from a performance schema table
+  are noted with letters, as in [A]. The name of the table involved,
+  and the method that builds a row are documented. Code executed by these events
+  has no impact on the instrumentation overhead. Note that the table
+  implementation may pull data from different buffers.
+  - nominal code paths are in plain lines. A "nominal" code path corresponds to
+  cases where the performance schema buffers are sized so that no records are lost.
+  - degenerated code paths are in dotted lines. A "degenerated" code path corresponds
+  to edge cases where parent buffers are full, which forces the code to aggregate to
+  grand parents directly.
+
+  Implemented as:
+  - [1] @c start_mutex_wait_v1(), @c end_mutex_wait_v1()
+  - [2] @c destroy_mutex_v1()
+  - [3] @c aggregate_thread_waits()
+  - [4] @c PFS_account::aggregate_waits()
+  - [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
+        @c table_ews_by_thread_by_event_name::make_row()
+  - [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE,
+        @c table_events_waits_summary_by_instance::make_mutex_row()
+  - [C] EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME,
+        @c table_ews_global_by_event_name::make_mutex_row()
+  - [D] EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME,
+        @c table_ews_by_account_by_event_name::make_row()
+  - [E] EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME,
+        @c table_ews_by_user_by_event_name::make_row()
+  - [F] EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME,
+        @c table_ews_by_host_by_event_name::make_row()
+
+  Table EVENTS_WAITS_SUMMARY_BY_INSTANCE is a 'on the fly' aggregate,
+  because the data is collected on the fly by (1) and stored into a buffer,
+  pfs_mutex. The table implementation [B] simply reads the results directly
+  from this buffer.
+
+  Table EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME is a 'mixed' aggregate,
+  because some data is collected on the fly (1),
+  some data is preserved with (2) at a later time in the life cycle,
+  and two different buffers pfs_mutex and pfs_mutex_class are used to store the
+  statistics collected. The table implementation [C] is more complex, since
+  it reads from two buffers pfs_mutex and pfs_mutex_class.
+
+  @subsection IMPL_WAIT_RWLOCK Rwlock waits
+
+@verbatim
+  rwlock_locker(T, R)
+   |
+   | [1]
+   |
+   |-> pfs_rwlock(R)                          =====>> [B], [C]
+   |    |
+   |    | [2]
+   |    |
+   |    |-> pfs_rwlock_class(R.class)         =====>> [C]
+   |
+   |-> pfs_thread(T).event_name(R)            =====>> [A]
+        |
+       ...
+@endverbatim
+
+  Implemented as:
+  - [1] @c start_rwlock_rdwait_v1(), @c end_rwlock_rdwait_v1(), ...
+  - [2] @c destroy_rwlock_v1()
+  - [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
+        @c table_ews_by_thread_by_event_name::make_row()
+  - [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE,
+        @c table_events_waits_summary_by_instance::make_rwlock_row()
+  - [C] EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME,
+        @c table_ews_global_by_event_name::make_rwlock_row()
+
+  @subsection IMPL_WAIT_COND Cond waits
+
+@verbatim
+  cond_locker(T, C)
+   |
+   | [1]
+   |
+   |-> pfs_cond(C)                            =====>> [B], [C]
+   |    |
+   |    | [2]
+   |    |
+   |    |-> pfs_cond_class(C.class)           =====>> [C]
+   |
+   |-> pfs_thread(T).event_name(C)            =====>> [A]
+        |
+       ...
+@endverbatim
+
+  Implemented as:
+  - [1] @c start_cond_wait_v1(), @c end_cond_wait_v1()
+  - [2] @c destroy_cond_v1()
+  - [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
+        @c table_ews_by_thread_by_event_name::make_row()
+  - [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE,
+        @c table_events_waits_summary_by_instance::make_cond_row()
+  - [C] EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME,
+        @c table_ews_global_by_event_name::make_cond_row()
+
+  @subsection IMPL_WAIT_FILE File waits
+
+@verbatim
+  file_locker(T, F)
+   |
+   | [1]
+   |
+   |-> pfs_file(F)                            =====>> [B], [C], [D], [E]
+   |    |
+   |    | [2]
+   |    |
+   |    |-> pfs_file_class(F.class)           =====>> [C], [D]
+   |
+   |-> pfs_thread(T).event_name(F)            =====>> [A]
+        |
+       ...
+@endverbatim
+
+  Implemented as:
+  - [1] @c get_thread_file_name_locker_v1(), @c start_file_wait_v1(),
+        @c end_file_wait_v1(), ...
+  - [2] @c close_file_v1()
+  - [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
+        @c table_ews_by_thread_by_event_name::make_row()
+  - [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE,
+        @c table_events_waits_summary_by_instance::make_file_row()
+  - [C] EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME,
+        @c table_ews_global_by_event_name::make_file_row()
+  - [D] FILE_SUMMARY_BY_EVENT_NAME,
+        @c table_file_summary_by_event_name::make_row()
+  - [E] FILE_SUMMARY_BY_INSTANCE,
+        @c table_file_summary_by_instance::make_row()
+
+  @subsection IMPL_WAIT_SOCKET Socket waits
+
+@verbatim
+  socket_locker(T, F)
+   |
+   | [1]
+   |
+   |-> pfs_socket(F)                            =====>> [A], [B], [C], [D], [E]
+        |
+        | [2]
+        |
+        |-> pfs_socket_class(F.class)           =====>> [C], [D]
+        |
+        |-> pfs_thread(T).event_name(F)         =====>> [A]
+        |
+        ...
+@endverbatim
+
+  Implemented as:
+  - [1] @c start_socket_wait_v1(), @c end_socket_wait_v1().
+  - [2] @c close_socket_v1()
+  - [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
+        @c table_ews_by_thread_by_event_name::make_row()
+  - [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE,
+        @c table_events_waits_summary_by_instance::make_socket_row()
+  - [C] EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME,
+        @c table_ews_global_by_event_name::make_socket_row()
+  - [D] SOCKET_SUMMARY_BY_EVENT_NAME,
+        @c table_socket_summary_by_event_name::make_row()
+  - [E] SOCKET_SUMMARY_BY_INSTANCE,
+        @c table_socket_summary_by_instance::make_row()
+
+  @subsection IMPL_WAIT_TABLE Table waits
+
+@verbatim
+  table_locker(T, Tb)
+   |
+   | [1]
+   |
+   |-> pfs_table(Tb)                          =====>> [B], [C], [D]
+        |
+        | [2]
+        |
+        |-> pfs_table_share(Tb.share)         =====>> [C], [D]
+        |
+        |-> pfs_thread(T).event_name(Tb)      =====>> [A]
+             |
+            ...
+@endverbatim
+
+  Implemented as:
+  - [1] @c start_table_io_wait_v1(), @c end_table_io_wait_v1()
+  - [2] @c close_table_v1()
+  - [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
+        @c table_ews_by_thread_by_event_name::make_row()
+  - [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE,
+        @c table_events_waits_summary_by_instance::make_table_row()
+  - [C] EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME,
+        @c table_ews_global_by_event_name::make_table_io_row(),
+        @c table_ews_global_by_event_name::make_table_lock_row()
+  - [D] OBJECTS_SUMMARY_GLOBAL_BY_TYPE,
+        @c table_os_global_by_type::make_row()
+
+  @section IMPL_STAGE Implementation for stages aggregates
+
+  For stages, the tables that contains aggregated data are:
+  - EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME
+  - EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME
+  - EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME
+  - EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME
+  - EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME
+
+@verbatim
+  start_stage(T, S)
+   |
+   | [1]
+   |
+1a |-> pfs_thread(T).event_name(S)            =====>> [A], [B], [C], [D], [E]
+   |    |
+   |    | [2]
+   |    |
+   | 2a |-> pfs_account(U, H).event_name(S)   =====>> [B], [C], [D], [E]
+   |    .    |
+   |    .    | [3-RESET]
+   |    .    |
+   | 2b .....+-> pfs_user(U).event_name(S)    =====>> [C]
+   |    .    |
+   | 2c .....+-> pfs_host(H).event_name(S)    =====>> [D], [E]
+   |    .    .    |
+   |    .    .    | [4-RESET]
+   | 2d .    .    |
+1b |----+----+----+-> pfs_stage_class(S)      =====>> [E]
+
+@endverbatim
+
+  Implemented as:
+  - [1] @c start_stage_v1()
+  - [2] @c delete_thread_v1(), @c aggregate_thread_stages()
+  - [3] @c PFS_account::aggregate_stages()
+  - [4] @c PFS_host::aggregate_stages()
+  - [A] EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME,
+        @c table_esgs_by_thread_by_event_name::make_row()
+  - [B] EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME,
+        @c table_esgs_by_account_by_event_name::make_row()
+  - [C] EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME,
+        @c table_esgs_by_user_by_event_name::make_row()
+  - [D] EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME,
+        @c table_esgs_by_host_by_event_name::make_row()
+  - [E] EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME,
+        @c table_esgs_global_by_event_name::make_row()
+
+@section IMPL_STATEMENT Implementation for statements consumers
+
+  For statements, the tables that contains individual event data are:
+  - EVENTS_STATEMENTS_CURRENT
+  - EVENTS_STATEMENTS_HISTORY
+  - EVENTS_STATEMENTS_HISTORY_LONG
+
+  For statements, the tables that contains aggregated data are:
+  - EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME
+  - EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME
+  - EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+  - EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME
+  - EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME
+  - EVENTS_STATEMENTS_SUMMARY_BY_DIGEST
+
+@verbatim
+  statement_locker(T, S)
+   |
+   | [1]
+   |
+1a |-> pfs_thread(T).event_name(S)            =====>> [A], [B], [C], [D], [E]
+   |    |
+   |    | [2]
+   |    |
+   | 2a |-> pfs_account(U, H).event_name(S)   =====>> [B], [C], [D], [E]
+   |    .    |
+   |    .    | [3-RESET]
+   |    .    |
+   | 2b .....+-> pfs_user(U).event_name(S)    =====>> [C]
+   |    .    |
+   | 2c .....+-> pfs_host(H).event_name(S)    =====>> [D], [E]
+   |    .    .    |
+   |    .    .    | [4-RESET]
+   | 2d .    .    |
+1b |----+----+----+-> pfs_statement_class(S)  =====>> [E]
+   |
+1c |-> pfs_thread(T).statement_current(S)     =====>> [F]
+   |
+1d |-> pfs_thread(T).statement_history(S)     =====>> [G]
+   |
+1e |-> statement_history_long(S)              =====>> [H]
+   |
+1f |-> statement_digest(S)                    =====>> [I]
+
+@endverbatim
+
+  Implemented as:
+  - [1] @c start_statement_v1(), end_statement_v1()
+       (1a, 1b) is an aggregation by EVENT_NAME,
+        (1c, 1d, 1e) is an aggregation by TIME,
+        (1f) is an aggregation by DIGEST
+        all of these are orthogonal,
+        and implemented in end_statement_v1().
+  - [2] @c delete_thread_v1(), @c aggregate_thread_statements()
+  - [3] @c PFS_account::aggregate_statements()
+  - [4] @c PFS_host::aggregate_statements()
+  - [A] EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
+        @c table_esms_by_thread_by_event_name::make_row()
+  - [B] EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME,
+        @c table_esms_by_account_by_event_name::make_row()
+  - [C] EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME,
+        @c table_esms_by_user_by_event_name::make_row()
+  - [D] EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME,
+        @c table_esms_by_host_by_event_name::make_row()
+  - [E] EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME,
+        @c table_esms_global_by_event_name::make_row()
+  - [F] EVENTS_STATEMENTS_CURRENT,
+        @c table_events_statements_current::rnd_next(),
+        @c table_events_statements_common::make_row()
+  - [G] EVENTS_STATEMENTS_HISTORY,
+        @c table_events_statements_history::rnd_next(),
+        @c table_events_statements_common::make_row()
+  - [H] EVENTS_STATEMENTS_HISTORY_LONG,
+        @c table_events_statements_history_long::rnd_next(),
+        @c table_events_statements_common::make_row()
+  - [I] EVENTS_STATEMENTS_SUMMARY_BY_DIGEST
+        @c table_esms_by_digest::make_row()
 */
 
 /**
@@ -676,12 +1060,20 @@ static inline int mysql_mutex_lock(...)
 pthread_key(PFS_thread*, THR_PFS);
 bool THR_PFS_initialized= false;
 
+/**
+  Conversion map from PSI_mutex_operation to enum_operation_type.
+  Indexed by enum PSI_mutex_operation.
+*/
 static enum_operation_type mutex_operation_map[]=
 {
   OPERATION_TYPE_LOCK,
   OPERATION_TYPE_TRYLOCK
 };
 
+/**
+  Conversion map from PSI_rwlock_operation to enum_operation_type.
+  Indexed by enum PSI_rwlock_operation.
+*/
 static enum_operation_type rwlock_operation_map[]=
 {
   OPERATION_TYPE_READLOCK,
@@ -690,6 +1082,10 @@ static enum_operation_type rwlock_operation_map[]=
   OPERATION_TYPE_TRYWRITELOCK
 };
 
+/**
+  Conversion map from PSI_cond_operation to enum_operation_type.
+  Indexed by enum PSI_cond_operation.
+*/
 static enum_operation_type cond_operation_map[]=
 {
   OPERATION_TYPE_WAIT,
@@ -722,6 +1118,60 @@ static enum_operation_type file_operation_map[]=
 };
 
 /**
+  Conversion map from PSI_table_operation to enum_operation_type.
+  Indexed by enum PSI_table_io_operation.
+*/
+static enum_operation_type table_io_operation_map[]=
+{
+  OPERATION_TYPE_TABLE_FETCH,
+  OPERATION_TYPE_TABLE_WRITE_ROW,
+  OPERATION_TYPE_TABLE_UPDATE_ROW,
+  OPERATION_TYPE_TABLE_DELETE_ROW
+};
+
+/**
+  Conversion map from enum PFS_TL_LOCK_TYPE to enum_operation_type.
+  Indexed by enum PFS_TL_LOCK_TYPE.
+*/
+static enum_operation_type table_lock_operation_map[]=
+{
+  OPERATION_TYPE_TL_READ_NORMAL, /* PFS_TL_READ */
+  OPERATION_TYPE_TL_READ_WITH_SHARED_LOCKS, /* PFS_TL_READ_WITH_SHARED_LOCKS */
+  OPERATION_TYPE_TL_READ_HIGH_PRIORITY, /* PFS_TL_READ_HIGH_PRIORITY */
+  OPERATION_TYPE_TL_READ_NO_INSERTS, /* PFS_TL_READ_NO_INSERT */
+  OPERATION_TYPE_TL_WRITE_ALLOW_WRITE, /* PFS_TL_WRITE_ALLOW_WRITE */
+  OPERATION_TYPE_TL_WRITE_CONCURRENT_INSERT, /* PFS_TL_WRITE_CONCURRENT_INSERT */
+  OPERATION_TYPE_TL_WRITE_DELAYED, /* PFS_TL_WRITE_DELAYED */
+  OPERATION_TYPE_TL_WRITE_LOW_PRIORITY, /* PFS_TL_WRITE_LOW_PRIORITY */
+  OPERATION_TYPE_TL_WRITE_NORMAL, /* PFS_TL_WRITE */
+  OPERATION_TYPE_TL_READ_EXTERNAL, /* PFS_TL_READ_EXTERNAL */
+  OPERATION_TYPE_TL_WRITE_EXTERNAL /* PFS_TL_WRITE_EXTERNAL */
+};
+
+/**
+  Conversion map from PSI_socket_operation to enum_operation_type.
+  Indexed by enum PSI_socket_operation.
+*/
+static enum_operation_type socket_operation_map[]=
+{
+  OPERATION_TYPE_SOCKETCREATE,
+  OPERATION_TYPE_SOCKETCONNECT,
+  OPERATION_TYPE_SOCKETBIND,
+  OPERATION_TYPE_SOCKETCLOSE,
+  OPERATION_TYPE_SOCKETSEND,
+  OPERATION_TYPE_SOCKETRECV,
+  OPERATION_TYPE_SOCKETSENDTO,
+  OPERATION_TYPE_SOCKETRECVFROM,
+  OPERATION_TYPE_SOCKETSENDMSG,
+  OPERATION_TYPE_SOCKETRECVMSG,
+  OPERATION_TYPE_SOCKETSEEK,
+  OPERATION_TYPE_SOCKETOPT,
+  OPERATION_TYPE_SOCKETSTAT,
+  OPERATION_TYPE_SOCKETSHUTDOWN,
+  OPERATION_TYPE_SOCKETSELECT
+};
+
+/**
   Build the prefix name of a class of instruments in a category.
   For example, this function builds the string 'wait/sync/mutex/sql/' from
   a prefix 'wait/sync/mutex' and a category 'sql'.
@@ -810,6 +1260,10 @@ static int build_prefix(const LEX_STRING *prefix, const char *category,
 
 C_MODE_START
 
+/**
+  Implementation of the mutex instrumentation interface.
+  @sa PSI_v1::register_mutex.
+*/
 static void register_mutex_v1(const char *category,
                               PSI_mutex_info_v1 *info,
                               int count)
@@ -819,6 +1273,10 @@ static void register_mutex_v1(const char *category,
                    register_mutex_class)
 }
 
+/**
+  Implementation of the rwlock instrumentation interface.
+  @sa PSI_v1::register_rwlock.
+*/
 static void register_rwlock_v1(const char *category,
                                PSI_rwlock_info_v1 *info,
                                int count)
@@ -828,6 +1286,10 @@ static void register_rwlock_v1(const char *category,
                    register_rwlock_class)
 }
 
+/**
+  Implementation of the cond instrumentation interface.
+  @sa PSI_v1::register_cond.
+*/
 static void register_cond_v1(const char *category,
                              PSI_cond_info_v1 *info,
                              int count)
@@ -837,6 +1299,10 @@ static void register_cond_v1(const char *category,
                    register_cond_class)
 }
 
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::register_thread.
+*/
 static void register_thread_v1(const char *category,
                                PSI_thread_info_v1 *info,
                                int count)
@@ -846,6 +1312,10 @@ static void register_thread_v1(const char *category,
                    register_thread_class)
 }
 
+/**
+  Implementation of the file instrumentation interface.
+  @sa PSI_v1::register_file.
+*/
 static void register_file_v1(const char *category,
                              PSI_file_info_v1 *info,
                              int count)
@@ -855,14 +1325,100 @@ static void register_file_v1(const char *category,
                    register_file_class)
 }
 
+static void register_stage_v1(const char *category,
+                              PSI_stage_info_v1 **info_array,
+                              int count)
+{
+  char formatted_name[PFS_MAX_INFO_NAME_LENGTH];
+  int prefix_length;
+  int len;
+  int full_length;
+  PSI_stage_info_v1 *info;
+
+  DBUG_ASSERT(category != NULL);
+  DBUG_ASSERT(info_array != NULL);
+  if (unlikely(build_prefix(&stage_instrument_prefix, category,
+               formatted_name, &prefix_length)))
+  {
+    for (; count>0; count--, info_array++)
+      (*info_array)->m_key= 0;
+    return ;
+  }
+
+  for (; count>0; count--, info_array++)
+  {
+    info= *info_array;
+    DBUG_ASSERT(info != NULL);
+    DBUG_ASSERT(info->m_name != NULL);
+    len= strlen(info->m_name);
+    full_length= prefix_length + len;
+    if (likely(full_length <= PFS_MAX_INFO_NAME_LENGTH))
+    {
+      memcpy(formatted_name + prefix_length, info->m_name, len);
+      info->m_key= register_stage_class(formatted_name, full_length,
+                                        info->m_flags);
+    }
+    else
+    {
+      pfs_print_error("register_stage_v1: name too long <%s> <%s>\n",
+                      category, info->m_name);
+      info->m_key= 0;
+    }
+  }
+  return;
+}
+
+static void register_statement_v1(const char *category,
+                                  PSI_statement_info_v1 *info,
+                                  int count)
+{
+  char formatted_name[PFS_MAX_INFO_NAME_LENGTH];
+  int prefix_length;
+  int len;
+  int full_length;
+
+  DBUG_ASSERT(category != NULL);
+  DBUG_ASSERT(info != NULL);
+  if (unlikely(build_prefix(&statement_instrument_prefix,
+                            category, formatted_name, &prefix_length)))
+  {
+    for (; count>0; count--, info++)
+      info->m_key= 0;
+    return ;
+  }
+
+  for (; count>0; count--, info++)
+  {
+    DBUG_ASSERT(info->m_name != NULL);
+    len= strlen(info->m_name);
+    full_length= prefix_length + len;
+    if (likely(full_length <= PFS_MAX_INFO_NAME_LENGTH))
+    {
+      memcpy(formatted_name + prefix_length, info->m_name, len);
+      info->m_key= register_statement_class(formatted_name, full_length, info->m_flags);
+    }
+    else
+    {
+      pfs_print_error("register_statement_v1: name too long <%s>\n",
+                      info->m_name);
+      info->m_key= 0;
+    }
+  }
+  return;
+}
+
+static void register_socket_v1(const char *category,
+                             PSI_socket_info_v1 *info,
+                             int count)
+{
+  REGISTER_BODY_V1(PSI_socket_key,
+                   socket_instrument_prefix,
+                   register_socket_class)
+}
+
 #define INIT_BODY_V1(T, KEY, ID)                                            \
   PFS_##T##_class *klass;                                                   \
   PFS_##T *pfs;                                                             \
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); \
-  if (unlikely(pfs_thread == NULL))                                         \
-    return NULL;                                                            \
-  if (! pfs_thread->m_enabled)                                              \
-    return NULL;                                                            \
   klass= find_##T##_class(KEY);                                             \
   if (unlikely(klass == NULL))                                              \
     return NULL;                                                            \
@@ -871,111 +1427,320 @@ static void register_file_v1(const char *category,
   pfs= create_##T(klass, ID);                                               \
   return reinterpret_cast<PSI_##T *> (pfs)
 
+/**
+  Implementation of the mutex instrumentation interface.
+  @sa PSI_v1::init_mutex.
+*/
 static PSI_mutex*
 init_mutex_v1(PSI_mutex_key key, const void *identity)
 {
   INIT_BODY_V1(mutex, key, identity);
 }
 
+/**
+  Implementation of the mutex instrumentation interface.
+  @sa PSI_v1::destroy_mutex.
+*/
 static void destroy_mutex_v1(PSI_mutex* mutex)
 {
   PFS_mutex *pfs= reinterpret_cast<PFS_mutex*> (mutex);
+
+  DBUG_ASSERT(pfs != NULL);
+
   destroy_mutex(pfs);
 }
 
+/**
+  Implementation of the rwlock instrumentation interface.
+  @sa PSI_v1::init_rwlock.
+*/
 static PSI_rwlock*
 init_rwlock_v1(PSI_rwlock_key key, const void *identity)
 {
   INIT_BODY_V1(rwlock, key, identity);
 }
 
+/**
+  Implementation of the rwlock instrumentation interface.
+  @sa PSI_v1::destroy_rwlock.
+*/
 static void destroy_rwlock_v1(PSI_rwlock* rwlock)
 {
   PFS_rwlock *pfs= reinterpret_cast<PFS_rwlock*> (rwlock);
+
+  DBUG_ASSERT(pfs != NULL);
+
   destroy_rwlock(pfs);
 }
 
+/**
+  Implementation of the cond instrumentation interface.
+  @sa PSI_v1::init_cond.
+*/
 static PSI_cond*
 init_cond_v1(PSI_cond_key key, const void *identity)
 {
   INIT_BODY_V1(cond, key, identity);
 }
 
+/**
+  Implementation of the cond instrumentation interface.
+  @sa PSI_v1::destroy_cond.
+*/
 static void destroy_cond_v1(PSI_cond* cond)
 {
   PFS_cond *pfs= reinterpret_cast<PFS_cond*> (cond);
+
+  DBUG_ASSERT(pfs != NULL);
+
   destroy_cond(pfs);
 }
 
+/**
+  Implementation of the table instrumentation interface.
+  @sa PSI_v1::get_table_share.
+*/
 static PSI_table_share*
-get_table_share_v1(const char *schema_name, int schema_name_length,
-                   const char *table_name, int table_name_length,
-                   const void *identity)
+get_table_share_v1(my_bool temporary, TABLE_SHARE *share)
 {
-#ifdef HAVE_TABLE_WAIT
+  /* Ignore temporary tables and views. */
+  if (temporary || share->is_view)
+    return NULL;
+  /* An instrumented thread is required, for LF_PINS. */
   PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
   if (unlikely(pfs_thread == NULL))
     return NULL;
-  PFS_table_share* share;
-  share= find_or_create_table_share(pfs_thread,
-                                    schema_name, schema_name_length,
-                                    table_name, table_name_length);
-  return reinterpret_cast<PSI_table_share*> (share);
-#else
-  return NULL;
-#endif
+  PFS_table_share* pfs_share;
+  pfs_share= find_or_create_table_share(pfs_thread, temporary, share);
+  return reinterpret_cast<PSI_table_share*> (pfs_share);
 }
 
+/**
+  Implementation of the table instrumentation interface.
+  @sa PSI_v1::release_table_share.
+*/
 static void release_table_share_v1(PSI_table_share* share)
 {
-  /*
-    To be implemented by WL#4895 PERFORMANCE_SCHEMA Instrumenting Table IO.
-  */
+  PFS_table_share* pfs= reinterpret_cast<PFS_table_share*> (share);
+
+  if (unlikely(pfs == NULL))
+    return;
+
+  release_table_share(pfs);
 }
 
+/**
+  Implementation of the table instrumentation interface.
+  @sa PSI_v1::drop_table_share.
+*/
+static void
+drop_table_share_v1(my_bool temporary,
+                    const char *schema_name, int schema_name_length,
+                    const char *table_name, int table_name_length)
+{
+  /* Ignore temporary tables. */
+  if (temporary)
+    return;
+  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  if (unlikely(pfs_thread == NULL))
+    return;
+  /* TODO: temporary tables */
+  drop_table_share(pfs_thread, temporary, schema_name, schema_name_length,
+                   table_name, table_name_length);
+}
+
+/**
+  Implementation of the table instrumentation interface.
+  @sa PSI_v1::open_table.
+*/
 static PSI_table*
 open_table_v1(PSI_table_share *share, const void *identity)
 {
-  PFS_table_share *pfs_table_share=
-    reinterpret_cast<PFS_table_share*> (share);
-  PFS_table *pfs_table;
-  DBUG_ASSERT(pfs_table_share);
-  pfs_table= create_table(pfs_table_share, identity);
+  PFS_table_share *pfs_table_share= reinterpret_cast<PFS_table_share*> (share);
+
+  if (unlikely(pfs_table_share == NULL))
+    return NULL;
+
+  /* This object is not to be instrumented. */
+  if (! pfs_table_share->m_enabled)
+    return NULL;
+
+  /* This object is instrumented, but all table instruments are disabled. */
+  if (! global_table_io_class.m_enabled && ! global_table_lock_class.m_enabled)
+    return NULL;
+
+  /*
+    When the performance schema is off, do not instrument anything.
+    Table handles have short life cycle, instrumentation will happen
+    again if needed during the next open().
+  */
+  if (! flag_global_instrumentation)
+    return NULL;
+
+  PFS_thread *thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  if (unlikely(thread == NULL))
+    return NULL;
+
+  PFS_table *pfs_table= create_table(pfs_table_share, thread, identity);
   return reinterpret_cast<PSI_table *> (pfs_table);
 }
 
+/**
+  Implementation of the table instrumentation interface.
+  @sa PSI_v1::unbind_table.
+*/
+static void unbind_table_v1(PSI_table *table)
+{
+  PFS_table *pfs= reinterpret_cast<PFS_table*> (table);
+  if (likely(pfs != NULL))
+  {
+    pfs->aggregate();
+    pfs->m_thread_owner= NULL;
+  }
+}
+
+/**
+  Implementation of the table instrumentation interface.
+  @sa PSI_v1::rebind_table.
+*/
+static PSI_table *
+rebind_table_v1(PSI_table_share *share, const void *identity, PSI_table *table)
+{
+  PFS_table *pfs= reinterpret_cast<PFS_table*> (table);
+  if (likely(pfs != NULL))
+  {
+    PFS_thread *thread;
+    DBUG_ASSERT(pfs->m_thread_owner == NULL);
+
+    /* The table handle was already instrumented, reuse it for this thread. */
+    thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+
+    if (unlikely(thread == NULL))
+    {
+      destroy_table(pfs);
+      return NULL;
+    }
+
+    if (unlikely(! pfs->m_share->m_enabled))
+    {
+      destroy_table(pfs);
+      return NULL;
+    }
+
+    if (unlikely(! global_table_io_class.m_enabled && ! global_table_lock_class.m_enabled))
+    {
+      destroy_table(pfs);
+      return NULL;
+    }
+
+    if (unlikely(! flag_global_instrumentation))
+    {
+      destroy_table(pfs);
+      return NULL;
+    }
+
+    pfs->m_thread_owner= thread;
+    return table;
+  }
+
+  /* See open_table_v1() */
+
+  PFS_table_share *pfs_table_share= reinterpret_cast<PFS_table_share*> (share);
+
+  if (unlikely(pfs_table_share == NULL))
+    return NULL;
+
+  if (! pfs_table_share->m_enabled)
+    return NULL;
+
+  if (! global_table_io_class.m_enabled && ! global_table_lock_class.m_enabled)
+    return NULL;
+
+  if (! flag_global_instrumentation)
+    return NULL;
+
+  PFS_thread *thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  if (unlikely(thread == NULL))
+    return NULL;
+
+  PFS_table *pfs_table= create_table(pfs_table_share, thread, identity);
+  return reinterpret_cast<PSI_table *> (pfs_table);
+}
+
+/**
+  Implementation of the table instrumentation interface.
+  @sa PSI_v1::close_table.
+*/
 static void close_table_v1(PSI_table *table)
 {
   PFS_table *pfs= reinterpret_cast<PFS_table*> (table);
-  DBUG_ASSERT(pfs);
+  if (unlikely(pfs == NULL))
+    return;
+  pfs->aggregate();
   destroy_table(pfs);
 }
 
+static PSI_socket*
+init_socket_v1(PSI_socket_key key, const my_socket *fd)
+{
+  INIT_BODY_V1(socket, key, fd);
+}
+
+static void destroy_socket_v1(PSI_socket *socket)
+{
+  PFS_socket *pfs= reinterpret_cast<PFS_socket*> (socket);
+
+  DBUG_ASSERT(pfs != NULL);
+
+  destroy_socket(pfs);
+}
+
+/**
+  Implementation of the file instrumentation interface.
+  @sa PSI_v1::create_file.
+*/
 static void create_file_v1(PSI_file_key key, const char *name, File file)
 {
+  if (! flag_global_instrumentation)
+    return;
   int index= (int) file;
   if (unlikely(index < 0))
     return;
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
-  if (unlikely(pfs_thread == NULL))
-    return;
-  if (! pfs_thread->m_enabled)
-    return;
   PFS_file_class *klass= find_file_class(key);
   if (unlikely(klass == NULL))
     return;
   if (! klass->m_enabled)
     return;
-  if (likely(index < file_handle_max))
+
+  /* A thread is needed for LF_PINS */
+  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  if (unlikely(pfs_thread == NULL))
+    return;
+
+  if (flag_thread_instrumentation && ! pfs_thread->m_enabled)
+    return;
+
+  /*
+    We want this check after pfs_thread->m_enabled,
+    to avoid reporting false loss.
+  */
+  if (unlikely(index >= file_handle_max))
   {
-    uint len= strlen(name);
-    PFS_file *pfs= find_or_create_file(pfs_thread, klass, name, len);
-    file_handle_array[index]= pfs;
-  }
-  else
     file_handle_lost++;
+    return;
+  }
+
+  uint len= strlen(name);
+  PFS_file *pfs_file= find_or_create_file(pfs_thread, klass, name, len);
+
+  file_handle_array[index]= pfs_file;
 }
 
+/**
+  Arguments given from a parent to a child thread, packaged in one structure.
+  This data is used when spawning a new instrumented thread.
+  @sa pfs_spawn_thread.
+*/
 struct PFS_spawn_thread_arg
 {
   PFS_thread *m_parent_thread;
@@ -996,9 +1761,29 @@ void* pfs_spawn_thread(void *arg)
   /* First, attach instrumentation to this newly created pthread. */
   PFS_thread_class *klass= find_thread_class(typed_arg->m_child_key);
   if (likely(klass != NULL))
+  {
     pfs= create_thread(klass, typed_arg->m_child_identity, 0);
+    if (likely(pfs != NULL))
+    {
+      PFS_thread *parent= typed_arg->m_parent_thread;
+
+      clear_thread_account(pfs);
+
+      pfs->m_parent_thread_internal_id= parent->m_thread_internal_id;
+
+      memcpy(pfs->m_username, parent->m_username, sizeof(pfs->m_username));
+      pfs->m_username_length= parent->m_username_length;
+
+      memcpy(pfs->m_hostname, parent->m_hostname, sizeof(pfs->m_hostname));
+      pfs->m_hostname_length= parent->m_hostname_length;
+
+      set_thread_account(pfs);
+    }
+  }
   else
+  {
     pfs= NULL;
+  }
   my_pthread_setspecific_ptr(THR_PFS, pfs);
 
   /*
@@ -1017,6 +1802,10 @@ void* pfs_spawn_thread(void *arg)
   return NULL;
 }
 
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::spawn_thread.
+*/
 static int spawn_thread_v1(PSI_thread_key key,
                            pthread_t *thread, const pthread_attr_t *attr,
                            void *(*start_routine)(void*), void *arg)
@@ -1041,6 +1830,10 @@ static int spawn_thread_v1(PSI_thread_key key,
   return result;
 }
 
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::new_thread.
+*/
 static PSI_thread*
 new_thread_v1(PSI_thread_key key, const void *identity, ulong thread_id)
 {
@@ -1055,13 +1848,22 @@ new_thread_v1(PSI_thread_key key, const void *identity, ulong thread_id)
   return reinterpret_cast<PSI_thread*> (pfs);
 }
 
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::set_thread_id.
+*/
 static void set_thread_id_v1(PSI_thread *thread, unsigned long id)
 {
-  DBUG_ASSERT(thread);
   PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread);
+  if (unlikely(pfs == NULL))
+    return;
   pfs->m_thread_id= id;
 }
 
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::get_thread_id.
+*/
 static PSI_thread*
 get_thread_v1(void)
 {
@@ -1069,124 +1871,448 @@ get_thread_v1(void)
   return reinterpret_cast<PSI_thread*> (pfs);
 }
 
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::set_thread_user.
+*/
+static void set_thread_user_v1(const char *user, int user_len)
+{
+  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+
+  DBUG_ASSERT((user != NULL) || (user_len == 0));
+  DBUG_ASSERT(user_len >= 0);
+  DBUG_ASSERT((uint) user_len <= sizeof(pfs->m_username));
+
+  if (unlikely(pfs == NULL))
+    return;
+
+  aggregate_thread(pfs);
+
+  pfs->m_lock.allocated_to_dirty();
+
+  clear_thread_account(pfs);
+
+  if (user_len > 0)
+    memcpy(pfs->m_username, user, user_len);
+  pfs->m_username_length= user_len;
+
+  set_thread_account(pfs);
+
+  bool enabled= true;
+  if (flag_thread_instrumentation)
+  {
+    if ((pfs->m_username_length > 0) && (pfs->m_hostname_length > 0))
+    {
+      /*
+        TODO: performance improvement.
+        Once performance_schema.USERS is exposed,
+        we can use PFS_user::m_enabled instead of looking up
+        SETUP_ACTORS every time.
+      */
+      lookup_setup_actor(pfs,
+                         pfs->m_username, pfs->m_username_length,
+                         pfs->m_hostname, pfs->m_hostname_length,
+                         &enabled);
+    }
+  }
+
+  pfs->m_enabled= enabled;
+
+  pfs->m_lock.dirty_to_allocated();
+}
+
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::set_thread_account.
+*/
+static void set_thread_account_v1(const char *user, int user_len,
+                                    const char *host, int host_len)
+{
+  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+
+  DBUG_ASSERT((user != NULL) || (user_len == 0));
+  DBUG_ASSERT(user_len >= 0);
+  DBUG_ASSERT((uint) user_len <= sizeof(pfs->m_username));
+  DBUG_ASSERT((host != NULL) || (host_len == 0));
+  DBUG_ASSERT(host_len >= 0);
+  DBUG_ASSERT((uint) host_len <= sizeof(pfs->m_hostname));
+
+  if (unlikely(pfs == NULL))
+    return;
+
+  pfs->m_lock.allocated_to_dirty();
+
+  clear_thread_account(pfs);
+
+  if (host_len > 0)
+    memcpy(pfs->m_hostname, host, host_len);
+  pfs->m_hostname_length= host_len;
+
+  if (user_len > 0)
+    memcpy(pfs->m_username, user, user_len);
+  pfs->m_username_length= user_len;
+
+  set_thread_account(pfs);
+
+  bool enabled= true;
+  if (flag_thread_instrumentation)
+  {
+    if ((pfs->m_username_length > 0) && (pfs->m_hostname_length > 0))
+    {
+      /*
+        TODO: performance improvement.
+        Once performance_schema.USERS is exposed,
+        we can use PFS_user::m_enabled instead of looking up
+        SETUP_ACTORS every time.
+      */
+      lookup_setup_actor(pfs,
+                         pfs->m_username, pfs->m_username_length,
+                         pfs->m_hostname, pfs->m_hostname_length,
+                         &enabled);
+    }
+  }
+  pfs->m_enabled= enabled;
+
+  pfs->m_lock.dirty_to_allocated();
+}
+
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::set_thread_db.
+*/
+static void set_thread_db_v1(const char* db, int db_len)
+{
+  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+
+  DBUG_ASSERT((db != NULL) || (db_len == 0));
+  DBUG_ASSERT(db_len >= 0);
+  DBUG_ASSERT((uint) db_len <= sizeof(pfs->m_dbname));
+
+  if (likely(pfs != NULL))
+  {
+    pfs->m_lock.allocated_to_dirty();
+    if (db_len > 0)
+      memcpy(pfs->m_dbname, db, db_len);
+    pfs->m_dbname_length= db_len;
+    pfs->m_lock.dirty_to_allocated();
+  }
+}
+
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::set_thread_command.
+*/
+static void set_thread_command_v1(int command)
+{
+  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+
+  DBUG_ASSERT(command >= 0);
+  DBUG_ASSERT(command <= (int) COM_END);
+
+  if (likely(pfs != NULL))
+  {
+    pfs->m_lock.allocated_to_dirty();
+    pfs->m_command= command;
+    pfs->m_lock.dirty_to_allocated();
+  }
+}
+
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::set_thread_start_time.
+*/
+static void set_thread_start_time_v1(time_t start_time)
+{
+  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+
+  if (likely(pfs != NULL))
+  {
+    pfs->m_lock.allocated_to_dirty();
+    pfs->m_start_time= start_time;
+    pfs->m_lock.dirty_to_allocated();
+  }
+}
+
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::set_thread_state.
+*/
+static void set_thread_state_v1(const char* state)
+{
+  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+
+  if (likely(pfs != NULL))
+  {
+    int state_len= state ? strlen(state) : 0;
+
+    pfs->m_lock.allocated_to_dirty();
+    pfs->m_processlist_state_ptr= state;
+    pfs->m_processlist_state_length= state_len;
+    pfs->m_lock.dirty_to_allocated();
+  }
+}
+
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::set_thread_info.
+*/
+static void set_thread_info_v1(const char* info, int info_len)
+{
+  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+
+  if (likely(pfs != NULL))
+  {
+    pfs->m_lock.allocated_to_dirty();
+    pfs->m_processlist_info_ptr= info;
+    pfs->m_processlist_info_length= info_len;
+    pfs->m_lock.dirty_to_allocated();
+  }
+}
+
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::set_thread.
+*/
 static void set_thread_v1(PSI_thread* thread)
 {
   PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread);
   my_pthread_setspecific_ptr(THR_PFS, pfs);
 }
 
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::delete_current_thread.
+*/
 static void delete_current_thread_v1(void)
 {
   PFS_thread *thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
   if (thread != NULL)
   {
+    aggregate_thread(thread);
     my_pthread_setspecific_ptr(THR_PFS, NULL);
     destroy_thread(thread);
   }
 }
 
+/**
+  Implementation of the thread instrumentation interface.
+  @sa PSI_v1::delete_thread.
+*/
 static void delete_thread_v1(PSI_thread *thread)
 {
   PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread);
+
   if (pfs != NULL)
+  {
+    aggregate_thread(pfs);
     destroy_thread(pfs);
+  }
 }
 
+/**
+  Implementation of the mutex instrumentation interface.
+  @sa PSI_v1::start_mutex_wait.
+*/
 static PSI_mutex_locker*
-get_thread_mutex_locker_v1(PSI_mutex_locker_state *state,
-                           PSI_mutex *mutex, PSI_mutex_operation op)
+start_mutex_wait_v1(PSI_mutex_locker_state *state,
+                    PSI_mutex *mutex, PSI_mutex_operation op,
+                    const char *src_file, uint src_line)
 {
   PFS_mutex *pfs_mutex= reinterpret_cast<PFS_mutex*> (mutex);
   DBUG_ASSERT((int) op >= 0);
   DBUG_ASSERT((uint) op < array_elements(mutex_operation_map));
+  DBUG_ASSERT(state != NULL);
+
   DBUG_ASSERT(pfs_mutex != NULL);
   DBUG_ASSERT(pfs_mutex->m_class != NULL);
-  if (! flag_events_waits_current)
-    return NULL;
-  if (! pfs_mutex->m_class->m_enabled)
-    return NULL;
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
-  if (unlikely(pfs_thread == NULL))
-    return NULL;
-  if (! pfs_thread->m_enabled)
-    return NULL;
-  if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
-  {
-    locker_lost++;
+
+  if (! pfs_mutex->m_enabled)
     return NULL;
-  }
-  PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
-    [pfs_thread->m_wait_locker_count];
 
-  pfs_locker->m_target.m_mutex= pfs_mutex;
-  pfs_locker->m_waits_current.m_thread= pfs_thread;
-  pfs_locker->m_waits_current.m_class= pfs_mutex->m_class;
-  if (pfs_mutex->m_class->m_timed)
+  register uint flags;
+  ulonglong timer_start= 0;
+
+  if (flag_thread_instrumentation)
   {
-    pfs_locker->m_timer_name= wait_timer;
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
+    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    if (unlikely(pfs_thread == NULL))
+      return NULL;
+    if (! pfs_thread->m_enabled)
+      return NULL;
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (pfs_mutex->m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags|= STATE_FLAG_TIMED;
+    }
+
+    if (flag_events_waits_current)
+    {
+      if (unlikely(pfs_thread->m_events_waits_current >=
+                   & pfs_thread->m_events_waits_stack[WAIT_STACK_SIZE]))
+      {
+        locker_lost++;
+        return NULL;
+      }
+      PFS_events_waits *wait= pfs_thread->m_events_waits_current;
+      state->m_wait= wait;
+      flags|= STATE_FLAG_EVENT;
+
+      PFS_events_waits *parent_event= wait - 1;
+      wait->m_event_type= EVENT_TYPE_WAIT;
+      wait->m_nesting_event_id= parent_event->m_event_id;
+      wait->m_nesting_event_type= parent_event->m_event_type;
+
+      wait->m_thread= pfs_thread;
+      wait->m_class= pfs_mutex->m_class;
+      wait->m_timer_start= timer_start;
+      wait->m_timer_end= 0;
+      wait->m_object_instance_addr= pfs_mutex->m_identity;
+      wait->m_event_id= pfs_thread->m_event_id++;
+      wait->m_end_event_id= 0;
+      wait->m_operation= mutex_operation_map[(int) op];
+      wait->m_source_file= src_file;
+      wait->m_source_line= src_line;
+      wait->m_wait_class= WAIT_CLASS_MUTEX;
+
+      pfs_thread->m_events_waits_current++;
+    }
   }
   else
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
-  pfs_locker->m_waits_current.m_object_instance_addr= pfs_mutex->m_identity;
-  pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
-  pfs_locker->m_waits_current.m_operation= mutex_operation_map[(int) op];
-  pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_MUTEX;
+  {
+    if (pfs_mutex->m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags= STATE_FLAG_TIMED;
+      state->m_thread= NULL;
+    }
+    else
+    {
+      /*
+        Complete shortcut.
+      */
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */
+      pfs_mutex->m_wait_stat.aggregate_counted();
+      return NULL;
+    }
+  }
 
-  pfs_thread->m_wait_locker_count++;
-  return reinterpret_cast<PSI_mutex_locker*> (pfs_locker);
+  state->m_flags= flags;
+  state->m_mutex= mutex;
+  return reinterpret_cast<PSI_mutex_locker*> (state);
 }
 
+/**
+  Implementation of the rwlock instrumentation interface.
+  @sa PSI_v1::start_rwlock_rdwait
+  @sa PSI_v1::start_rwlock_wrwait
+*/
 static PSI_rwlock_locker*
-get_thread_rwlock_locker_v1(PSI_rwlock_locker_state *state,
-                            PSI_rwlock *rwlock, PSI_rwlock_operation op)
+start_rwlock_wait_v1(PSI_rwlock_locker_state *state,
+                     PSI_rwlock *rwlock,
+                     PSI_rwlock_operation op,
+                     const char *src_file, uint src_line)
 {
   PFS_rwlock *pfs_rwlock= reinterpret_cast<PFS_rwlock*> (rwlock);
   DBUG_ASSERT(static_cast<int> (op) >= 0);
   DBUG_ASSERT(static_cast<uint> (op) < array_elements(rwlock_operation_map));
+  DBUG_ASSERT(state != NULL);
   DBUG_ASSERT(pfs_rwlock != NULL);
   DBUG_ASSERT(pfs_rwlock->m_class != NULL);
-  if (! flag_events_waits_current)
-    return NULL;
-  if (! pfs_rwlock->m_class->m_enabled)
-    return NULL;
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
-  if (unlikely(pfs_thread == NULL))
-    return NULL;
-  if (! pfs_thread->m_enabled)
-    return NULL;
-  if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
-  {
-    locker_lost++;
+
+  if (! pfs_rwlock->m_enabled)
     return NULL;
-  }
-  PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
-    [pfs_thread->m_wait_locker_count];
 
-  pfs_locker->m_target.m_rwlock= pfs_rwlock;
-  pfs_locker->m_waits_current.m_thread= pfs_thread;
-  pfs_locker->m_waits_current.m_class= pfs_rwlock->m_class;
-  if (pfs_rwlock->m_class->m_timed)
+  register uint flags;
+  ulonglong timer_start= 0;
+
+  if (flag_thread_instrumentation)
   {
-    pfs_locker->m_timer_name= wait_timer;
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
+    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    if (unlikely(pfs_thread == NULL))
+      return NULL;
+    if (! pfs_thread->m_enabled)
+      return NULL;
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (pfs_rwlock->m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags|= STATE_FLAG_TIMED;
+    }
+
+    if (flag_events_waits_current)
+    {
+      if (unlikely(pfs_thread->m_events_waits_current >=
+                   & pfs_thread->m_events_waits_stack[WAIT_STACK_SIZE]))
+      {
+        locker_lost++;
+        return NULL;
+      }
+      PFS_events_waits *wait= pfs_thread->m_events_waits_current;
+      state->m_wait= wait;
+      flags|= STATE_FLAG_EVENT;
+
+      PFS_events_waits *parent_event= wait - 1;
+      wait->m_event_type= EVENT_TYPE_WAIT;
+      wait->m_nesting_event_id= parent_event->m_event_id;
+      wait->m_nesting_event_type= parent_event->m_event_type;
+
+      wait->m_thread= pfs_thread;
+      wait->m_class= pfs_rwlock->m_class;
+      wait->m_timer_start= timer_start;
+      wait->m_timer_end= 0;
+      wait->m_object_instance_addr= pfs_rwlock->m_identity;
+      wait->m_event_id= pfs_thread->m_event_id++;
+      wait->m_end_event_id= 0;
+      wait->m_operation= rwlock_operation_map[static_cast<int> (op)];
+      wait->m_source_file= src_file;
+      wait->m_source_line= src_line;
+      wait->m_wait_class= WAIT_CLASS_RWLOCK;
+
+      pfs_thread->m_events_waits_current++;
+    }
   }
   else
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
-  pfs_locker->m_waits_current.m_object_instance_addr= pfs_rwlock->m_identity;
-  pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
-  pfs_locker->m_waits_current.m_operation=
-    rwlock_operation_map[static_cast<int> (op)];
-  pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_RWLOCK;
+  {
+    if (pfs_rwlock->m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags= STATE_FLAG_TIMED;
+      state->m_thread= NULL;
+    }
+    else
+    {
+      /*
+        Complete shortcut.
+      */
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */
+      pfs_rwlock->m_wait_stat.aggregate_counted();
+      return NULL;
+    }
+  }
 
-  pfs_thread->m_wait_locker_count++;
-  return reinterpret_cast<PSI_rwlock_locker*> (pfs_locker);
+  state->m_flags= flags;
+  state->m_rwlock= rwlock;
+  return reinterpret_cast<PSI_rwlock_locker*> (state);
 }
 
+/**
+  Implementation of the cond instrumentation interface.
+  @sa PSI_v1::start_cond_wait.
+*/
 static PSI_cond_locker*
-get_thread_cond_locker_v1(PSI_cond_locker_state *state,
-                          PSI_cond *cond, PSI_mutex * /* unused: mutex */,
-                          PSI_cond_operation op)
+start_cond_wait_v1(PSI_cond_locker_state *state,
+                   PSI_cond *cond, PSI_mutex *mutex,
+                   PSI_cond_operation op,
+                   const char *src_file, uint src_line)
 {
   /*
     Note about the unused PSI_mutex *mutex parameter:
@@ -1202,87 +2328,373 @@ get_thread_cond_locker_v1(PSI_cond_locker_state *state,
   PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond);
   DBUG_ASSERT(static_cast<int> (op) >= 0);
   DBUG_ASSERT(static_cast<uint> (op) < array_elements(cond_operation_map));
+  DBUG_ASSERT(state != NULL);
   DBUG_ASSERT(pfs_cond != NULL);
   DBUG_ASSERT(pfs_cond->m_class != NULL);
-  if (! flag_events_waits_current)
+
+  if (! pfs_cond->m_enabled)
     return NULL;
-  if (! pfs_cond->m_class->m_enabled)
+
+  register uint flags;
+  ulonglong timer_start= 0;
+
+  if (flag_thread_instrumentation)
+  {
+    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    if (unlikely(pfs_thread == NULL))
+      return NULL;
+    if (! pfs_thread->m_enabled)
+      return NULL;
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (pfs_cond->m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags|= STATE_FLAG_TIMED;
+    }
+
+    if (flag_events_waits_current)
+    {
+      if (unlikely(pfs_thread->m_events_waits_current >=
+                   & pfs_thread->m_events_waits_stack[WAIT_STACK_SIZE]))
+      {
+        locker_lost++;
+        return NULL;
+      }
+      PFS_events_waits *wait= pfs_thread->m_events_waits_current;
+      state->m_wait= wait;
+      flags|= STATE_FLAG_EVENT;
+
+      PFS_events_waits *parent_event= wait - 1;
+      wait->m_event_type= EVENT_TYPE_WAIT;
+      wait->m_nesting_event_id= parent_event->m_event_id;
+      wait->m_nesting_event_type= parent_event->m_event_type;
+
+      wait->m_thread= pfs_thread;
+      wait->m_class= pfs_cond->m_class;
+      wait->m_timer_start= timer_start;
+      wait->m_timer_end= 0;
+      wait->m_object_instance_addr= pfs_cond->m_identity;
+      wait->m_event_id= pfs_thread->m_event_id++;
+      wait->m_end_event_id= 0;
+      wait->m_operation= cond_operation_map[static_cast<int> (op)];
+      wait->m_source_file= src_file;
+      wait->m_source_line= src_line;
+      wait->m_wait_class= WAIT_CLASS_COND;
+
+      pfs_thread->m_events_waits_current++;
+    }
+  }
+  else
+  {
+    if (pfs_cond->m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags= STATE_FLAG_TIMED;
+    }
+    else
+    {
+      /*
+        Complete shortcut.
+      */
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */
+      pfs_cond->m_wait_stat.aggregate_counted();
+      return NULL;
+    }
+  }
+
+  state->m_flags= flags;
+  state->m_cond= cond;
+  state->m_mutex= mutex;
+  return reinterpret_cast<PSI_cond_locker*> (state);
+}
+
+static inline PFS_TL_LOCK_TYPE lock_flags_to_lock_type(uint flags)
+{
+  enum thr_lock_type value= static_cast<enum thr_lock_type> (flags);
+
+  switch (value)
+  {
+    case TL_READ:
+      return PFS_TL_READ;
+    case TL_READ_WITH_SHARED_LOCKS:
+      return PFS_TL_READ_WITH_SHARED_LOCKS;
+    case TL_READ_HIGH_PRIORITY:
+      return PFS_TL_READ_HIGH_PRIORITY;
+    case TL_READ_NO_INSERT:
+      return PFS_TL_READ_NO_INSERT;
+    case TL_WRITE_ALLOW_WRITE:
+      return PFS_TL_WRITE_ALLOW_WRITE;
+    case TL_WRITE_CONCURRENT_INSERT:
+      return PFS_TL_WRITE_CONCURRENT_INSERT;
+    case TL_WRITE_DELAYED:
+      return PFS_TL_WRITE_DELAYED;
+    case TL_WRITE_LOW_PRIORITY:
+      return PFS_TL_WRITE_LOW_PRIORITY;
+    case TL_WRITE:
+      return PFS_TL_WRITE;
+
+    case TL_WRITE_ONLY:
+    case TL_IGNORE:
+    case TL_UNLOCK:
+    case TL_READ_DEFAULT:
+    case TL_WRITE_DEFAULT:
+    default:
+      DBUG_ASSERT(false);
+  }
+
+  /* Dead code */
+  return PFS_TL_READ;
+}
+
+static inline PFS_TL_LOCK_TYPE external_lock_flags_to_lock_type(uint flags)
+{
+  DBUG_ASSERT(flags == F_RDLCK || flags == F_WRLCK);
+  return (flags == F_RDLCK ? PFS_TL_READ_EXTERNAL : PFS_TL_WRITE_EXTERNAL);
+}
+
+/**
+  Implementation of the table instrumentation interface.
+  @sa PSI_v1::start_table_io_wait_v1
+*/
+static PSI_table_locker*
+start_table_io_wait_v1(PSI_table_locker_state *state,
+                       PSI_table *table,
+                       PSI_table_io_operation op,
+                       uint index,
+                       const char *src_file, uint src_line)
+{
+  DBUG_ASSERT(static_cast<int> (op) >= 0);
+  DBUG_ASSERT(static_cast<uint> (op) < array_elements(table_io_operation_map));
+  DBUG_ASSERT(state != NULL);
+  PFS_table *pfs_table= reinterpret_cast<PFS_table*> (table);
+  DBUG_ASSERT(pfs_table != NULL);
+  DBUG_ASSERT(pfs_table->m_share != NULL);
+
+  if (! pfs_table->m_io_enabled)
     return NULL;
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+
+  PFS_thread *pfs_thread= pfs_table->m_thread_owner;
   if (unlikely(pfs_thread == NULL))
     return NULL;
-  if (! pfs_thread->m_enabled)
-    return NULL;
-  if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
-  {
-    locker_lost++;
-    return NULL;
-  }
-  PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
-    [pfs_thread->m_wait_locker_count];
 
-  pfs_locker->m_target.m_cond= pfs_cond;
-  pfs_locker->m_waits_current.m_thread= pfs_thread;
-  pfs_locker->m_waits_current.m_class= pfs_cond->m_class;
-  if (pfs_cond->m_class->m_timed)
+  DBUG_ASSERT(pfs_thread ==
+              my_pthread_getspecific_ptr(PFS_thread*, THR_PFS));
+
+  register uint flags;
+  ulonglong timer_start= 0;
+
+  if (flag_thread_instrumentation)
   {
-    pfs_locker->m_timer_name= wait_timer;
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
+    if (! pfs_thread->m_enabled)
+      return NULL;
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (pfs_table->m_io_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags|= STATE_FLAG_TIMED;
+    }
+
+    if (flag_events_waits_current)
+    {
+      if (unlikely(pfs_thread->m_events_waits_current >=
+                   & pfs_thread->m_events_waits_stack[WAIT_STACK_SIZE]))
+      {
+        locker_lost++;
+        return NULL;
+      }
+      PFS_events_waits *wait= pfs_thread->m_events_waits_current;
+      state->m_wait= wait;
+      flags|= STATE_FLAG_EVENT;
+
+      PFS_events_waits *parent_event= wait - 1;
+      wait->m_event_type= EVENT_TYPE_WAIT;
+      wait->m_nesting_event_id= parent_event->m_event_id;
+      wait->m_nesting_event_type= parent_event->m_event_type;
+
+      PFS_table_share *share= pfs_table->m_share;
+      wait->m_thread= pfs_thread;
+      wait->m_class= &global_table_io_class;
+      wait->m_timer_start= timer_start;
+      wait->m_timer_end= 0;
+      wait->m_object_instance_addr= pfs_table->m_identity;
+      wait->m_event_id= pfs_thread->m_event_id++;
+      wait->m_end_event_id= 0;
+      wait->m_operation= table_io_operation_map[static_cast<int> (op)];
+      wait->m_flags= 0;
+      wait->m_object_type= share->get_object_type();
+      wait->m_weak_table_share= share;
+      wait->m_weak_version= share->get_version();
+      wait->m_index= index;
+      wait->m_source_file= src_file;
+      wait->m_source_line= src_line;
+      wait->m_wait_class= WAIT_CLASS_TABLE;
+
+      pfs_thread->m_events_waits_current++;
+    }
+    /* TODO: consider a shortcut here */
   }
   else
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
-  pfs_locker->m_waits_current.m_object_instance_addr= pfs_cond->m_identity;
-  pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
-  pfs_locker->m_waits_current.m_operation=
-    cond_operation_map[static_cast<int> (op)];
-  pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_COND;
+  {
+    if (pfs_table->m_io_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags= STATE_FLAG_TIMED;
+    }
+    else
+    {
+      /* TODO: consider a shortcut here */
+      flags= 0;
+    }
+  }
 
-  pfs_thread->m_wait_locker_count++;
-  return reinterpret_cast<PSI_cond_locker*> (pfs_locker);
+  state->m_flags= flags;
+  state->m_table= table;
+  state->m_io_operation= op;
+  state->m_index= index;
+  return reinterpret_cast<PSI_table_locker*> (state);
 }
 
+/**
+  Implementation of the table instrumentation interface.
+  @sa PSI_v1::start_table_lock_wait.
+*/
 static PSI_table_locker*
-get_thread_table_locker_v1(PSI_table_locker_state *state,
-                           PSI_table *table)
+start_table_lock_wait_v1(PSI_table_locker_state *state,
+                         PSI_table *table,
+                         PSI_table_lock_operation op,
+                         ulong op_flags,
+                         const char *src_file, uint src_line)
 {
+  DBUG_ASSERT(state != NULL);
+  DBUG_ASSERT((op == PSI_TABLE_LOCK) || (op == PSI_TABLE_EXTERNAL_LOCK));
+
   PFS_table *pfs_table= reinterpret_cast<PFS_table*> (table);
+
   DBUG_ASSERT(pfs_table != NULL);
   DBUG_ASSERT(pfs_table->m_share != NULL);
-  if (! flag_events_waits_current)
-    return NULL;
-  if (! pfs_table->m_share->m_enabled)
+
+  if (! pfs_table->m_lock_enabled)
     return NULL;
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+
+  PFS_thread *pfs_thread= pfs_table->m_thread_owner;
   if (unlikely(pfs_thread == NULL))
     return NULL;
-  if (! pfs_thread->m_enabled)
-    return NULL;
-  if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
+
+  DBUG_ASSERT(pfs_thread ==
+              my_pthread_getspecific_ptr(PFS_thread*, THR_PFS));
+
+  PFS_TL_LOCK_TYPE lock_type;
+
+  switch (op)
   {
-    locker_lost++;
-    return NULL;
+    case PSI_TABLE_LOCK:
+      lock_type= lock_flags_to_lock_type(op_flags);
+      break;
+    case PSI_TABLE_EXTERNAL_LOCK:
+      /*
+        See the handler::external_lock() API design,
+        there is no handler::external_unlock().
+      */
+      if (op_flags == F_UNLCK)
+        return NULL;
+      lock_type= external_lock_flags_to_lock_type(op_flags);
+      break;
+    default:
+      lock_type= PFS_TL_READ;
+      DBUG_ASSERT(false);
   }
-  PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
-    [pfs_thread->m_wait_locker_count];
 
-  pfs_locker->m_target.m_table= pfs_table;
-  pfs_locker->m_waits_current.m_thread= pfs_thread;
-  pfs_locker->m_waits_current.m_class= &global_table_class;
-  if (pfs_table->m_share->m_timed)
+  DBUG_ASSERT((uint) lock_type < array_elements(table_lock_operation_map));
+
+  register uint flags;
+  ulonglong timer_start= 0;
+
+  if (flag_thread_instrumentation)
   {
-    pfs_locker->m_timer_name= wait_timer;
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
+    if (! pfs_thread->m_enabled)
+      return NULL;
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (pfs_table->m_lock_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags|= STATE_FLAG_TIMED;
+    }
+
+    if (flag_events_waits_current)
+    {
+      if (unlikely(pfs_thread->m_events_waits_current >=
+                   & pfs_thread->m_events_waits_stack[WAIT_STACK_SIZE]))
+      {
+        locker_lost++;
+        return NULL;
+      }
+      PFS_events_waits *wait= pfs_thread->m_events_waits_current;
+      state->m_wait= wait;
+      flags|= STATE_FLAG_EVENT;
+
+      PFS_events_waits *parent_event= wait - 1;
+      wait->m_event_type= EVENT_TYPE_WAIT;
+      wait->m_nesting_event_id= parent_event->m_event_id;
+      wait->m_nesting_event_type= parent_event->m_event_type;
+
+      PFS_table_share *share= pfs_table->m_share;
+      wait->m_thread= pfs_thread;
+      wait->m_class= &global_table_lock_class;
+      wait->m_timer_start= timer_start;
+      wait->m_timer_end= 0;
+      wait->m_object_instance_addr= pfs_table->m_identity;
+      wait->m_event_id= pfs_thread->m_event_id++;
+      wait->m_end_event_id= 0;
+      wait->m_operation= table_lock_operation_map[lock_type];
+      wait->m_flags= 0;
+      wait->m_object_type= share->get_object_type();
+      wait->m_weak_table_share= share;
+      wait->m_weak_version= share->get_version();
+      wait->m_index= 0;
+      wait->m_source_file= src_file;
+      wait->m_source_line= src_line;
+      wait->m_wait_class= WAIT_CLASS_TABLE;
+
+      pfs_thread->m_events_waits_current++;
+    }
+    /* TODO: consider a shortcut here */
   }
   else
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
-  pfs_locker->m_waits_current.m_object_instance_addr= pfs_table->m_identity;
-  pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
-  pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_TABLE;
+  {
+    if (pfs_table->m_lock_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags= STATE_FLAG_TIMED;
+    }
+    else
+    {
+      /* TODO: consider a shortcut here */
+      flags= 0;
+    }
+  }
 
-  pfs_thread->m_wait_locker_count++;
-  return reinterpret_cast<PSI_table_locker*> (pfs_locker);
+  state->m_flags= flags;
+  state->m_table= table;
+  state->m_index= lock_type;
+  return reinterpret_cast<PSI_table_locker*> (state);
 }
 
+/**
+  Implementation of the file instrumentation interface.
+  @sa PSI_v1::get_thread_file_name_locker.
+*/
 static PSI_file_locker*
 get_thread_file_name_locker_v1(PSI_file_locker_state *state,
                                PSI_file_key key,
@@ -1291,180 +2703,398 @@ get_thread_file_name_locker_v1(PSI_file_locker_state *state,
 {
   DBUG_ASSERT(static_cast<int> (op) >= 0);
   DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map));
+  DBUG_ASSERT(state != NULL);
 
-  if (! flag_events_waits_current)
+  if (! flag_global_instrumentation)
     return NULL;
   PFS_file_class *klass= find_file_class(key);
   if (unlikely(klass == NULL))
     return NULL;
   if (! klass->m_enabled)
     return NULL;
+
+  /* Needed for the LF_HASH */
   PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
   if (unlikely(pfs_thread == NULL))
     return NULL;
-  if (! pfs_thread->m_enabled)
-    return NULL;
-  if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
-  {
-    locker_lost++;
+
+  if (flag_thread_instrumentation && ! pfs_thread->m_enabled)
     return NULL;
-  }
+
+  register uint flags;
+
+  state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+  flags= STATE_FLAG_THREAD;
+
+  if (klass->m_timed)
+    flags|= STATE_FLAG_TIMED;
+
   uint len= strlen(name);
   PFS_file *pfs_file= find_or_create_file(pfs_thread, klass, name, len);
   if (unlikely(pfs_file == NULL))
     return NULL;
 
-  PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
-    [pfs_thread->m_wait_locker_count];
-
-  pfs_locker->m_target.m_file= pfs_file;
-  pfs_locker->m_waits_current.m_thread= pfs_thread;
-  pfs_locker->m_waits_current.m_class= pfs_file->m_class;
-  if (pfs_file->m_class->m_timed)
+  if (flag_events_waits_current)
   {
-    pfs_locker->m_timer_name= wait_timer;
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
+    if (unlikely(pfs_thread->m_events_waits_current >=
+                 & pfs_thread->m_events_waits_stack[WAIT_STACK_SIZE]))
+    {
+      locker_lost++;
+      return NULL;
+    }
+    PFS_events_waits *wait= pfs_thread->m_events_waits_current;
+    state->m_wait= wait;
+    flags|= STATE_FLAG_EVENT;
+
+    PFS_events_waits *parent_event= wait - 1;
+    wait->m_event_type= EVENT_TYPE_WAIT;
+    wait->m_nesting_event_id= parent_event->m_event_id;
+    wait->m_nesting_event_type= parent_event->m_event_type;
+
+    wait->m_thread= pfs_thread;
+    wait->m_class= klass;
+    wait->m_timer_start= 0;
+    wait->m_timer_end= 0;
+    wait->m_object_instance_addr= pfs_file;
+    wait->m_weak_file= pfs_file;
+    wait->m_weak_version= pfs_file->get_version();
+    wait->m_event_id= pfs_thread->m_event_id++;
+    wait->m_end_event_id= 0;
+    wait->m_operation= file_operation_map[static_cast<int> (op)];
+    wait->m_wait_class= WAIT_CLASS_FILE;
+
+    pfs_thread->m_events_waits_current++;
   }
-  else
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
-  pfs_locker->m_waits_current.m_object_instance_addr= pfs_file;
-  pfs_locker->m_waits_current.m_object_name= pfs_file->m_filename;
-  pfs_locker->m_waits_current.m_object_name_length=
-    pfs_file->m_filename_length;
-  pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
-  pfs_locker->m_waits_current.m_operation=
-    file_operation_map[static_cast<int> (op)];
-  pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_FILE;
 
-  pfs_thread->m_wait_locker_count++;
-  return reinterpret_cast<PSI_file_locker*> (pfs_locker);
+  state->m_flags= flags;
+  state->m_file= reinterpret_cast<PSI_file*> (pfs_file);
+  state->m_operation= op;
+  return reinterpret_cast<PSI_file_locker*> (state);
 }
 
+/**
+  Implementation of the file instrumentation interface.
+  @sa PSI_v1::get_thread_file_stream_locker.
+*/
 static PSI_file_locker*
 get_thread_file_stream_locker_v1(PSI_file_locker_state *state,
                                  PSI_file *file, PSI_file_operation op)
 {
   PFS_file *pfs_file= reinterpret_cast<PFS_file*> (file);
-
   DBUG_ASSERT(static_cast<int> (op) >= 0);
   DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map));
-  DBUG_ASSERT(pfs_file != NULL);
-  DBUG_ASSERT(pfs_file->m_class != NULL);
+  DBUG_ASSERT(state != NULL);
 
-  if (! flag_events_waits_current)
-    return NULL;
-  if (! pfs_file->m_class->m_enabled)
-    return NULL;
-  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
-  if (unlikely(pfs_thread == NULL))
-    return NULL;
-  if (! pfs_thread->m_enabled)
+  if (unlikely(pfs_file == NULL))
     return NULL;
-  if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
-  {
-    locker_lost++;
+  DBUG_ASSERT(pfs_file->m_class != NULL);
+
+  if (! pfs_file->m_enabled)
     return NULL;
-  }
-  PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
-    [pfs_thread->m_wait_locker_count];
 
-  pfs_locker->m_target.m_file= pfs_file;
-  pfs_locker->m_waits_current.m_thread= pfs_thread;
-  pfs_locker->m_waits_current.m_class= pfs_file->m_class;
-  if (pfs_file->m_class->m_timed)
+  register uint flags;
+
+  if (flag_thread_instrumentation)
   {
-    pfs_locker->m_timer_name= wait_timer;
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
+    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    if (unlikely(pfs_thread == NULL))
+      return NULL;
+    if (! pfs_thread->m_enabled)
+      return NULL;
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (pfs_file->m_timed)
+      flags|= STATE_FLAG_TIMED;
+
+    if (flag_events_waits_current)
+    {
+      if (unlikely(pfs_thread->m_events_waits_current >=
+                   & pfs_thread->m_events_waits_stack[WAIT_STACK_SIZE]))
+      {
+        locker_lost++;
+        return NULL;
+      }
+      PFS_events_waits *wait= pfs_thread->m_events_waits_current;
+      state->m_wait= wait;
+      flags|= STATE_FLAG_EVENT;
+
+      PFS_events_waits *parent_event= wait - 1;
+      wait->m_event_type= EVENT_TYPE_WAIT;
+      wait->m_nesting_event_id= parent_event->m_event_id;
+      wait->m_nesting_event_type= parent_event->m_event_type;
+
+      wait->m_thread= pfs_thread;
+      wait->m_class= pfs_file->m_class;
+      wait->m_timer_start= 0;
+      wait->m_timer_end= 0;
+      wait->m_object_instance_addr= pfs_file;
+      wait->m_weak_file= pfs_file;
+      wait->m_weak_version= pfs_file->get_version();
+      wait->m_event_id= pfs_thread->m_event_id++;
+      wait->m_end_event_id= 0;
+      wait->m_operation= file_operation_map[static_cast<int> (op)];
+      wait->m_wait_class= WAIT_CLASS_FILE;
+
+      pfs_thread->m_events_waits_current++;
+    }
   }
   else
-    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
-  pfs_locker->m_waits_current.m_object_instance_addr= pfs_file;
-  pfs_locker->m_waits_current.m_object_name= pfs_file->m_filename;
-  pfs_locker->m_waits_current.m_object_name_length=
-    pfs_file->m_filename_length;
-  pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
-  pfs_locker->m_waits_current.m_operation=
-    file_operation_map[static_cast<int> (op)];
-  pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_FILE;
+  {
+    state->m_thread= NULL;
+    if (pfs_file->m_timed)
+    {
+      flags= STATE_FLAG_TIMED;
+    }
+    else
+    {
+      /* TODO: consider a shortcut. */
+      flags= 0;
+    }
+  }
 
-  pfs_thread->m_wait_locker_count++;
-  return reinterpret_cast<PSI_file_locker*> (pfs_locker);
+  state->m_flags= flags;
+  state->m_file= reinterpret_cast<PSI_file*> (pfs_file);
+  state->m_operation= op;
+  return reinterpret_cast<PSI_file_locker*> (state);
 }
 
+/**
+  Implementation of the file instrumentation interface.
+  @sa PSI_v1::get_thread_file_descriptor_locker.
+*/
 static PSI_file_locker*
 get_thread_file_descriptor_locker_v1(PSI_file_locker_state *state,
                                      File file, PSI_file_operation op)
 {
   int index= static_cast<int> (file);
-
   DBUG_ASSERT(static_cast<int> (op) >= 0);
   DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map));
+  DBUG_ASSERT(state != NULL);
 
-  if (! flag_events_waits_current)
+  if (unlikely((index < 0) || (index >= file_handle_max)))
     return NULL;
-  if (likely((index >= 0) && (index < file_handle_max)))
+
+  PFS_file *pfs_file= file_handle_array[index];
+  if (unlikely(pfs_file == NULL))
+    return NULL;
+
+  /*
+    We are about to close a file by descriptor number,
+    and the calling code still holds the descriptor.
+    Cleanup the file descriptor <--> file instrument association.
+    Remove the instrumentation *before* the close to avoid race
+    conditions with another thread opening a file
+    (that could be given the same descriptor).
+  */
+  if (op == PSI_FILE_CLOSE)
+    file_handle_array[index]= NULL;
+
+  DBUG_ASSERT(pfs_file->m_class != NULL);
+  if (! pfs_file->m_enabled)
+    return NULL;
+
+  register uint flags;
+
+  if (flag_thread_instrumentation)
+  {
+    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    if (unlikely(pfs_thread == NULL))
+      return NULL;
+    if (! pfs_thread->m_enabled)
+      return NULL;
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (pfs_file->m_timed)
+      flags|= STATE_FLAG_TIMED;
+
+    if (flag_events_waits_current)
+    {
+      if (unlikely(pfs_thread->m_events_waits_current >=
+                   & pfs_thread->m_events_waits_stack[WAIT_STACK_SIZE]))
+      {
+        locker_lost++;
+        return NULL;
+      }
+      PFS_events_waits *wait= pfs_thread->m_events_waits_current;
+      state->m_wait= wait;
+      flags|= STATE_FLAG_EVENT;
+
+      PFS_events_waits *parent_event= wait - 1;
+      wait->m_event_type= EVENT_TYPE_WAIT;
+      wait->m_nesting_event_id= parent_event->m_event_id;
+      wait->m_nesting_event_type= parent_event->m_event_type;
+
+      wait->m_thread= pfs_thread;
+      wait->m_class= pfs_file->m_class;
+      wait->m_timer_start= 0;
+      wait->m_timer_end= 0;
+      wait->m_object_instance_addr= pfs_file;
+      wait->m_weak_file= pfs_file;
+      wait->m_weak_version= pfs_file->get_version();
+      wait->m_event_id= pfs_thread->m_event_id++;
+      wait->m_end_event_id= 0;
+      wait->m_operation= file_operation_map[static_cast<int> (op)];
+      wait->m_wait_class= WAIT_CLASS_FILE;
+
+      pfs_thread->m_events_waits_current++;
+    }
+  }
+  else
   {
-    PFS_file *pfs_file= file_handle_array[index];
-    if (likely(pfs_file != NULL))
+    state->m_thread= NULL;
+    if (pfs_file->m_timed)
     {
-      PFS_thread *pfs_thread;
+      flags= STATE_FLAG_TIMED;
+    }
+    else
+    {
+      /* TODO: consider a shortcut. */
+      flags= 0;
+    }
+  }
 
-      /*
-        We are about to close a file by descriptor number,
-        and the calling code still holds the descriptor.
-        Cleanup the file descriptor <--> file instrument association.
-        Remove the instrumentation *before* the close to avoid race
-        conditions with another thread opening a file
-        (that could be given the same descriptor).
-      */
-      if (op == PSI_FILE_CLOSE)
-        file_handle_array[index]= NULL;
+  state->m_flags= flags;
+  state->m_file= reinterpret_cast<PSI_file*> (pfs_file);
+  state->m_operation= op;
+  return reinterpret_cast<PSI_file_locker*> (state);
+}
 
-      DBUG_ASSERT(pfs_file->m_class != NULL);
-      if (! pfs_file->m_class->m_enabled)
-        return NULL;
-      pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
-      if (unlikely(pfs_thread == NULL))
-        return NULL;
-      if (! pfs_thread->m_enabled)
-        return NULL;
-      if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
+/** Socket locker */
+
+static PSI_socket_locker*
+start_socket_wait_v1(PSI_socket_locker_state *state,
+                     PSI_socket *socket,
+                     PSI_socket_operation op,
+                     size_t count,
+                     const char *src_file, uint src_line)
+{
+  DBUG_ASSERT(static_cast<int> (op) >= 0);
+  DBUG_ASSERT(static_cast<uint> (op) < array_elements(socket_operation_map));
+  DBUG_ASSERT(state != NULL);
+  PFS_socket *pfs_socket= reinterpret_cast<PFS_socket*> (socket);
+
+  DBUG_ASSERT(pfs_socket != NULL);
+  DBUG_ASSERT(pfs_socket->m_class != NULL);
+
+  if (!pfs_socket->m_enabled || pfs_socket->m_idle)
+    return NULL;
+
+  register uint flags= 0;
+  ulonglong timer_start= 0;
+
+  if (flag_thread_instrumentation)
+  {
+    PFS_thread *pfs_thread= pfs_socket->m_thread_owner;
+
+    if (unlikely(pfs_thread == NULL))
+      return NULL;
+
+#ifdef LATER
+    /*
+      Needs refinement, because of KILL.
+    */
+    DBUG_ASSERT(pfs_thread ==
+                my_pthread_getspecific_ptr(PFS_thread*, THR_PFS));
+#endif
+
+    if (!pfs_thread->m_enabled)
+      return NULL;
+
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (pfs_socket->m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags|= STATE_FLAG_TIMED;
+    }
+
+    if (flag_events_waits_current)
+    {
+      if (unlikely(pfs_thread->m_events_waits_current >=
+                   & pfs_thread->m_events_waits_stack[WAIT_STACK_SIZE]))
       {
         locker_lost++;
         return NULL;
       }
-      PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
-        [pfs_thread->m_wait_locker_count];
-
-      pfs_locker->m_target.m_file= pfs_file;
-      pfs_locker->m_waits_current.m_thread= pfs_thread;
-      pfs_locker->m_waits_current.m_class= pfs_file->m_class;
-      if (pfs_file->m_class->m_timed)
+      PFS_events_waits *wait= pfs_thread->m_events_waits_current;
+      state->m_wait= wait;
+      flags|= STATE_FLAG_EVENT;
+
+      PFS_events_waits *parent_event= wait - 1;
+      wait->m_event_type= EVENT_TYPE_WAIT;
+      wait->m_nesting_event_id=   parent_event->m_event_id;
+      wait->m_nesting_event_type= parent_event->m_event_type;
+      wait->m_thread=       pfs_thread;
+      wait->m_class=        pfs_socket->m_class;
+      wait->m_timer_start=  timer_start;
+      wait->m_timer_end=    0;
+      wait->m_object_instance_addr= pfs_socket->m_identity;
+      wait->m_weak_socket=  pfs_socket;
+      wait->m_weak_version= pfs_socket->get_version();
+      wait->m_event_id=     pfs_thread->m_event_id++;
+      wait->m_end_event_id= 0;
+      wait->m_operation=    socket_operation_map[static_cast<int>(op)];
+      wait->m_source_file= src_file;
+      wait->m_source_line= src_line;
+      wait->m_number_of_bytes= count;
+      wait->m_wait_class=   WAIT_CLASS_SOCKET;
+
+      pfs_thread->m_events_waits_current++;
+    }
+  }
+  else
+  {
+    if (pfs_socket->m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+      state->m_timer_start= timer_start;
+      flags= STATE_FLAG_TIMED;
+    }
+    else
+    {
+      /*
+        Even if timing is disabled, end_socket_wait() still needs a locker to
+        capture the number of bytes sent or received by the socket operation.
+        For operations that do not have a byte count, then just increment the
+        event counter and return a NULL locker.
+      */
+      switch (op)
       {
-        pfs_locker->m_timer_name= wait_timer;
-        pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
+        case PSI_SOCKET_CONNECT:
+        case PSI_SOCKET_CREATE:
+        case PSI_SOCKET_BIND:
+        case PSI_SOCKET_SEEK:
+        case PSI_SOCKET_OPT:
+        case PSI_SOCKET_STAT:
+        case PSI_SOCKET_SHUTDOWN:
+        case PSI_SOCKET_CLOSE:
+        case PSI_SOCKET_SELECT:
+          pfs_socket->m_socket_stat.m_io_stat.m_misc.aggregate_counted();
+          return NULL;
+        default:
+          break;
       }
-      else
-        pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
-      pfs_locker->m_waits_current.m_object_instance_addr= pfs_file;
-      pfs_locker->m_waits_current.m_object_name= pfs_file->m_filename;
-      pfs_locker->m_waits_current.m_object_name_length=
-        pfs_file->m_filename_length;
-      pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
-      pfs_locker->m_waits_current.m_operation=
-        file_operation_map[static_cast<int> (op)];
-      pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_FILE;
-
-      pfs_thread->m_wait_locker_count++;
-      return reinterpret_cast<PSI_file_locker*> (pfs_locker);
     }
   }
-  return NULL;
+
+  state->m_flags= flags;
+  state->m_socket= socket;
+  state->m_operation= op;
+  return reinterpret_cast<PSI_socket_locker*> (state);
 }
 
+/**
+  Implementation of the mutex instrumentation interface.
+  @sa PSI_v1::unlock_mutex.
+*/
 static void unlock_mutex_v1(PSI_mutex *mutex)
 {
   PFS_mutex *pfs_mutex= reinterpret_cast<PFS_mutex*> (mutex);
+
   DBUG_ASSERT(pfs_mutex != NULL);
 
   /*
@@ -1495,16 +3125,24 @@ static void unlock_mutex_v1(PSI_mutex *mutex)
   if (pfs_mutex->m_class->m_timed)
   {
     ulonglong locked_time;
-    locked_time= get_timer_value(wait_timer) - pfs_mutex->m_last_locked;
+    locked_time= get_timer_pico_value(wait_timer) - pfs_mutex->m_last_locked;
     aggregate_single_stat_chain(&pfs_mutex->m_lock_stat, locked_time);
   }
 #endif
 }
 
+/**
+  Implementation of the rwlock instrumentation interface.
+  @sa PSI_v1::unlock_rwlock.
+*/
 static void unlock_rwlock_v1(PSI_rwlock *rwlock)
 {
   PFS_rwlock *pfs_rwlock= reinterpret_cast<PFS_rwlock*> (rwlock);
   DBUG_ASSERT(pfs_rwlock != NULL);
+  DBUG_ASSERT(pfs_rwlock == sanitize_rwlock(pfs_rwlock));
+  DBUG_ASSERT(pfs_rwlock->m_class != NULL);
+  DBUG_ASSERT(pfs_rwlock->m_lock.is_populated());
+
   bool last_writer= false;
   bool last_reader= false;
 
@@ -1517,7 +3155,7 @@ static void unlock_rwlock_v1(PSI_rwlock *rwlock)
   */
 
   /* Always update the instrumented state */
-  if (pfs_rwlock->m_writer)
+  if (pfs_rwlock->m_writer != NULL)
   {
     /* Nominal case, a writer is unlocking. */
     last_writer= true;
@@ -1562,7 +3200,7 @@ static void unlock_rwlock_v1(PSI_rwlock *rwlock)
   {
     if (pfs_rwlock->m_class->m_timed)
     {
-      locked_time= get_timer_value(wait_timer) - pfs_rwlock->m_last_written;
+      locked_time= get_timer_pico_value(wait_timer) - pfs_rwlock->m_last_written;
       aggregate_single_stat_chain(&pfs_rwlock->m_write_lock_stat, locked_time);
     }
   }
@@ -1570,7 +3208,7 @@ static void unlock_rwlock_v1(PSI_rwlock *rwlock)
   {
     if (pfs_rwlock->m_class->m_timed)
     {
-      locked_time= get_timer_value(wait_timer) - pfs_rwlock->m_last_read;
+      locked_time= get_timer_pico_value(wait_timer) - pfs_rwlock->m_last_read;
       aggregate_single_stat_chain(&pfs_rwlock->m_read_lock_stat, locked_time);
     }
   }
@@ -1580,109 +3218,266 @@ static void unlock_rwlock_v1(PSI_rwlock *rwlock)
 #endif
 }
 
+/**
+  Implementation of the cond instrumentation interface.
+  @sa PSI_v1::signal_cond.
+*/
 static void signal_cond_v1(PSI_cond* cond)
 {
   PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond);
+
   DBUG_ASSERT(pfs_cond != NULL);
 
   pfs_cond->m_cond_stat.m_signal_count++;
 }
 
+/**
+  Implementation of the cond instrumentation interface.
+  @sa PSI_v1::broadcast_cond.
+*/
 static void broadcast_cond_v1(PSI_cond* cond)
 {
   PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond);
+
   DBUG_ASSERT(pfs_cond != NULL);
 
   pfs_cond->m_cond_stat.m_broadcast_count++;
 }
 
-static void start_mutex_wait_v1(PSI_mutex_locker* locker,
-                                const char *src_file, uint src_line)
+/**
+  Implementation of the idle instrumentation interface.
+  @sa PSI_v1::start_idle_wait.
+*/
+static PSI_idle_locker*
+start_idle_wait_v1(PSI_idle_locker_state* state, const char *src_file, uint src_line)
 {
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
+  DBUG_ASSERT(state != NULL);
+
+  if (!flag_global_instrumentation)
+    return NULL;
+
+  if (!global_idle_class.m_enabled)
+    return NULL;
+
+  register uint flags= 0;
+  ulonglong timer_start= 0;
+
+  if (flag_thread_instrumentation)
+  {
+    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    if (unlikely(pfs_thread == NULL))
+      return NULL;
+    if (!pfs_thread->m_enabled)
+      return NULL;
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (global_idle_class.m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(idle_timer, &state->m_timer);
+      state->m_timer_start= timer_start;
+      flags|= STATE_FLAG_TIMED;
+    }
 
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
-  if (wait->m_timer_state == TIMER_STATE_STARTING)
+    if (flag_events_waits_current)
+    {
+      if (unlikely(pfs_thread->m_events_waits_current >=
+                   & pfs_thread->m_events_waits_stack[WAIT_STACK_SIZE]))
+      {
+        locker_lost++;
+        return NULL;
+      }
+      PFS_events_waits *wait= pfs_thread->m_events_waits_current;
+      state->m_wait= wait;
+      flags|= STATE_FLAG_EVENT;
+
+      wait->m_event_type= EVENT_TYPE_WAIT;
+      /*
+        IDLE events are waits, but by definition we know that
+        such waits happen outside of any STAGE and STATEMENT,
+        so they have no parents.
+      */
+      wait->m_nesting_event_id= 0;
+      /* no need to set wait->m_nesting_event_type */
+
+      wait->m_thread= pfs_thread;
+      wait->m_class= &global_idle_class;
+      wait->m_timer_start= timer_start;
+      wait->m_timer_end= 0;
+      wait->m_event_id= pfs_thread->m_event_id++;
+      wait->m_end_event_id= 0;
+      wait->m_operation= OPERATION_TYPE_IDLE;
+      wait->m_source_file= src_file;
+      wait->m_source_line= src_line;
+      wait->m_wait_class= WAIT_CLASS_IDLE;
+
+      pfs_thread->m_events_waits_current++;
+    }
+  }
+  else
   {
-    wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_STARTED;
+    if (global_idle_class.m_timed)
+    {
+      timer_start= get_timer_raw_value_and_function(idle_timer, &state->m_timer);
+      state->m_timer_start= timer_start;
+      flags= STATE_FLAG_TIMED;
+    }
   }
-  wait->m_source_file= src_file;
-  wait->m_source_line= src_line;
+
+  state->m_flags= flags;
+  return reinterpret_cast<PSI_idle_locker*> (state);
 }
 
-static void end_mutex_wait_v1(PSI_mutex_locker* locker, int rc)
+/**
+  Implementation of the mutex instrumentation interface.
+  @sa PSI_v1::end_idle_wait.
+*/
+static void end_idle_wait_v1(PSI_idle_locker* locker)
 {
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
+  PSI_idle_locker_state *state= reinterpret_cast<PSI_idle_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
+
+  register uint flags= state->m_flags;
 
-  if (wait->m_timer_state == TIMER_STATE_STARTED)
+  if (flags & STATE_FLAG_TIMED)
   {
-    wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_TIMED;
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
   }
-  if (flag_events_waits_history)
-    insert_events_waits_history(wait->m_thread, wait);
-  if (flag_events_waits_history_long)
-    insert_events_waits_history_long(wait);
 
-  if (rc == 0)
+  if (flags & STATE_FLAG_THREAD)
   {
-    /* Thread safe: we are protected by the instrumented mutex */
-    PFS_mutex *mutex= pfs_locker->m_target.m_mutex;
-    PFS_single_stat_chain *stat= find_per_thread_mutex_class_wait_stat(wait->m_thread, mutex->m_class);
-    mutex->m_owner= wait->m_thread;
-    mutex->m_last_locked= wait->m_timer_end;
+    PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+    PFS_single_stat *event_name_array;
+    event_name_array= thread->m_instr_class_waits_stats;
+    uint index= global_idle_class.m_event_name_index;
 
-    /* If timed then aggregate stats, else increment the value counts only */
-    if (wait->m_timer_state == TIMER_STATE_TIMED)
+    if (flags & STATE_FLAG_TIMED)
     {
-      ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
-      aggregate_single_stat_chain(&mutex->m_wait_stat, wait_time);
-      aggregate_single_stat_chain(stat, wait_time);
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (timed) */
+      event_name_array[index].aggregate_value(wait_time);
     }
     else
     {
-      increment_single_stat_chain(&mutex->m_wait_stat);
-      increment_single_stat_chain(stat);
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (counted) */
+      event_name_array[index].aggregate_counted();
+    }
+
+    if (flags & STATE_FLAG_EVENT)
+    {
+      PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
+      DBUG_ASSERT(wait != NULL);
+
+      wait->m_timer_end= timer_end;
+      wait->m_end_event_id= thread->m_event_id;
+      if (flag_events_waits_history)
+        insert_events_waits_history(thread, wait);
+      if (flag_events_waits_history_long)
+        insert_events_waits_history_long(wait);
+      thread->m_events_waits_current--;
     }
   }
-  wait->m_thread->m_wait_locker_count--;
 }
 
-static void start_rwlock_rdwait_v1(PSI_rwlock_locker* locker,
-                                   const char *src_file, uint src_line)
+/**
+  Implementation of the mutex instrumentation interface.
+  @sa PSI_v1::end_mutex_wait.
+*/
+static void end_mutex_wait_v1(PSI_mutex_locker* locker, int rc)
 {
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
+  PSI_mutex_locker_state *state= reinterpret_cast<PSI_mutex_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
 
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
-  if (wait->m_timer_state == TIMER_STATE_STARTING)
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
+
+  PFS_mutex *mutex= reinterpret_cast<PFS_mutex *> (state->m_mutex);
+  DBUG_ASSERT(mutex != NULL);
+  PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+
+  register uint flags= state->m_flags;
+
+  if (flags & STATE_FLAG_TIMED)
+  {
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
+    /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (timed) */
+    mutex->m_wait_stat.aggregate_value(wait_time);
+  }
+  else
   {
-    wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_STARTED;
+    /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */
+    mutex->m_wait_stat.aggregate_counted();
+  }
+
+  if (likely(rc == 0))
+  {
+    mutex->m_owner= thread;
+    mutex->m_last_locked= timer_end;
+  }
+
+  if (flags & STATE_FLAG_THREAD)
+  {
+    PFS_single_stat *event_name_array;
+    event_name_array= thread->m_instr_class_waits_stats;
+    uint index= mutex->m_class->m_event_name_index;
+
+    if (flags & STATE_FLAG_TIMED)
+    {
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (timed) */
+      event_name_array[index].aggregate_value(wait_time);
+    }
+    else
+    {
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (counted) */
+      event_name_array[index].aggregate_counted();
+    }
+
+    if (flags & STATE_FLAG_EVENT)
+    {
+      PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
+      DBUG_ASSERT(wait != NULL);
+
+      wait->m_timer_end= timer_end;
+      wait->m_end_event_id= thread->m_event_id;
+      if (flag_events_waits_history)
+        insert_events_waits_history(thread, wait);
+      if (flag_events_waits_history_long)
+        insert_events_waits_history_long(wait);
+      thread->m_events_waits_current--;
+    }
   }
-  wait->m_source_file= src_file;
-  wait->m_source_line= src_line;
 }
 
+/**
+  Implementation of the rwlock instrumentation interface.
+  @sa PSI_v1::end_rwlock_rdwait.
+*/
 static void end_rwlock_rdwait_v1(PSI_rwlock_locker* locker, int rc)
 {
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
+  PSI_rwlock_locker_state *state= reinterpret_cast<PSI_rwlock_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
+
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
 
-  if (wait->m_timer_state == TIMER_STATE_STARTED)
+  PFS_rwlock *rwlock= reinterpret_cast<PFS_rwlock *> (state->m_rwlock);
+  DBUG_ASSERT(rwlock != NULL);
+
+  if (state->m_flags & STATE_FLAG_TIMED)
+  {
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
+    /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (timed) */
+    rwlock->m_wait_stat.aggregate_value(wait_time);
+  }
+  else
   {
-    wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_TIMED;
+    /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */
+    rwlock->m_wait_stat.aggregate_counted();
   }
-  if (flag_events_waits_history)
-    insert_events_waits_history(wait->m_thread, wait);
-  if (flag_events_waits_history_long)
-    insert_events_waits_history_long(wait);
 
   if (rc == 0)
   {
@@ -1693,220 +3488,307 @@ static void end_rwlock_rdwait_v1(PSI_rwlock_locker* locker, int rc)
       The statistics generated are not safe, which is why they are
       just statistics, not facts.
     */
-    PFS_rwlock *rwlock= pfs_locker->m_target.m_rwlock;
-    PFS_single_stat_chain *stat= find_per_thread_rwlock_class_wait_stat(wait->m_thread, rwlock->m_class);
-
     if (rwlock->m_readers == 0)
-      rwlock->m_last_read= wait->m_timer_end;
+      rwlock->m_last_read= timer_end;
     rwlock->m_writer= NULL;
     rwlock->m_readers++;
+  }
+
+  if (state->m_flags & STATE_FLAG_THREAD)
+  {
+    PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+    DBUG_ASSERT(thread != NULL);
+
+    PFS_single_stat *event_name_array;
+    event_name_array= thread->m_instr_class_waits_stats;
+    uint index= rwlock->m_class->m_event_name_index;
 
-    /* If timed then aggregate stats, else increment the value counts only */
-    if (wait->m_timer_state == TIMER_STATE_TIMED)
+    if (state->m_flags & STATE_FLAG_TIMED)
     {
-      ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
-      aggregate_single_stat_chain(&rwlock->m_wait_stat, wait_time);
-      aggregate_single_stat_chain(stat, wait_time);
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (timed) */
+      event_name_array[index].aggregate_value(wait_time);
     }
     else
     {
-      increment_single_stat_chain(&rwlock->m_wait_stat);
-      increment_single_stat_chain(stat);
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (counted) */
+      event_name_array[index].aggregate_counted();
     }
-  }
-  wait->m_thread->m_wait_locker_count--;
-}
-
-static void start_rwlock_wrwait_v1(PSI_rwlock_locker* locker,
-                                   const char *src_file, uint src_line)
-{
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
 
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
-  if (wait->m_timer_state == TIMER_STATE_STARTING)
-  {
-    wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_STARTED;
+    if (state->m_flags & STATE_FLAG_EVENT)
+    {
+      PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
+      DBUG_ASSERT(wait != NULL);
+
+      wait->m_timer_end= timer_end;
+      wait->m_end_event_id= thread->m_event_id;
+      if (flag_events_waits_history)
+        insert_events_waits_history(thread, wait);
+      if (flag_events_waits_history_long)
+        insert_events_waits_history_long(wait);
+      thread->m_events_waits_current--;
+    }
   }
-  wait->m_source_file= src_file;
-  wait->m_source_line= src_line;
 }
 
+/**
+  Implementation of the rwlock instrumentation interface.
+  @sa PSI_v1::end_rwlock_wrwait.
+*/
 static void end_rwlock_wrwait_v1(PSI_rwlock_locker* locker, int rc)
 {
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
+  PSI_rwlock_locker_state *state= reinterpret_cast<PSI_rwlock_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
 
-  if (wait->m_timer_state == TIMER_STATE_STARTED)
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
+
+  PFS_rwlock *rwlock= reinterpret_cast<PFS_rwlock *> (state->m_rwlock);
+  DBUG_ASSERT(rwlock != NULL);
+  PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+
+  if (state->m_flags & STATE_FLAG_TIMED)
+  {
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
+    /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (timed) */
+    rwlock->m_wait_stat.aggregate_value(wait_time);
+  }
+  else
   {
-    wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_TIMED;
+    /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */
+    rwlock->m_wait_stat.aggregate_counted();
   }
-  if (flag_events_waits_history)
-    insert_events_waits_history(wait->m_thread, wait);
-  if (flag_events_waits_history_long)
-    insert_events_waits_history_long(wait);
 
-  if (rc == 0)
+  if (likely(rc == 0))
   {
     /* Thread safe : we are protected by the instrumented rwlock */
-    PFS_rwlock *rwlock= pfs_locker->m_target.m_rwlock;
-    PFS_single_stat_chain *stat= find_per_thread_rwlock_class_wait_stat(wait->m_thread, rwlock->m_class);
-    rwlock->m_writer= wait->m_thread;
-    rwlock->m_last_written= wait->m_timer_end;
+    rwlock->m_writer= thread;
+    rwlock->m_last_written= timer_end;
     /* Reset the readers stats, they could be off */
     rwlock->m_readers= 0;
     rwlock->m_last_read= 0;
+  }
+
+  if (state->m_flags & STATE_FLAG_THREAD)
+  {
+    PFS_single_stat *event_name_array;
+    event_name_array= thread->m_instr_class_waits_stats;
+    uint index= rwlock->m_class->m_event_name_index;
 
-    /* If timed then aggregate stats, else increment the value counts only */
-    if (wait->m_timer_state == TIMER_STATE_TIMED)
+    if (state->m_flags & STATE_FLAG_TIMED)
     {
-      ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
-      aggregate_single_stat_chain(&rwlock->m_wait_stat, wait_time);
-      aggregate_single_stat_chain(stat, wait_time);
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (timed) */
+      event_name_array[index].aggregate_value(wait_time);
     }
     else
     {
-      increment_single_stat_chain(&rwlock->m_wait_stat);
-      increment_single_stat_chain(stat);
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (counted) */
+      event_name_array[index].aggregate_counted();
     }
-  }
-  wait->m_thread->m_wait_locker_count--;
-}
-
-static void start_cond_wait_v1(PSI_cond_locker* locker,
-                               const char *src_file, uint src_line)
-{
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
 
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
-  if (wait->m_timer_state == TIMER_STATE_STARTING)
-  {
-    wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_STARTED;
+    if (state->m_flags & STATE_FLAG_EVENT)
+    {
+      PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
+      DBUG_ASSERT(wait != NULL);
+
+      wait->m_timer_end= timer_end;
+      wait->m_end_event_id= thread->m_event_id;
+      if (flag_events_waits_history)
+        insert_events_waits_history(thread, wait);
+      if (flag_events_waits_history_long)
+        insert_events_waits_history_long(wait);
+      thread->m_events_waits_current--;
+    }
   }
-  wait->m_source_file= src_file;
-  wait->m_source_line= src_line;
 }
 
+/**
+  Implementation of the cond instrumentation interface.
+  @sa PSI_v1::end_cond_wait.
+*/
 static void end_cond_wait_v1(PSI_cond_locker* locker, int rc)
 {
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
+  PSI_cond_locker_state *state= reinterpret_cast<PSI_cond_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
+
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
 
-  if (wait->m_timer_state == TIMER_STATE_STARTED)
+  PFS_cond *cond= reinterpret_cast<PFS_cond *> (state->m_cond);
+  /* PFS_mutex *mutex= reinterpret_cast<PFS_mutex *> (state->m_mutex); */
+
+  if (state->m_flags & STATE_FLAG_TIMED)
+  {
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
+    /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (timed) */
+    cond->m_wait_stat.aggregate_value(wait_time);
+  }
+  else
   {
-    wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_TIMED;
+    /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */
+    cond->m_wait_stat.aggregate_counted();
   }
-  if (flag_events_waits_history)
-    insert_events_waits_history(wait->m_thread, wait);
-  if (flag_events_waits_history_long)
-    insert_events_waits_history_long(wait);
 
-  if (rc == 0)
+  if (state->m_flags & STATE_FLAG_THREAD)
   {
-    /*
-      Not thread safe, race conditions will occur.
-      A first race condition is:
-      - thread 1 waits on cond A
-      - thread 2 waits on cond B
-      threads 1 and 2 compete when updating the same cond A
-      statistics, possibly missing a min / max / sum / count.
-      A second race condition is:
-      - thread 1 waits on cond A
-      - thread 2 destroys cond A
-      - thread 2 or 3 creates cond B in the same condition slot
-      thread 1 will then aggregate statistics about defunct A
-      in condition B.
-      This is accepted, the data will be slightly inaccurate.
-    */
-    PFS_cond *cond= pfs_locker->m_target.m_cond;
-    PFS_single_stat_chain *stat= find_per_thread_cond_class_wait_stat(wait->m_thread, cond->m_class);
+    PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+    DBUG_ASSERT(thread != NULL);
 
-    /* If timed then aggregate stats, else increment the value counts only */
-    if (wait->m_timer_state == TIMER_STATE_TIMED)
+    PFS_single_stat *event_name_array;
+    event_name_array= thread->m_instr_class_waits_stats;
+    uint index= cond->m_class->m_event_name_index;
+
+    if (state->m_flags & STATE_FLAG_TIMED)
     {
-      ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
-      aggregate_single_stat_chain(&cond->m_wait_stat, wait_time);
-      aggregate_single_stat_chain(stat, wait_time);
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (timed) */
+      event_name_array[index].aggregate_value(wait_time);
     }
     else
     {
-      increment_single_stat_chain(&cond->m_wait_stat);
-      increment_single_stat_chain(stat);
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (counted) */
+      event_name_array[index].aggregate_counted();
+    }
+
+    if (state->m_flags & STATE_FLAG_EVENT)
+    {
+      PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
+      DBUG_ASSERT(wait != NULL);
+
+      wait->m_timer_end= timer_end;
+      wait->m_end_event_id= thread->m_event_id;
+      if (flag_events_waits_history)
+        insert_events_waits_history(thread, wait);
+      if (flag_events_waits_history_long)
+        insert_events_waits_history_long(wait);
+      thread->m_events_waits_current--;
     }
   }
-  wait->m_thread->m_wait_locker_count--;
 }
 
-static void start_table_wait_v1(PSI_table_locker* locker,
-                                const char *src_file, uint src_line)
+/**
+  Implementation of the table instrumentation interface.
+  @sa PSI_v1::end_table_io_wait.
+*/
+static void end_table_io_wait_v1(PSI_table_locker* locker)
 {
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
+  PSI_table_locker_state *state= reinterpret_cast<PSI_table_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
+
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
+
+  PFS_table *table= reinterpret_cast<PFS_table *> (state->m_table);
+  DBUG_ASSERT(table != NULL);
+
+  PFS_single_stat *stat;
 
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
-  if (wait->m_timer_state == TIMER_STATE_STARTING)
+  DBUG_ASSERT((state->m_index < table->m_share->m_key_count) ||
+              (state->m_index == MAX_KEY));
+
+  switch (state->m_io_operation)
   {
-    wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_STARTED;
+  case PSI_TABLE_FETCH_ROW:
+    stat= & table->m_table_stat.m_index_stat[state->m_index].m_fetch;
+    break;
+  case PSI_TABLE_WRITE_ROW:
+    stat= & table->m_table_stat.m_index_stat[state->m_index].m_insert;
+    break;
+  case PSI_TABLE_UPDATE_ROW:
+    stat= & table->m_table_stat.m_index_stat[state->m_index].m_update;
+    break;
+  case PSI_TABLE_DELETE_ROW:
+    stat= & table->m_table_stat.m_index_stat[state->m_index].m_delete;
+    break;
+  default:
+    DBUG_ASSERT(false);
+    stat= NULL;
+    break;
   }
-  wait->m_source_file= src_file;
-  wait->m_source_line= src_line;
-  wait->m_operation= OPERATION_TYPE_LOCK;
-  PFS_table_share *share= pfs_locker->m_target.m_table->m_share;
-  wait->m_schema_name= share->m_schema_name;
-  wait->m_schema_name_length= share->m_schema_name_length;
-  wait->m_object_name= share->m_table_name;
-  wait->m_object_name_length= share->m_table_name_length;
-}
 
-static void end_table_wait_v1(PSI_table_locker* locker)
-{
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
+  register uint flags= state->m_flags;
+
+  if (flags & STATE_FLAG_TIMED)
+  {
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
+    stat->aggregate_value(wait_time);
+  }
+  else
+  {
+    stat->aggregate_counted();
+  }
 
-  if (wait->m_timer_state == TIMER_STATE_STARTED)
+  if (flags & STATE_FLAG_EVENT)
   {
-    wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_TIMED;
+    DBUG_ASSERT(flags & STATE_FLAG_THREAD);
+    PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+    DBUG_ASSERT(thread != NULL);
+
+    PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
+    DBUG_ASSERT(wait != NULL);
+
+    wait->m_timer_end= timer_end;
+    wait->m_end_event_id= thread->m_event_id;
+    if (flag_events_waits_history)
+      insert_events_waits_history(thread, wait);
+    if (flag_events_waits_history_long)
+      insert_events_waits_history_long(wait);
+    thread->m_events_waits_current--;
   }
-  if (flag_events_waits_history)
-    insert_events_waits_history(wait->m_thread, wait);
-  if (flag_events_waits_history_long)
-    insert_events_waits_history_long(wait);
 
-  PFS_table *table= pfs_locker->m_target.m_table;
+  table->m_has_io_stats= true;
+}
+
+/**
+  Implementation of the table instrumentation interface.
+  @sa PSI_v1::end_table_lock_wait.
+*/
+static void end_table_lock_wait_v1(PSI_table_locker* locker)
+{
+  PSI_table_locker_state *state= reinterpret_cast<PSI_table_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
+
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
+
+  PFS_table *table= reinterpret_cast<PFS_table *> (state->m_table);
+  DBUG_ASSERT(table != NULL);
 
-  /* If timed then aggregate stats, else increment the value counts only */
-  if (wait->m_timer_state == TIMER_STATE_TIMED)
+  PFS_single_stat *stat= & table->m_table_stat.m_lock_stat.m_stat[state->m_index];
+
+  register uint flags= state->m_flags;
+
+  if (flags & STATE_FLAG_TIMED)
   {
-    ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
-    aggregate_single_stat_chain(&table->m_wait_stat, wait_time);
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
+    stat->aggregate_value(wait_time);
   }
   else
   {
-    increment_single_stat_chain(&table->m_wait_stat);
+    stat->aggregate_counted();
   }
 
-  /*
-    There is currently no per table and per thread aggregation.
-    The number of tables in the application is arbitrary, and may be high.
-    The number of slots per thread to hold aggregates is fixed,
-    and is constrained by memory.
-    Implementing a per thread and per table aggregate has not been
-    decided yet.
-    If it's implemented, it's likely that the user will have to specify,
-    per table name, if the aggregate per thread is to be computed or not.
-    This will mean a SETUP_ table.
-  */
-  wait->m_thread->m_wait_locker_count--;
+  if (flags & STATE_FLAG_EVENT)
+  {
+    DBUG_ASSERT(flags & STATE_FLAG_THREAD);
+    PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+    DBUG_ASSERT(thread != NULL);
+
+    PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
+    DBUG_ASSERT(wait != NULL);
+
+    wait->m_timer_end= timer_end;
+    wait->m_end_event_id= thread->m_event_id;
+    if (flag_events_waits_history)
+      insert_events_waits_history(thread, wait);
+    if (flag_events_waits_history_long)
+      insert_events_waits_history_long(wait);
+    thread->m_events_waits_current--;
+  }
+
+  table->m_has_lock_stats= true;
 }
 
 static void start_file_wait_v1(PSI_file_locker *locker,
@@ -1917,34 +3799,45 @@ static void start_file_wait_v1(PSI_file_locker *locker,
 static void end_file_wait_v1(PSI_file_locker *locker,
                              size_t count);
 
+/**
+  Implementation of the file instrumentation interface.
+  @sa PSI_v1::start_file_open_wait.
+*/
 static PSI_file* start_file_open_wait_v1(PSI_file_locker *locker,
                                          const char *src_file,
                                          uint src_line)
 {
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
+  PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
 
   start_file_wait_v1(locker, 0, src_file, src_line);
 
-  PFS_file *pfs_file= pfs_locker->m_target.m_file;
-  return reinterpret_cast<PSI_file*> (pfs_file);
+  return state->m_file;
 }
 
+/**
+  Implementation of the file instrumentation interface.
+  @sa PSI_v1::end_file_open_wait.
+*/
 static void end_file_open_wait_v1(PSI_file_locker *locker)
 {
   end_file_wait_v1(locker, 0);
 }
 
+/**
+  Implementation of the file instrumentation interface.
+  @sa PSI_v1::end_file_open_wait_and_bind_to_descriptor.
+*/
 static void end_file_open_wait_and_bind_to_descriptor_v1
   (PSI_file_locker *locker, File file)
 {
   int index= (int) file;
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
+  PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
 
   end_file_wait_v1(locker, 0);
 
-  PFS_file *pfs_file= pfs_locker->m_target.m_file;
+  PFS_file *pfs_file= reinterpret_cast<PFS_file*> (state->m_file);
   DBUG_ASSERT(pfs_file != NULL);
 
   if (likely(index >= 0))
@@ -1955,93 +3848,1031 @@ static void end_file_open_wait_and_bind_to_descriptor_v1
       file_handle_lost++;
   }
   else
+  {
     release_file(pfs_file);
+  }
 }
 
+/**
+  Implementation of the file instrumentation interface.
+  @sa PSI_v1::start_file_wait.
+*/
 static void start_file_wait_v1(PSI_file_locker *locker,
                                size_t count,
                                const char *src_file,
                                uint src_line)
 {
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
+  ulonglong timer_start= 0;
+  PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
+
+  register uint flags= state->m_flags;
 
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
-  if (wait->m_timer_state == TIMER_STATE_STARTING)
+  if (flags & STATE_FLAG_TIMED)
   {
-    wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_STARTED;
+    timer_start= get_timer_raw_value_and_function(wait_timer, & state->m_timer);
+    state->m_timer_start= timer_start;
+  }
+
+  if (flags & STATE_FLAG_EVENT)
+  {
+    PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
+    DBUG_ASSERT(wait != NULL);
+
+    wait->m_timer_start= timer_start;
+    wait->m_source_file= src_file;
+    wait->m_source_line= src_line;
+    wait->m_number_of_bytes= count;
   }
-  wait->m_source_file= src_file;
-  wait->m_source_line= src_line;
-  wait->m_number_of_bytes= count;
 }
 
+/**
+  Implementation of the file instrumentation interface.
+  @sa PSI_v1::end_file_wait.
+*/
 static void end_file_wait_v1(PSI_file_locker *locker,
-                             size_t count)
+                             size_t byte_count)
 {
-  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
-  DBUG_ASSERT(pfs_locker != NULL);
-  PFS_events_waits *wait= &pfs_locker->m_waits_current;
-
-  wait->m_number_of_bytes= count;
-  if (wait->m_timer_state == TIMER_STATE_STARTED)
+  PSI_file_locker_state *state= reinterpret_cast<PSI_file_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
+  PFS_file *file= reinterpret_cast<PFS_file *> (state->m_file);
+  DBUG_ASSERT(file != NULL);
+  PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
+  PFS_byte_stat *byte_stat;
+  register uint flags= state->m_flags;
+  size_t bytes= ((int)byte_count > -1 ? byte_count : 0);
+
+  switch (state->m_operation)
   {
-    wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
-    wait->m_timer_state= TIMER_STATE_TIMED;
+    /* Group read operations */
+    case PSI_FILE_READ:
+      byte_stat= &file->m_file_stat.m_io_stat.m_read;
+      break;
+    /* Group write operations */
+    case PSI_FILE_WRITE:
+      byte_stat= &file->m_file_stat.m_io_stat.m_write;
+      break;
+    /* Group remaining operations as miscellaneous */
+    case PSI_FILE_CREATE:
+    case PSI_FILE_CREATE_TMP:
+    case PSI_FILE_OPEN:
+    case PSI_FILE_STREAM_OPEN:
+    case PSI_FILE_STREAM_CLOSE:
+    case PSI_FILE_SEEK:
+    case PSI_FILE_TELL:
+    case PSI_FILE_FLUSH:
+    case PSI_FILE_FSTAT:
+    case PSI_FILE_CHSIZE:
+    case PSI_FILE_DELETE:
+    case PSI_FILE_RENAME:
+    case PSI_FILE_SYNC:
+    case PSI_FILE_STAT:
+    case PSI_FILE_CLOSE:
+      byte_stat= &file->m_file_stat.m_io_stat.m_misc;
+      break;
+    default:
+      DBUG_ASSERT(false);
+      byte_stat= NULL;
+      break;
   }
-  if (flag_events_waits_history)
-    insert_events_waits_history(wait->m_thread, wait);
-  if (flag_events_waits_history_long)
-    insert_events_waits_history_long(wait);
 
-  PFS_file *file= pfs_locker->m_target.m_file;
-  PFS_single_stat_chain *stat= find_per_thread_file_class_wait_stat(wait->m_thread, file->m_class);
-
-  /* If timed then aggregate stats, else increment the value counts only */
-  if (wait->m_timer_state == TIMER_STATE_TIMED)
+  /* Aggregation for EVENTS_WAITS_SUMMARY_BY_INSTANCE */
+  if (flags & STATE_FLAG_TIMED)
   {
-    ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
-    aggregate_single_stat_chain(&file->m_wait_stat, wait_time);
-    aggregate_single_stat_chain(stat, wait_time);
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
+    /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (timed) */
+    byte_stat->aggregate(wait_time, bytes);
   }
   else
   {
-    increment_single_stat_chain(&file->m_wait_stat);
-    increment_single_stat_chain(stat);
+    /* Aggregate to EVENTS_WAITS_SUMMARY_BY_INSTANCE (counted) */
+    byte_stat->aggregate_counted(bytes);
   }
 
-  PFS_file_class *klass= file->m_class;
+  if (flags & STATE_FLAG_THREAD)
+  {
+    DBUG_ASSERT(thread != NULL);
+
+    PFS_single_stat *event_name_array;
+    event_name_array= thread->m_instr_class_waits_stats;
+    uint index= file->m_class->m_event_name_index;
+
+    if (flags & STATE_FLAG_TIMED)
+    {
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (timed) */
+      event_name_array[index].aggregate_value(wait_time);
+    }
+    else
+    {
+      /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (counted) */
+      event_name_array[index].aggregate_counted();
+    }
+
+    if (state->m_flags & STATE_FLAG_EVENT)
+    {
+      PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
+      DBUG_ASSERT(wait != NULL);
+
+      wait->m_timer_end= timer_end;
+      wait->m_number_of_bytes= bytes;
+      wait->m_end_event_id= thread->m_event_id;
+
+      if (flag_events_waits_history)
+        insert_events_waits_history(thread, wait);
+      if (flag_events_waits_history_long)
+        insert_events_waits_history_long(wait);
+      thread->m_events_waits_current--;
+    }
+  }
 
-  switch(wait->m_operation)
+  /* Release or destroy the file if necessary */
+  switch(state->m_operation)
   {
-  case OPERATION_TYPE_FILEREAD:
-    file->m_file_stat.m_count_read++;
-    file->m_file_stat.m_read_bytes+= count;
-    klass->m_file_stat.m_count_read++;
-    klass->m_file_stat.m_read_bytes+= count;
-    break;
-  case OPERATION_TYPE_FILEWRITE:
-    file->m_file_stat.m_count_write++;
-    file->m_file_stat.m_write_bytes+= count;
-    klass->m_file_stat.m_count_write++;
-    klass->m_file_stat.m_write_bytes+= count;
+  case PSI_FILE_CLOSE:
+  case PSI_FILE_STREAM_CLOSE:
+  case PSI_FILE_STAT:
+    release_file(file);
     break;
-  case OPERATION_TYPE_FILECLOSE:
-  case OPERATION_TYPE_FILESTREAMCLOSE:
-  case OPERATION_TYPE_FILESTAT:
-    release_file(pfs_locker->m_target.m_file);
-    break;
-  case OPERATION_TYPE_FILEDELETE:
-    destroy_file(wait->m_thread, pfs_locker->m_target.m_file);
+  case PSI_FILE_DELETE:
+    DBUG_ASSERT(thread != NULL);
+    destroy_file(thread, file);
     break;
   default:
     break;
   }
+}
+
+static void start_stage_v1(PSI_stage_key key, const char *src_file, int src_line)
+{
+  ulonglong timer_value= 0;
+
+  if (! flag_global_instrumentation)
+    return;
 
-  wait->m_thread->m_wait_locker_count--;
+  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  if (unlikely(pfs_thread == NULL))
+    return;
+
+  if (flag_thread_instrumentation && ! pfs_thread->m_enabled)
+    return;
+
+  PFS_events_stages *pfs= & pfs_thread->m_stage_current;
+  PFS_events_waits *child_wait= & pfs_thread->m_events_waits_stack[0];
+  PFS_events_statements *parent_statement= & pfs_thread->m_statement_stack[0];
+
+  PFS_instr_class *old_class= pfs->m_class;
+  if (likely(old_class != NULL))
+  {
+    PFS_stage_stat *event_name_array;
+    event_name_array= pfs_thread->m_instr_class_stages_stats;
+    uint index= old_class->m_event_name_index;
+
+    /* Finish old event */
+    if (old_class->m_timed)
+    {
+      timer_value= get_timer_raw_value(stage_timer);;
+      pfs->m_timer_end= timer_value;
+
+      /* Aggregate to EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME (timed) */
+      ulonglong stage_time= timer_value - pfs->m_timer_start;
+      event_name_array[index].aggregate_value(stage_time);
+    }
+    else
+    {
+      /* Aggregate to EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME (counted) */
+      event_name_array[index].aggregate_counted();
+    }
+
+    if (flag_events_stages_current)
+    {
+      pfs->m_end_event_id= pfs_thread->m_event_id;
+      if (flag_events_stages_history)
+        insert_events_stages_history(pfs_thread, pfs);
+      if (flag_events_stages_history_long)
+        insert_events_stages_history_long(pfs);
+    }
+
+    /* This stage event is now complete. */
+    pfs->m_class= NULL;
+
+    /* New waits will now be attached directly to the parent statement. */
+    child_wait->m_event_id= parent_statement->m_event_id;
+    child_wait->m_event_type= parent_statement->m_event_type;
+    /* See below for new stages, that may overwrite this. */
+  }
+
+  /* Start new event */
+
+  PFS_stage_class *new_klass= find_stage_class(key);
+  if (unlikely(new_klass == NULL))
+    return;
+
+  if (! new_klass->m_enabled)
+    return;
+
+  pfs->m_class= new_klass;
+  if (new_klass->m_timed)
+  {
+    /*
+      Do not call the timer again if we have a
+      TIMER_END for the previous stage already.
+    */
+    if (timer_value == 0)
+      timer_value= get_timer_raw_value(stage_timer);
+    pfs->m_timer_start= timer_value;
+  }
+  else
+    pfs->m_timer_start= 0;
+  pfs->m_timer_end= 0;
+
+  if (flag_events_stages_current)
+  {
+    /* m_thread_internal_id is immutable and already set */
+    DBUG_ASSERT(pfs->m_thread_internal_id == pfs_thread->m_thread_internal_id);
+    pfs->m_event_id= pfs_thread->m_event_id++;
+    pfs->m_end_event_id= 0;
+    pfs->m_source_file= src_file;
+    pfs->m_source_line= src_line;
+
+    /* New wait events will have this new stage as parent. */
+    child_wait->m_event_id= pfs->m_event_id;
+    child_wait->m_event_type= EVENT_TYPE_STAGE;
+  }
+}
+
+static void end_stage_v1()
+{
+  ulonglong timer_value= 0;
+
+  if (! flag_global_instrumentation)
+    return;
+
+  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  if (unlikely(pfs_thread == NULL))
+    return;
+
+  if (flag_thread_instrumentation && ! pfs_thread->m_enabled)
+    return;
+
+  PFS_events_stages *pfs= & pfs_thread->m_stage_current;
+
+  PFS_instr_class *old_class= pfs->m_class;
+  if (likely(old_class != NULL))
+  {
+    PFS_stage_stat *event_name_array;
+    event_name_array= pfs_thread->m_instr_class_stages_stats;
+    uint index= old_class->m_event_name_index;
+
+    /* Finish old event */
+    if (old_class->m_timed)
+    {
+      timer_value= get_timer_raw_value(stage_timer);;
+      pfs->m_timer_end= timer_value;
+
+      /* Aggregate to EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME (timed) */
+      ulonglong stage_time= timer_value - pfs->m_timer_start;
+      event_name_array[index].aggregate_value(stage_time);
+    }
+    else
+    {
+      /* Aggregate to EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME (counted) */
+      event_name_array[index].aggregate_counted();
+    }
+
+    if (flag_events_stages_current)
+    {
+      pfs->m_end_event_id= pfs_thread->m_event_id;
+      if (flag_events_stages_history)
+        insert_events_stages_history(pfs_thread, pfs);
+      if (flag_events_stages_history_long)
+        insert_events_stages_history_long(pfs);
+    }
+
+    /* New waits will now be attached directly to the parent statement. */
+    PFS_events_waits *child_wait= & pfs_thread->m_events_waits_stack[0];
+    PFS_events_statements *parent_statement= & pfs_thread->m_statement_stack[0];
+    child_wait->m_event_id= parent_statement->m_event_id;
+    child_wait->m_event_type= parent_statement->m_event_type;
+
+    /* This stage is completed */
+    pfs->m_class= NULL;
+  }
+}
+
+static PSI_statement_locker*
+get_thread_statement_locker_v1(PSI_statement_locker_state *state,
+                               PSI_statement_key key)
+{
+  DBUG_ASSERT(state != NULL);
+  if (! flag_global_instrumentation)
+    return NULL;
+  PFS_statement_class *klass= find_statement_class(key);
+  if (unlikely(klass == NULL))
+    return NULL;
+  if (! klass->m_enabled)
+    return NULL;
+
+  register uint flags;
+
+  if (flag_thread_instrumentation)
+  {
+    PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+    if (unlikely(pfs_thread == NULL))
+      return NULL;
+    if (! pfs_thread->m_enabled)
+      return NULL;
+    state->m_thread= reinterpret_cast<PSI_thread *> (pfs_thread);
+    flags= STATE_FLAG_THREAD;
+
+    if (klass->m_timed)
+      flags|= STATE_FLAG_TIMED;
+
+    if (flag_events_statements_current)
+    {
+      ulonglong event_id= pfs_thread->m_event_id++;
+
+      if (pfs_thread->m_events_statements_count >= statement_stack_max)
+      {
+        return NULL;
+      }
+
+      PFS_events_statements *pfs= & pfs_thread->m_statement_stack[pfs_thread->m_events_statements_count];
+      /* m_thread_internal_id is immutable and already set */
+      DBUG_ASSERT(pfs->m_thread_internal_id == pfs_thread->m_thread_internal_id);
+      pfs->m_event_id= event_id;
+      pfs->m_end_event_id= 0;
+      pfs->m_class= klass;
+      pfs->m_timer_start= 0;
+      pfs->m_timer_end= 0;
+      pfs->m_lock_time= 0;
+      pfs->m_current_schema_name_length= 0;
+      pfs->m_sqltext_length= 0;
+
+      pfs->m_message_text[0]= '\0';
+      pfs->m_sql_errno= 0;
+      pfs->m_sqlstate[0]= '\0';
+      pfs->m_error_count= 0;
+      pfs->m_warning_count= 0;
+      pfs->m_rows_affected= 0;
+
+      pfs->m_rows_sent= 0;
+      pfs->m_rows_examined= 0;
+      pfs->m_created_tmp_disk_tables= 0;
+      pfs->m_created_tmp_tables= 0;
+      pfs->m_select_full_join= 0;
+      pfs->m_select_full_range_join= 0;
+      pfs->m_select_range= 0;
+      pfs->m_select_range_check= 0;
+      pfs->m_select_scan= 0;
+      pfs->m_sort_merge_passes= 0;
+      pfs->m_sort_range= 0;
+      pfs->m_sort_rows= 0;
+      pfs->m_sort_scan= 0;
+      pfs->m_no_index_used= 0;
+      pfs->m_no_good_index_used= 0;
+      digest_reset(& pfs->m_digest_storage);
+
+      /* New stages will have this statement as parent */
+      PFS_events_stages *child_stage= & pfs_thread->m_stage_current;
+      child_stage->m_nesting_event_id= event_id;
+      child_stage->m_nesting_event_type= EVENT_TYPE_STATEMENT;
+
+      /* New waits will have this statement as parent, if no stage is instrumented */
+      PFS_events_waits *child_wait= & pfs_thread->m_events_waits_stack[0];
+      child_wait->m_nesting_event_id= event_id;
+      child_wait->m_nesting_event_type= EVENT_TYPE_STATEMENT;
+
+      state->m_statement= pfs;
+      flags|= STATE_FLAG_EVENT;
+
+      pfs_thread->m_events_statements_count++;
+    }
+  }
+  else
+  {
+    if (klass->m_timed)
+      flags= STATE_FLAG_TIMED;
+    else
+      flags= 0;
+  }
+
+  if (flag_statements_digest)
+  {
+    flags|= STATE_FLAG_DIGEST;
+    state->m_digest_state.m_last_id_index= 0;
+    digest_reset(& state->m_digest_state.m_digest_storage);
+  }
+
+  state->m_discarded= false;
+  state->m_class= klass;
+  state->m_flags= flags;
+
+  state->m_lock_time= 0;
+  state->m_rows_sent= 0;
+  state->m_rows_examined= 0;
+  state->m_created_tmp_disk_tables= 0;
+  state->m_created_tmp_tables= 0;
+  state->m_select_full_join= 0;
+  state->m_select_full_range_join= 0;
+  state->m_select_range= 0;
+  state->m_select_range_check= 0;
+  state->m_select_scan= 0;
+  state->m_sort_merge_passes= 0;
+  state->m_sort_range= 0;
+  state->m_sort_rows= 0;
+  state->m_sort_scan= 0;
+  state->m_no_index_used= 0;
+  state->m_no_good_index_used= 0;
+
+  return reinterpret_cast<PSI_statement_locker*> (state);
+}
+
+static PSI_statement_locker*
+refine_statement_v1(PSI_statement_locker *locker,
+                    PSI_statement_key key)
+{
+  PSI_statement_locker_state *state= reinterpret_cast<PSI_statement_locker_state*> (locker);
+  if (state == NULL)
+    return NULL;
+  DBUG_ASSERT(state->m_class != NULL);
+  PFS_statement_class *klass;
+  /* Only refine statements for mutable instrumentation */
+  klass= reinterpret_cast<PFS_statement_class*> (state->m_class);
+  DBUG_ASSERT(klass->m_flags & PSI_FLAG_MUTABLE);
+  klass= find_statement_class(key);
+  if (unlikely(klass == NULL))
+  {
+    /* FIXME : pop statement stack */
+    state->m_discarded= true;
+    return NULL;
+  }
+  if (! klass->m_enabled)
+  {
+    /* FIXME : pop statement stack */
+    state->m_discarded= true;
+    return NULL;
+  }
+
+  register uint flags= state->m_flags;
+
+  if ((flags & STATE_FLAG_TIMED) && ! klass->m_timed)
+    flags= flags & ~STATE_FLAG_TIMED;
+
+  if (flags & STATE_FLAG_EVENT)
+  {
+    PFS_events_statements *pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement);
+    DBUG_ASSERT(pfs != NULL);
+
+    /* mutate EVENTS_STATEMENTS_CURRENT.EVENT_NAME */
+    pfs->m_class= klass;
+  }
+
+  state->m_class= klass;
+  state->m_flags= flags;
+  return reinterpret_cast<PSI_statement_locker*> (state);
+}
+
+static void start_statement_v1(PSI_statement_locker *locker,
+                               const char *db, uint db_len,
+                               const char *src_file, uint src_line)
+{
+  PSI_statement_locker_state *state= reinterpret_cast<PSI_statement_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
+
+  register uint flags= state->m_flags;
+  ulonglong timer_start= 0;
+
+  if (flags & STATE_FLAG_TIMED)
+  {
+    timer_start= get_timer_raw_value_and_function(statement_timer, & state->m_timer);
+    state->m_timer_start= timer_start;
+  }
+
+  if (flags & STATE_FLAG_EVENT)
+  {
+    PFS_events_statements *pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement);
+    DBUG_ASSERT(pfs != NULL);
+
+    pfs->m_timer_start= timer_start;
+    pfs->m_source_file= src_file;
+    pfs->m_source_line= src_line;
+
+    DBUG_ASSERT(db_len <= sizeof(pfs->m_current_schema_name));
+    if (db_len > 0)
+      memcpy(pfs->m_current_schema_name, db, db_len);
+    pfs->m_current_schema_name_length= db_len;
+  }
+}
+
+static void set_statement_text_v1(PSI_statement_locker *locker,
+                                  const char *text, uint text_len)
+{
+  PSI_statement_locker_state *state= reinterpret_cast<PSI_statement_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
+
+  if (state->m_discarded)
+    return;
+
+  if (state->m_flags & STATE_FLAG_EVENT)
+  {
+    PFS_events_statements *pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement);
+    DBUG_ASSERT(pfs != NULL);
+    if (text_len > sizeof (pfs->m_sqltext))
+      text_len= sizeof(pfs->m_sqltext);
+    if (text_len)
+      memcpy(pfs->m_sqltext, text, text_len);
+    pfs->m_sqltext_length= text_len;
+  }
+
+  return;
+}
+
+#define SET_STATEMENT_ATTR_BODY(LOCKER, ATTR, VALUE)                    \
+  PSI_statement_locker_state *state;                                    \
+  state= reinterpret_cast<PSI_statement_locker_state*> (LOCKER);        \
+  if (unlikely(state == NULL))                                          \
+    return;                                                             \
+  if (state->m_discarded)                                               \
+    return;                                                             \
+  state->ATTR= VALUE;                                                   \
+  if (state->m_flags & STATE_FLAG_EVENT)                                \
+  {                                                                     \
+    PFS_events_statements *pfs;                                         \
+    pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement); \
+    DBUG_ASSERT(pfs != NULL);                                           \
+    pfs->ATTR= VALUE;                                                   \
+  }                                                                     \
+  return;
+
+#define INC_STATEMENT_ATTR_BODY(LOCKER, ATTR, VALUE)                    \
+  PSI_statement_locker_state *state;                                    \
+  state= reinterpret_cast<PSI_statement_locker_state*> (LOCKER);        \
+  if (unlikely(state == NULL))                                          \
+    return;                                                             \
+  if (state->m_discarded)                                               \
+    return;                                                             \
+  state->ATTR+= VALUE;                                                  \
+  if (state->m_flags & STATE_FLAG_EVENT)                                \
+  {                                                                     \
+    PFS_events_statements *pfs;                                         \
+    pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement); \
+    DBUG_ASSERT(pfs != NULL);                                           \
+    pfs->ATTR+= VALUE;                                                  \
+  }                                                                     \
+  return;
+
+static void set_statement_lock_time_v1(PSI_statement_locker *locker,
+                                       ulonglong count)
+{
+  SET_STATEMENT_ATTR_BODY(locker, m_lock_time, count);
+}
+
+static void set_statement_rows_sent_v1(PSI_statement_locker *locker,
+                                       ulonglong count)
+{
+  SET_STATEMENT_ATTR_BODY(locker, m_rows_sent, count);
+}
+
+static void set_statement_rows_examined_v1(PSI_statement_locker *locker,
+                                           ulonglong count)
+{
+  SET_STATEMENT_ATTR_BODY(locker, m_rows_examined, count);
+}
+
+static void inc_statement_created_tmp_disk_tables_v1(PSI_statement_locker *locker,
+                                                    ulong count)
+{
+  INC_STATEMENT_ATTR_BODY(locker, m_created_tmp_disk_tables, count);
+}
+
+static void inc_statement_created_tmp_tables_v1(PSI_statement_locker *locker,
+                                                ulong count)
+{
+  INC_STATEMENT_ATTR_BODY(locker, m_created_tmp_tables, count);
 }
 
+static void inc_statement_select_full_join_v1(PSI_statement_locker *locker,
+                                              ulong count)
+{
+  INC_STATEMENT_ATTR_BODY(locker, m_select_full_join, count);
+}
+
+static void inc_statement_select_full_range_join_v1(PSI_statement_locker *locker,
+                                                    ulong count)
+{
+  INC_STATEMENT_ATTR_BODY(locker, m_select_full_range_join, count);
+}
+
+static void inc_statement_select_range_v1(PSI_statement_locker *locker,
+                                          ulong count)
+{
+  INC_STATEMENT_ATTR_BODY(locker, m_select_range, count);
+}
+
+static void inc_statement_select_range_check_v1(PSI_statement_locker *locker,
+                                                ulong count)
+{
+  INC_STATEMENT_ATTR_BODY(locker, m_select_range_check, count);
+}
+
+static void inc_statement_select_scan_v1(PSI_statement_locker *locker,
+                                         ulong count)
+{
+  INC_STATEMENT_ATTR_BODY(locker, m_select_scan, count);
+}
+
+static void inc_statement_sort_merge_passes_v1(PSI_statement_locker *locker,
+                                               ulong count)
+{
+  INC_STATEMENT_ATTR_BODY(locker, m_sort_merge_passes, count);
+}
+
+static void inc_statement_sort_range_v1(PSI_statement_locker *locker,
+                                        ulong count)
+{
+  INC_STATEMENT_ATTR_BODY(locker, m_sort_range, count);
+}
+
+static void inc_statement_sort_rows_v1(PSI_statement_locker *locker,
+                                       ulong count)
+{
+  INC_STATEMENT_ATTR_BODY(locker, m_sort_rows, count);
+}
+
+static void inc_statement_sort_scan_v1(PSI_statement_locker *locker,
+                                       ulong count)
+{
+  INC_STATEMENT_ATTR_BODY(locker, m_sort_scan, count);
+}
+
+static void set_statement_no_index_used_v1(PSI_statement_locker *locker)
+{
+  SET_STATEMENT_ATTR_BODY(locker, m_no_index_used, 1);
+}
+
+static void set_statement_no_good_index_used_v1(PSI_statement_locker *locker)
+{
+  SET_STATEMENT_ATTR_BODY(locker, m_no_good_index_used, 1);
+}
+
+static void end_statement_v1(PSI_statement_locker *locker, void *stmt_da)
+{
+  PSI_statement_locker_state *state= reinterpret_cast<PSI_statement_locker_state*> (locker);
+  Diagnostics_area *da= reinterpret_cast<Diagnostics_area*> (stmt_da);
+  DBUG_ASSERT(state != NULL);
+  DBUG_ASSERT(da != NULL);
+
+  if (state->m_discarded)
+    return;
+
+  PFS_statement_class *klass= reinterpret_cast<PFS_statement_class *> (state->m_class);
+  DBUG_ASSERT(klass != NULL);
+
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
+  register uint flags= state->m_flags;
+
+  if (flags & STATE_FLAG_TIMED)
+  {
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
+  }
+
+  PFS_statement_stat *event_name_array;
+  uint index= klass->m_event_name_index;
+  PFS_statement_stat *stat;
+  
+  /*
+   Capture statement stats by digest.
+  */
+  PSI_digest_storage *digest_storage= NULL;
+  PFS_statement_stat *digest_stat= NULL;
+
+  if (flags & STATE_FLAG_THREAD)
+  {
+    PFS_thread *thread= reinterpret_cast<PFS_thread *> (state->m_thread);
+    DBUG_ASSERT(thread != NULL);
+    event_name_array= thread->m_instr_class_statements_stats;
+    /* Aggregate to EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME */
+    stat= & event_name_array[index];
+
+    if (flags & STATE_FLAG_DIGEST)
+    {
+      digest_storage= &state->m_digest_state.m_digest_storage;
+
+      /* 
+        Populate PFS_statements_digest_stat with computed digest information.
+      */
+      digest_stat= find_or_create_digest(thread, digest_storage);
+    }
+
+    if (flags & STATE_FLAG_EVENT)
+    {
+      PFS_events_statements *pfs= reinterpret_cast<PFS_events_statements*> (state->m_statement);
+      DBUG_ASSERT(pfs != NULL);
+
+      switch(da->status())
+      {
+        case Diagnostics_area::DA_EMPTY:
+          break;
+        case Diagnostics_area::DA_OK:
+          memcpy(pfs->m_message_text, da->message(), MYSQL_ERRMSG_SIZE);
+          pfs->m_message_text[MYSQL_ERRMSG_SIZE]= 0;
+          pfs->m_rows_affected= da->affected_rows();
+          pfs->m_warning_count= da->statement_warn_count();
+          memcpy(pfs->m_sqlstate, "00000", SQLSTATE_LENGTH);
+          break;
+        case Diagnostics_area::DA_EOF:
+          pfs->m_warning_count= da->statement_warn_count();
+          break;
+        case Diagnostics_area::DA_ERROR:
+          memcpy(pfs->m_message_text, da->message(), MYSQL_ERRMSG_SIZE);
+          pfs->m_message_text[MYSQL_ERRMSG_SIZE]= 0;
+          pfs->m_sql_errno= da->sql_errno();
+          memcpy(pfs->m_sqlstate, da->get_sqlstate(), SQLSTATE_LENGTH);
+          break;
+        case Diagnostics_area::DA_DISABLED:
+          break;
+      }
+
+      pfs->m_timer_end= timer_end;
+      pfs->m_end_event_id= thread->m_event_id;
+
+      if (flags & STATE_FLAG_DIGEST)
+      {
+        /*
+          The following columns in events_statement_current:
+          - DIGEST,
+          - DIGEST_TEXT
+          are computed from the digest storage.
+        */
+        digest_copy(& pfs->m_digest_storage, digest_storage);
+      }
+
+      if (flag_events_statements_history)
+        insert_events_statements_history(thread, pfs);
+      if (flag_events_statements_history_long)
+        insert_events_statements_history_long(pfs);
+
+      DBUG_ASSERT(thread->m_events_statements_count > 0);
+      thread->m_events_statements_count--;
+    }
+  }
+  else
+  {
+    if (flags & STATE_FLAG_DIGEST)
+    {
+      PFS_thread *thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+
+      /* An instrumented thread is required, for LF_PINS. */
+      if (thread != NULL)
+      {
+        /* Set digest stat. */
+        digest_storage= &state->m_digest_state.m_digest_storage;
+
+        /* 
+          Populate PFS_statements_digest_stat with computed digest information.
+        */
+        digest_stat= find_or_create_digest(thread, digest_storage);
+      }
+    }
+
+    event_name_array= global_instr_class_statements_array;
+    /* Aggregate to EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME */
+    stat= & event_name_array[index];
+  }
+
+  if (flags & STATE_FLAG_TIMED)
+  {
+    /* Aggregate to EVENTS_STATEMENTS_SUMMARY_..._BY_EVENT_NAME (timed) */
+    stat->aggregate_value(wait_time);
+  }
+  else
+  {
+    /* Aggregate to EVENTS_STATEMENTS_SUMMARY_..._BY_EVENT_NAME (counted) */
+    stat->aggregate_counted();
+  }
+
+  stat->m_lock_time+= state->m_lock_time;
+  stat->m_rows_sent+= state->m_rows_sent;
+  stat->m_rows_examined+= state->m_rows_examined;
+  stat->m_created_tmp_disk_tables+= state->m_created_tmp_disk_tables;
+  stat->m_created_tmp_tables+= state->m_created_tmp_tables;
+  stat->m_select_full_join+= state->m_select_full_join;
+  stat->m_select_full_range_join+= state->m_select_full_range_join;
+  stat->m_select_range+= state->m_select_range;
+  stat->m_select_range_check+= state->m_select_range_check;
+  stat->m_select_scan+= state->m_select_scan;
+  stat->m_sort_merge_passes+= state->m_sort_merge_passes;
+  stat->m_sort_range+= state->m_sort_range;
+  stat->m_sort_rows+= state->m_sort_rows;
+  stat->m_sort_scan+= state->m_sort_scan;
+  stat->m_no_index_used+= state->m_no_index_used;
+  stat->m_no_good_index_used+= state->m_no_good_index_used;
+
+  if (digest_stat != NULL)
+  {
+    if (flags & STATE_FLAG_TIMED)
+    {
+      digest_stat->aggregate_value(wait_time);
+    }
+    else
+    {
+      digest_stat->aggregate_counted();
+    }
+  
+    digest_stat->m_lock_time+= state->m_lock_time;
+    digest_stat->m_rows_sent+= state->m_rows_sent;
+    digest_stat->m_rows_examined+= state->m_rows_examined;
+    digest_stat->m_created_tmp_disk_tables+= state->m_created_tmp_disk_tables;
+    digest_stat->m_created_tmp_tables+= state->m_created_tmp_tables;
+    digest_stat->m_select_full_join+= state->m_select_full_join;
+    digest_stat->m_select_full_range_join+= state->m_select_full_range_join;
+    digest_stat->m_select_range+= state->m_select_range;
+    digest_stat->m_select_range_check+= state->m_select_range_check;
+    digest_stat->m_select_scan+= state->m_select_scan;
+    digest_stat->m_sort_merge_passes+= state->m_sort_merge_passes;
+    digest_stat->m_sort_range+= state->m_sort_range;
+    digest_stat->m_sort_rows+= state->m_sort_rows;
+    digest_stat->m_sort_scan+= state->m_sort_scan;
+    digest_stat->m_no_index_used+= state->m_no_index_used;
+    digest_stat->m_no_good_index_used+= state->m_no_good_index_used;
+  }
+
+  switch (da->status())
+  {
+    case Diagnostics_area::DA_EMPTY:
+      break;
+    case Diagnostics_area::DA_OK:
+      stat->m_rows_affected+= da->affected_rows();
+      stat->m_warning_count+= da->statement_warn_count();
+      if (digest_stat != NULL)
+      {
+        digest_stat->m_rows_affected+= da->affected_rows();
+        digest_stat->m_warning_count+= da->statement_warn_count();
+      }
+      break;
+    case Diagnostics_area::DA_EOF:
+      stat->m_warning_count+= da->statement_warn_count();
+      if (digest_stat != NULL)
+      {
+        digest_stat->m_warning_count+= da->statement_warn_count();
+      }
+      break;
+    case Diagnostics_area::DA_ERROR:
+      stat->m_error_count++;
+      if (digest_stat != NULL)
+      {
+        digest_stat->m_error_count++;
+      }
+      break;
+    case Diagnostics_area::DA_DISABLED:
+      break;
+  }
+}
+
+/**
+  Implementation of the socket instrumentation interface.
+  @sa PSI_v1::end_socket_wait.
+*/
+static void end_socket_wait_v1(PSI_socket_locker *locker, size_t byte_count)
+{
+  PSI_socket_locker_state *state= reinterpret_cast<PSI_socket_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
+
+  PFS_socket *socket= reinterpret_cast<PFS_socket *>(state->m_socket);
+  DBUG_ASSERT(socket != NULL);
+
+  ulonglong timer_end= 0;
+  ulonglong wait_time= 0;
+  PFS_byte_stat *byte_stat;
+  register uint flags= state->m_flags;
+  size_t bytes= ((int)byte_count > -1 ? byte_count : 0);
+
+  switch (state->m_operation)
+  {
+    /* Group read operations */
+    case PSI_SOCKET_RECV:
+    case PSI_SOCKET_RECVFROM:
+    case PSI_SOCKET_RECVMSG:
+      byte_stat= &socket->m_socket_stat.m_io_stat.m_read;
+      break;
+    /* Group write operations */
+    case PSI_SOCKET_SEND:
+    case PSI_SOCKET_SENDTO:
+    case PSI_SOCKET_SENDMSG:
+      byte_stat= &socket->m_socket_stat.m_io_stat.m_write;
+      break;
+    /* Group remaining operations as miscellaneous */
+    case PSI_SOCKET_CONNECT:
+    case PSI_SOCKET_CREATE:
+    case PSI_SOCKET_BIND:
+    case PSI_SOCKET_SEEK:
+    case PSI_SOCKET_OPT:
+    case PSI_SOCKET_STAT:
+    case PSI_SOCKET_SHUTDOWN:
+    case PSI_SOCKET_SELECT:
+    case PSI_SOCKET_CLOSE:
+      byte_stat= &socket->m_socket_stat.m_io_stat.m_misc;
+      break;
+    default:
+      DBUG_ASSERT(false);
+      byte_stat= NULL;
+      break;
+  }
+
+  /* Aggregation for EVENTS_WAITS_SUMMARY_BY_INSTANCE */
+  if (flags & STATE_FLAG_TIMED)
+  {
+    timer_end= state->m_timer();
+    wait_time= timer_end - state->m_timer_start;
+
+    /* Aggregate to the socket instrument for now (timed) */
+    byte_stat->aggregate(wait_time, bytes);
+  }
+  else
+  {
+    /* Aggregate to the socket instrument (event count and byte count) */
+    byte_stat->aggregate_counted(bytes);
+  }
+
+  /* Aggregate to EVENTS_WAITS_HISTORY and EVENTS_WAITS_HISTORY_LONG */
+  if (flags & STATE_FLAG_EVENT)
+  {
+    PFS_thread *thread= reinterpret_cast<PFS_thread *>(state->m_thread);
+    DBUG_ASSERT(thread != NULL);
+    PFS_events_waits *wait= reinterpret_cast<PFS_events_waits*> (state->m_wait);
+    DBUG_ASSERT(wait != NULL);
+
+    wait->m_timer_end= timer_end;
+    wait->m_end_event_id= thread->m_event_id;
+    wait->m_number_of_bytes= bytes;
+
+    if (flag_events_waits_history)
+      insert_events_waits_history(thread, wait);
+    if (flag_events_waits_history_long)
+      insert_events_waits_history_long(wait);
+    thread->m_events_waits_current--;
+  }
+}
+
+static void set_socket_state_v1(PSI_socket *socket, PSI_socket_state state)
+{
+  DBUG_ASSERT((state == PSI_SOCKET_STATE_IDLE) || (state == PSI_SOCKET_STATE_ACTIVE));
+  PFS_socket *pfs= reinterpret_cast<PFS_socket*>(socket);
+  DBUG_ASSERT(pfs != NULL);
+  DBUG_ASSERT(pfs->m_idle || (state == PSI_SOCKET_STATE_IDLE));
+  DBUG_ASSERT(!pfs->m_idle || (state == PSI_SOCKET_STATE_ACTIVE));
+  pfs->m_idle= (state == PSI_SOCKET_STATE_IDLE);
+}
+
+/**
+  Set socket descriptor and address info.
+*/
+static void set_socket_info_v1(PSI_socket *socket,
+                               const my_socket *fd,
+                               const struct sockaddr *addr,
+                               socklen_t addr_len)
+{
+  PFS_socket *pfs= reinterpret_cast<PFS_socket*>(socket);
+  DBUG_ASSERT(pfs != NULL);
+
+  /** Set socket descriptor */
+  if (fd != NULL)
+    pfs->m_fd= *fd;
+
+  /** Set raw socket address and length */
+  if (likely(addr != NULL && addr_len > 0))
+  {
+    pfs->m_addr_len= addr_len;
+
+    /** Restrict address length to size of struct */
+    if (unlikely(pfs->m_addr_len > sizeof(sockaddr_storage)))
+      pfs->m_addr_len= sizeof(struct sockaddr_storage);
+
+    memcpy(&pfs->m_sock_addr, addr, pfs->m_addr_len);
+  }
+}
+
+/**
+  Implementation of the socket instrumentation interface.
+  @sa PSI_v1::set_socket_info.
+*/
+static void set_socket_thread_owner_v1(PSI_socket *socket)
+{
+  PFS_socket *pfs_socket= reinterpret_cast<PFS_socket*>(socket);
+  DBUG_ASSERT(pfs_socket != NULL);
+  pfs_socket->m_thread_owner= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+}
+
+/**
+  Implementation of the instrumentation interface.
+  @sa PSI_v1.
+*/
 PSI_v1 PFS_v1=
 {
   register_mutex_v1,
@@ -2049,28 +4880,39 @@ PSI_v1 PFS_v1=
   register_cond_v1,
   register_thread_v1,
   register_file_v1,
+  register_stage_v1,
+  register_statement_v1,
+  register_socket_v1,
   init_mutex_v1,
   destroy_mutex_v1,
   init_rwlock_v1,
   destroy_rwlock_v1,
   init_cond_v1,
   destroy_cond_v1,
+  init_socket_v1,
+  destroy_socket_v1,
   get_table_share_v1,
   release_table_share_v1,
+  drop_table_share_v1,
   open_table_v1,
+  unbind_table_v1,
+  rebind_table_v1,
   close_table_v1,
   create_file_v1,
   spawn_thread_v1,
   new_thread_v1,
   set_thread_id_v1,
   get_thread_v1,
+  set_thread_user_v1,
+  set_thread_account_v1,
+  set_thread_db_v1,
+  set_thread_command_v1,
+  set_thread_start_time_v1,
+  set_thread_state_v1,
+  set_thread_info_v1,
   set_thread_v1,
   delete_current_thread_v1,
   delete_thread_v1,
-  get_thread_mutex_locker_v1,
-  get_thread_rwlock_locker_v1,
-  get_thread_cond_locker_v1,
-  get_thread_table_locker_v1,
   get_thread_file_name_locker_v1,
   get_thread_file_stream_locker_v1,
   get_thread_file_descriptor_locker_v1,
@@ -2078,21 +4920,55 @@ PSI_v1 PFS_v1=
   unlock_rwlock_v1,
   signal_cond_v1,
   broadcast_cond_v1,
+  start_idle_wait_v1,
+  end_idle_wait_v1,
   start_mutex_wait_v1,
   end_mutex_wait_v1,
-  start_rwlock_rdwait_v1,
+  start_rwlock_wait_v1, /* read */
   end_rwlock_rdwait_v1,
-  start_rwlock_wrwait_v1,
+  start_rwlock_wait_v1, /* write */
   end_rwlock_wrwait_v1,
   start_cond_wait_v1,
   end_cond_wait_v1,
-  start_table_wait_v1,
-  end_table_wait_v1,
+  start_table_io_wait_v1,
+  end_table_io_wait_v1,
+  start_table_lock_wait_v1,
+  end_table_lock_wait_v1,
   start_file_open_wait_v1,
   end_file_open_wait_v1,
   end_file_open_wait_and_bind_to_descriptor_v1,
   start_file_wait_v1,
-  end_file_wait_v1
+  end_file_wait_v1,
+  start_stage_v1,
+  end_stage_v1,
+  get_thread_statement_locker_v1,
+  refine_statement_v1,
+  start_statement_v1,
+  set_statement_text_v1,
+  set_statement_lock_time_v1,
+  set_statement_rows_sent_v1,
+  set_statement_rows_examined_v1,
+  inc_statement_created_tmp_disk_tables_v1,
+  inc_statement_created_tmp_tables_v1,
+  inc_statement_select_full_join_v1,
+  inc_statement_select_full_range_join_v1,
+  inc_statement_select_range_v1,
+  inc_statement_select_range_check_v1,
+  inc_statement_select_scan_v1,
+  inc_statement_sort_merge_passes_v1,
+  inc_statement_sort_range_v1,
+  inc_statement_sort_rows_v1,
+  inc_statement_sort_scan_v1,
+  set_statement_no_index_used_v1,
+  set_statement_no_good_index_used_v1,
+  end_statement_v1,
+  start_socket_wait_v1,
+  end_socket_wait_v1,
+  set_socket_state_v1,
+  set_socket_info_v1,
+  set_socket_thread_owner_v1,
+  pfs_digest_start_v1,
+  pfs_digest_add_token_v1
 };
 
 static void* get_interface(int version)
diff --git a/storage/perfschema/pfs.h b/storage/perfschema/pfs.h
index 4e11736b1b9..5f543d80375 100644
--- a/storage/perfschema/pfs.h
+++ b/storage/perfschema/pfs.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -27,8 +27,14 @@
 #include <my_pthread.h>
 #include <mysql/psi/psi.h>
 
+/**
+  Entry point to the performance schema implementation.
+  This singleton is used to discover the performance schema services.
+*/
 extern struct PSI_bootstrap PFS_bootstrap;
+/** Performance schema Thread Local Storage key.  */
 extern pthread_key(PFS_thread*, THR_PFS);
+/** True when @c THR_PFS is initialized. */
 extern bool THR_PFS_initialized;
 
 #endif
diff --git a/storage/perfschema/pfs_account.cc b/storage/perfschema/pfs_account.cc
new file mode 100644
index 00000000000..c9298c7972c
--- /dev/null
+++ b/storage/perfschema/pfs_account.cc
@@ -0,0 +1,573 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/pfs_account.cc
+  Performance schema user@host (implementation).
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs.h"
+#include "pfs_stat.h"
+#include "pfs_instr.h"
+#include "pfs_setup_actor.h"
+#include "pfs_host.h"
+#include "pfs_host.h"
+#include "pfs_user.h"
+#include "pfs_account.h"
+#include "pfs_global.h"
+#include "pfs_instr_class.h"
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+ulong account_max;
+ulong account_lost;
+
+PFS_account *account_array= NULL;
+
+static PFS_single_stat *account_instr_class_waits_array= NULL;
+static PFS_stage_stat *account_instr_class_stages_array= NULL;
+static PFS_statement_stat *account_instr_class_statements_array= NULL;
+
+static LF_HASH account_hash;
+static bool account_hash_inited= false;
+
+/**
+  Initialize the user buffers.
+  @param param                        sizing parameters
+  @return 0 on success
+*/
+int init_account(const PFS_global_param *param)
+{
+  uint index;
+
+  account_max= param->m_account_sizing;
+
+  account_array= NULL;
+  account_instr_class_waits_array= NULL;
+  account_instr_class_stages_array= NULL;
+  account_instr_class_statements_array= NULL;
+  uint waits_sizing= account_max * wait_class_max;
+  uint stages_sizing= account_max * stage_class_max;
+  uint statements_sizing= account_max * statement_class_max;
+
+  if (account_max > 0)
+  {
+    account_array= PFS_MALLOC_ARRAY(account_max, PFS_account,
+                                      MYF(MY_ZEROFILL));
+    if (unlikely(account_array == NULL))
+      return 1;
+  }
+
+  if (waits_sizing > 0)
+  {
+    account_instr_class_waits_array=
+      PFS_connection_slice::alloc_waits_slice(waits_sizing);
+    if (unlikely(account_instr_class_waits_array == NULL))
+      return 1;
+  }
+
+  if (stages_sizing > 0)
+  {
+    account_instr_class_stages_array=
+      PFS_connection_slice::alloc_stages_slice(stages_sizing);
+    if (unlikely(account_instr_class_stages_array == NULL))
+      return 1;
+  }
+
+  if (statements_sizing > 0)
+  {
+    account_instr_class_statements_array=
+      PFS_connection_slice::alloc_statements_slice(statements_sizing);
+    if (unlikely(account_instr_class_statements_array == NULL))
+      return 1;
+  }
+
+  for (index= 0; index < account_max; index++)
+  {
+    account_array[index].m_instr_class_waits_stats=
+      &account_instr_class_waits_array[index * wait_class_max];
+    account_array[index].m_instr_class_stages_stats=
+      &account_instr_class_stages_array[index * stage_class_max];
+    account_array[index].m_instr_class_statements_stats=
+      &account_instr_class_statements_array[index * statement_class_max];
+  }
+
+  return 0;
+}
+
+/** Cleanup all the user buffers. */
+void cleanup_account(void)
+{
+  pfs_free(account_array);
+  account_array= NULL;
+  pfs_free(account_instr_class_waits_array);
+  account_instr_class_waits_array= NULL;
+  account_max= 0;
+}
+
+C_MODE_START
+static uchar *account_hash_get_key(const uchar *entry, size_t *length,
+                                my_bool)
+{
+  const PFS_account * const *typed_entry;
+  const PFS_account *account;
+  const void *result;
+  typed_entry= reinterpret_cast<const PFS_account* const *> (entry);
+  DBUG_ASSERT(typed_entry != NULL);
+  account= *typed_entry;
+  DBUG_ASSERT(account != NULL);
+  *length= account->m_key.m_key_length;
+  result= account->m_key.m_hash_key;
+  return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
+}
+C_MODE_END
+
+/**
+  Initialize the user hash.
+  @return 0 on success
+*/
+int init_account_hash(void)
+{
+  if (! account_hash_inited)
+  {
+    lf_hash_init(&account_hash, sizeof(PFS_account*), LF_HASH_UNIQUE,
+                 0, 0, account_hash_get_key, &my_charset_bin);
+    account_hash_inited= true;
+  }
+  return 0;
+}
+
+/** Cleanup the user hash. */
+void cleanup_account_hash(void)
+{
+  if (account_hash_inited)
+  {
+    lf_hash_destroy(&account_hash);
+    account_hash_inited= false;
+  }
+}
+
+static LF_PINS* get_account_hash_pins(PFS_thread *thread)
+{
+  if (unlikely(thread->m_account_hash_pins == NULL))
+  {
+    if (! account_hash_inited)
+      return NULL;
+    thread->m_account_hash_pins= lf_hash_get_pins(&account_hash);
+  }
+  return thread->m_account_hash_pins;
+}
+
+static void set_account_key(PFS_account_key *key,
+                              const char *user, uint user_length,
+                              const char *host, uint host_length)
+{
+  DBUG_ASSERT(user_length <= USERNAME_LENGTH);
+  DBUG_ASSERT(host_length <= HOSTNAME_LENGTH);
+
+  char *ptr= &key->m_hash_key[0];
+  if (user_length > 0)
+  {
+    memcpy(ptr, user, user_length);
+    ptr+= user_length;
+  }
+  ptr[0]= 0;
+  ptr++;
+  if (host_length > 0)
+  {
+    memcpy(ptr, host, host_length);
+    ptr+= host_length;
+  }
+  ptr[0]= 0;
+  ptr++;
+  key->m_key_length= ptr - &key->m_hash_key[0];
+}
+
+PFS_account *
+find_or_create_account(PFS_thread *thread,
+                         const char *username, uint username_length,
+                         const char *hostname, uint hostname_length)
+{
+  if (account_max == 0)
+  {
+    account_lost++;
+    return NULL;
+  }
+
+  LF_PINS *pins= get_account_hash_pins(thread);
+  if (unlikely(pins == NULL))
+  {
+    account_lost++;
+    return NULL;
+  }
+
+  PFS_account_key key;
+  set_account_key(&key, username, username_length,
+                    hostname, hostname_length);
+
+  PFS_account **entry;
+  uint retry_count= 0;
+  const uint retry_max= 3;
+
+search:
+  entry= reinterpret_cast<PFS_account**>
+    (lf_hash_search(&account_hash, pins,
+                    key.m_hash_key, key.m_key_length));
+  if (entry && (entry != MY_ERRPTR))
+  {
+    PFS_account *pfs;
+    pfs= *entry;
+    pfs->inc_refcount();
+    lf_hash_search_unpin(pins);
+    return pfs;
+  }
+
+  lf_hash_search_unpin(pins);
+
+  PFS_scan scan;
+  uint random= randomized_index(username, account_max);
+
+  for (scan.init(random, account_max);
+       scan.has_pass();
+       scan.next_pass())
+  {
+    PFS_account *pfs= account_array + scan.first();
+    PFS_account *pfs_last= account_array + scan.last();
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if (pfs->m_lock.is_free())
+      {
+        if (pfs->m_lock.free_to_dirty())
+        {
+          pfs->m_key= key;
+          if (username_length > 0)
+            pfs->m_username= &pfs->m_key.m_hash_key[0];
+          else
+            pfs->m_username= NULL;
+          pfs->m_username_length= username_length;
+
+          if (hostname_length > 0)
+            pfs->m_hostname= &pfs->m_key.m_hash_key[username_length + 1];
+          else
+            pfs->m_hostname= NULL;
+          pfs->m_hostname_length= hostname_length;
+
+          pfs->m_user= find_or_create_user(thread, username, username_length);
+          pfs->m_host= find_or_create_host(thread, hostname, hostname_length);
+
+          pfs->init_refcount();
+          pfs->reset_stats();
+          pfs->m_disconnected_count= 0;
+
+          int res;
+          res= lf_hash_insert(&account_hash, pins, &pfs);
+          if (likely(res == 0))
+          {
+            pfs->m_lock.dirty_to_allocated();
+            return pfs;
+          }
+
+          if (pfs->m_user)
+          {
+            pfs->m_user->release();
+            pfs->m_user= NULL;
+          }
+          if (pfs->m_host)
+          {
+            pfs->m_host->release();
+            pfs->m_host= NULL;
+          }
+
+          pfs->m_lock.dirty_to_free();
+
+          if (res > 0)
+          {
+            if (++retry_count > retry_max)
+            {
+              account_lost++;
+              return NULL;
+            }
+            goto search;
+          }
+
+          account_lost++;
+          return NULL;
+        }
+      }
+    }
+  }
+
+  account_lost++;
+  return NULL;
+}
+
+void PFS_account::aggregate()
+{
+  aggregate_waits();
+  aggregate_stages();
+  aggregate_statements();
+  aggregate_stats();
+}
+
+void PFS_account::aggregate_waits()
+{
+  if (likely(m_user != NULL && m_host != NULL))
+  {
+    /*
+      Aggregate EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_event_names(m_instr_class_waits_stats,
+                              m_user->m_instr_class_waits_stats,
+                              m_host->m_instr_class_waits_stats);
+    return;
+  }
+
+  if (m_user != NULL)
+  {
+    /*
+      Aggregate EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME
+    */
+    aggregate_all_event_names(m_instr_class_waits_stats,
+                              m_user->m_instr_class_waits_stats);
+    return;
+  }
+
+  if (m_host != NULL)
+  {
+    /*
+      Aggregate EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME
+    */
+    aggregate_all_event_names(m_instr_class_waits_stats,
+                              m_host->m_instr_class_waits_stats);
+    return;
+  }
+
+  /* Orphan account, no parent to aggregate to. */
+  reset_waits_stats();
+  return;
+}
+
+void PFS_account::aggregate_stages()
+{
+  if (likely(m_user != NULL && m_host != NULL))
+  {
+    /*
+      Aggregate EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_stages(m_instr_class_stages_stats,
+                         m_user->m_instr_class_stages_stats,
+                         m_host->m_instr_class_stages_stats);
+    return;
+  }
+
+  if (m_user != NULL)
+  {
+    /*
+      Aggregate EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_stages(m_instr_class_stages_stats,
+                         m_user->m_instr_class_stages_stats,
+                         global_instr_class_stages_array);
+    return;
+  }
+
+  if (m_host != NULL)
+  {
+    /*
+      Aggregate EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME
+    */
+    aggregate_all_stages(m_instr_class_stages_stats,
+                         m_host->m_instr_class_stages_stats);
+    return;
+  }
+
+  /*
+    Aggregate EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+    -  EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME
+  */
+  aggregate_all_stages(m_instr_class_stages_stats,
+                       global_instr_class_stages_array);
+  return;
+}
+
+void PFS_account::aggregate_statements()
+{
+  if (likely(m_user != NULL && m_host != NULL))
+  {
+    /*
+      Aggregate EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_statements(m_instr_class_statements_stats,
+                             m_user->m_instr_class_statements_stats,
+                             m_host->m_instr_class_statements_stats);
+    return;
+  }
+
+  if (m_user != NULL)
+  {
+    /*
+      Aggregate EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_statements(m_instr_class_statements_stats,
+                             m_user->m_instr_class_statements_stats,
+                             global_instr_class_statements_array);
+    return;
+  }
+
+  if (m_host != NULL)
+  {
+    /*
+      Aggregate EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+      -  EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME
+    */
+    aggregate_all_statements(m_instr_class_statements_stats,
+                             m_host->m_instr_class_statements_stats);
+    return;
+  }
+
+  /*
+    Aggregate EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME to:
+    -  EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME
+  */
+  aggregate_all_statements(m_instr_class_statements_stats,
+                           global_instr_class_statements_array);
+  return;
+}
+
+void PFS_account::aggregate_stats()
+{
+  if (likely(m_user != NULL && m_host != NULL))
+  {
+    m_user->m_disconnected_count+= m_disconnected_count;
+    m_host->m_disconnected_count+= m_disconnected_count;
+    m_disconnected_count= 0;
+    return;
+  }
+
+  if (m_user != NULL)
+  {
+    m_user->m_disconnected_count+= m_disconnected_count;
+    m_disconnected_count= 0;
+    return;
+  }
+
+  if (m_host != NULL)
+  {
+    m_host->m_disconnected_count+= m_disconnected_count;
+    m_disconnected_count= 0;
+    return;
+  }
+
+  m_disconnected_count= 0;
+  return;
+}
+
+void PFS_account::release()
+{
+  dec_refcount();
+}
+
+PFS_account *sanitize_account(PFS_account *unsafe)
+{
+  if ((&account_array[0] <= unsafe) &&
+      (unsafe < &account_array[account_max]))
+    return unsafe;
+  return NULL;
+}
+
+void purge_account(PFS_thread *thread, PFS_account *account)
+{
+  account->aggregate();
+
+  LF_PINS *pins= get_account_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return;
+
+  PFS_account **entry;
+  entry= reinterpret_cast<PFS_account**>
+    (lf_hash_search(&account_hash, pins,
+                    account->m_key.m_hash_key,
+                    account->m_key.m_key_length));
+  if (entry && (entry != MY_ERRPTR))
+  {
+    DBUG_ASSERT(*entry == account);
+    if (account->get_refcount() == 0)
+    {
+      lf_hash_delete(&account_hash, pins,
+                     account->m_key.m_hash_key,
+                     account->m_key.m_key_length);
+      if (account->m_user != NULL)
+      {
+        account->m_user->release();
+        account->m_user= NULL;
+      }
+      if (account->m_host != NULL)
+      {
+        account->m_host->release();
+        account->m_host= NULL;
+      }
+      account->m_lock.allocated_to_free();
+    }
+  }
+
+  lf_hash_search_unpin(pins);
+}
+
+/** Purge non connected user@host, reset stats of connected user@host. */
+void purge_all_account(void)
+{
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return;
+
+  PFS_account *pfs= account_array;
+  PFS_account *pfs_last= account_array + account_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+    {
+      pfs->aggregate_stats();
+
+      if (pfs->get_refcount() == 0)
+        purge_account(thread, pfs);
+    }
+  }
+}
+
+/** @} */
diff --git a/storage/perfschema/pfs_account.h b/storage/perfschema/pfs_account.h
new file mode 100644
index 00000000000..77a9dfab7ba
--- /dev/null
+++ b/storage/perfschema/pfs_account.h
@@ -0,0 +1,120 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef PFS_ACCOUNT_H
+#define PFS_ACCOUNT_H
+
+/**
+  @file storage/perfschema/pfs_account.h
+  Performance schema user@host (declarations).
+*/
+
+#include "pfs_lock.h"
+#include "lf.h"
+#include "pfs_con_slice.h"
+
+struct PFS_global_param;
+struct PFS_user;
+struct PFS_host;
+struct PFS_thread;
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+struct PFS_account_key
+{
+  /**
+    Hash search key.
+    This has to be a string for LF_HASH,
+    the format is "<username><0x00><hostname><0x00>"
+  */
+  char m_hash_key[USERNAME_LENGTH + 1 + HOSTNAME_LENGTH + 1];
+  uint m_key_length;
+};
+
+struct PFS_account : PFS_connection_slice
+{
+public:
+  inline void init_refcount(void)
+  {
+    PFS_atomic::store_32(& m_refcount, 1);
+  }
+
+  inline int get_refcount(void)
+  {
+    return PFS_atomic::load_32(& m_refcount);
+  }
+
+  inline void inc_refcount(void)
+  {
+    PFS_atomic::add_32(& m_refcount, 1);
+  }
+
+  inline void dec_refcount(void)
+  {
+    PFS_atomic::add_32(& m_refcount, -1);
+  }
+
+  void aggregate(void);
+  void aggregate_waits(void);
+  void aggregate_stages(void);
+  void aggregate_statements(void);
+  void aggregate_stats(void);
+  void release(void);
+
+  /** Internal lock. */
+  pfs_lock m_lock;
+  PFS_account_key m_key;
+  const char *m_username;
+  uint m_username_length;
+  const char *m_hostname;
+  uint m_hostname_length;
+  PFS_user *m_user;
+  PFS_host *m_host;
+
+  ulonglong m_disconnected_count;
+
+private:
+  int m_refcount;
+};
+
+int init_account(const PFS_global_param *param);
+void cleanup_account(void);
+int init_account_hash(void);
+void cleanup_account_hash(void);
+
+PFS_account *
+find_or_create_account(PFS_thread *thread,
+                         const char *username, uint username_length,
+                         const char *hostname, uint hostname_length);
+
+PFS_account *sanitize_account(PFS_account *unsafe);
+void purge_all_account(void);
+
+
+/* For iterators and show status. */
+
+extern ulong account_max;
+extern ulong account_lost;
+
+/* Exposing the data directly, for iterators. */
+
+extern PFS_account *account_array;
+
+/** @} */
+#endif
+
diff --git a/storage/perfschema/pfs_atomic.cc b/storage/perfschema/pfs_atomic.cc
index 4db807b1d88..601bd94cabd 100644
--- a/storage/perfschema/pfs_atomic.cc
+++ b/storage/perfschema/pfs_atomic.cc
@@ -1,6 +1,4 @@
-/*
-  Copyright (c) 2009, 2010 Sun Microsystems, Inc.
-  Use is subject to license terms.
+/* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/storage/perfschema/pfs_atomic.h b/storage/perfschema/pfs_atomic.h
index 4daebfbbae4..ffb4c24ecbf 100644
--- a/storage/perfschema/pfs_atomic.h
+++ b/storage/perfschema/pfs_atomic.h
@@ -1,6 +1,4 @@
-/*
-  Copyright (c) 2009, 2010 Sun Microsystems, Inc.
-  Use is subject to license terms.
+/* Copyright (c) 2009, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -29,7 +27,9 @@
 class PFS_atomic
 {
 public:
+  /** Initialise the PFS_atomic component. */
   static void init();
+  /** Cleanup the PFS_atomic component. */
   static void cleanup();
 
   /** Atomic load. */
@@ -114,6 +114,10 @@ public:
 private:
   static my_atomic_rwlock_t m_rwlock_array[256];
 
+  /**
+    Helper used only with non native atomic implementations.
+    @sa MY_ATOMIC_MODE_RWLOCKS
+  */
   static inline my_atomic_rwlock_t *get_rwlock(volatile void *ptr)
   {
     /*
@@ -125,21 +129,37 @@ private:
     return result;
   }
 
+  /**
+    Helper used only with non native atomic implementations.
+    @sa MY_ATOMIC_MODE_RWLOCKS
+  */
   static inline void rdlock(volatile void *ptr)
   {
     my_atomic_rwlock_rdlock(get_rwlock(ptr));
   }
 
+  /**
+    Helper used only with non native atomic implementations.
+    @sa MY_ATOMIC_MODE_RWLOCKS
+  */
   static inline void wrlock(volatile void *ptr)
   {
     my_atomic_rwlock_wrlock(get_rwlock(ptr));
   }
 
+  /**
+    Helper used only with non native atomic implementations.
+    @sa MY_ATOMIC_MODE_RWLOCKS
+  */
   static inline void rdunlock(volatile void *ptr)
   {
     my_atomic_rwlock_rdunlock(get_rwlock(ptr));
   }
 
+  /**
+    Helper used only with non native atomic implementations.
+    @sa MY_ATOMIC_MODE_RWLOCKS
+  */
   static inline void wrunlock(volatile void *ptr)
   {
     my_atomic_rwlock_wrunlock(get_rwlock(ptr));
diff --git a/storage/perfschema/pfs_column_types.h b/storage/perfschema/pfs_column_types.h
index dc990664a8f..23ef946ee82 100644
--- a/storage/perfschema/pfs_column_types.h
+++ b/storage/perfschema/pfs_column_types.h
@@ -1,5 +1,4 @@
-/* Copyright (c) 2008 MySQL AB, 2010 Sun Microsystems, Inc.
-   Use is subject to license terms.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -39,9 +38,27 @@
 /** Size of the OBJECT_NAME columns. */
 #define COL_OBJECT_NAME_SIZE 64
 
+/** Size of the INDEX_NAME columns. */
+#define COL_INDEX_NAME_SIZE 64
+
+/**
+  Size of INFO columns.
+  Size in bytes of:
+  - performance_schema.events_statement_current (INFO)
+  - performance_schema.events_statement_history (INFO)
+  - performance_schema.events_statement_history_long (INFO)
+*/
+#define COL_INFO_SIZE 1024
+
 /** Size of the SOURCE columns. */
 #define COL_SOURCE_SIZE 64
 
+/** Size of the DIGEST columns. */
+#define COL_DIGEST_SIZE 64
+
+/** Size of the DIGEST_TEXT columns. */
+#define COL_DIGEST_TEXT_SIZE 1024
+
 /**
   Enum values for the TIMER_NAME columns.
   This enum is found in the following tables:
@@ -57,8 +74,11 @@ enum enum_timer_name
   TIMER_NAME_TICK= 5
 };
 
+/** Integer, first value of @sa enum_timer_name. */
 #define FIRST_TIMER_NAME (static_cast<int> (TIMER_NAME_CYCLE))
+/** Integer, last value of @sa enum_timer_name. */
 #define LAST_TIMER_NAME (static_cast<int> (TIMER_NAME_TICK))
+/** Integer, number of values of @sa enum_timer_name. */
 #define COUNT_TIMER_NAME (LAST_TIMER_NAME - FIRST_TIMER_NAME + 1)
 
 /**
@@ -83,17 +103,21 @@ enum enum_yes_no
 */
 enum enum_operation_type
 {
+  /* Mutex operations */
   OPERATION_TYPE_LOCK= 1,
   OPERATION_TYPE_TRYLOCK= 2,
 
+  /* Rwlock operations */
   OPERATION_TYPE_READLOCK= 3,
   OPERATION_TYPE_WRITELOCK= 4,
   OPERATION_TYPE_TRYREADLOCK= 5,
   OPERATION_TYPE_TRYWRITELOCK= 6,
 
+  /* Cond operations */
   OPERATION_TYPE_WAIT= 7,
   OPERATION_TYPE_TIMEDWAIT= 8,
 
+  /* File operations */
   OPERATION_TYPE_FILECREATE= 9,
   OPERATION_TYPE_FILECREATETMP= 10,
   OPERATION_TYPE_FILEOPEN= 11,
@@ -110,11 +134,89 @@ enum enum_operation_type
   OPERATION_TYPE_FILECHSIZE= 22,
   OPERATION_TYPE_FILEDELETE= 23,
   OPERATION_TYPE_FILERENAME= 24,
-  OPERATION_TYPE_FILESYNC= 25
+  OPERATION_TYPE_FILESYNC= 25,
+
+  /* Table io operations */
+  OPERATION_TYPE_TABLE_FETCH= 26,
+  OPERATION_TYPE_TABLE_WRITE_ROW= 27,
+  OPERATION_TYPE_TABLE_UPDATE_ROW= 28,
+  OPERATION_TYPE_TABLE_DELETE_ROW= 29,
+
+  /* Table lock operations */
+  OPERATION_TYPE_TL_READ_NORMAL= 30,
+  OPERATION_TYPE_TL_READ_WITH_SHARED_LOCKS= 31,
+  OPERATION_TYPE_TL_READ_HIGH_PRIORITY= 32,
+  OPERATION_TYPE_TL_READ_NO_INSERTS= 33,
+  OPERATION_TYPE_TL_WRITE_ALLOW_WRITE= 34,
+  OPERATION_TYPE_TL_WRITE_CONCURRENT_INSERT= 35,
+  OPERATION_TYPE_TL_WRITE_DELAYED= 36,
+  OPERATION_TYPE_TL_WRITE_LOW_PRIORITY= 37,
+  OPERATION_TYPE_TL_WRITE_NORMAL= 38,
+  OPERATION_TYPE_TL_READ_EXTERNAL= 39,
+  OPERATION_TYPE_TL_WRITE_EXTERNAL= 40,
+
+  /* Socket operations */
+  OPERATION_TYPE_SOCKETCREATE = 41,
+  OPERATION_TYPE_SOCKETCONNECT = 42,
+  OPERATION_TYPE_SOCKETBIND = 43,
+  OPERATION_TYPE_SOCKETCLOSE = 44,
+  OPERATION_TYPE_SOCKETSEND = 45,
+  OPERATION_TYPE_SOCKETRECV = 46,
+  OPERATION_TYPE_SOCKETSENDTO = 47,
+  OPERATION_TYPE_SOCKETRECVFROM = 48,
+  OPERATION_TYPE_SOCKETSENDMSG = 49,
+  OPERATION_TYPE_SOCKETRECVMSG = 50,
+  OPERATION_TYPE_SOCKETSEEK = 51,
+  OPERATION_TYPE_SOCKETOPT = 52,
+  OPERATION_TYPE_SOCKETSTAT = 53,
+  OPERATION_TYPE_SOCKETSHUTDOWN = 54,
+  OPERATION_TYPE_SOCKETSELECT = 55,
+
+  /* Idle operation */
+  OPERATION_TYPE_IDLE= 56
 };
+/** Integer, first value of @sa enum_operation_type. */
 #define FIRST_OPERATION_TYPE (static_cast<int> (OPERATION_TYPE_LOCK))
-#define LAST_OPERATION_TYPE (static_cast<int> (OPERATION_TYPE_FILESYNC))
+/** Integer, last value of @sa enum_operation_type. */
+#define LAST_OPERATION_TYPE (static_cast<int> (OPERATION_TYPE_IDLE))
+/** Integer, number of values of @sa enum_operation_type. */
 #define COUNT_OPERATION_TYPE (LAST_OPERATION_TYPE - FIRST_OPERATION_TYPE + 1)
 
+/**
+  Enum values for the various OBJECT_TYPE columns.
+*/
+enum enum_object_type
+{
+  OBJECT_TYPE_TABLE= 1,
+  OBJECT_TYPE_TEMPORARY_TABLE= 2
+};
+/** Integer, first value of @sa enum_object_type. */
+#define FIRST_OBJECT_TYPE (static_cast<int> (OBJECT_TYPE_TABLE))
+/** Integer, last value of @sa enum_object_type. */
+#define LAST_OBJECT_TYPE (static_cast<int> (OBJECT_TYPE_TEMPORARY_TABLE))
+/** Integer, number of values of @sa enum_object_type. */
+#define COUNT_OBJECT_TYPE (LAST_OBJECT_TYPE - FIRST_OBJECT_TYPE + 1)
+
+/**
+  Enum values for the NESTING_EVENT_TYPE columns.
+  This enum is found in the following tables:
+  - performance_schema.events_waits_current (NESTING_EVENT_TYPE)
+  - performance_schema.events_stages_current (NESTING_EVENT_TYPE)
+  - performance_schema.events_statements_current (NESTING_EVENT_TYPE)
+*/
+enum enum_event_type
+{
+  EVENT_TYPE_STATEMENT= 1,
+  EVENT_TYPE_STAGE= 2,
+  EVENT_TYPE_WAIT= 3
+};
+
+/** Integer, first value of @sa enum_event_type. */
+#define FIRST_EVENT_TYPE (static_cast<int> (EVENT_TYPE_STATEMENT))
+/** Integer, last value of @sa enum_event_type. */
+#define LAST_EVENT_TYPE (static_cast<int> (EVENT_TYPE_WAIT))
+/** Integer, number of values of @sa enum_event_type. */
+#define COUNT_EVENT_TYPE (LAST_EVENT_TYPE - FIRST_EVENT_TYPE + 1)
+
 #endif
 
diff --git a/storage/perfschema/pfs_column_values.cc b/storage/perfschema/pfs_column_values.cc
index ea65441b8c6..65d0ae7171b 100644
--- a/storage/perfschema/pfs_column_values.cc
+++ b/storage/perfschema/pfs_column_values.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -19,6 +19,7 @@
   schema tables (implementation).
 */
 
+#include "my_global.h"
 #include "pfs_column_values.h"
 
 LEX_STRING PERFORMANCE_SCHEMA_str=
@@ -39,3 +40,11 @@ LEX_STRING thread_instrument_prefix=
 LEX_STRING file_instrument_prefix=
 { C_STRING_WITH_LEN("wait/io/file/") };
 
+LEX_STRING stage_instrument_prefix=
+{ C_STRING_WITH_LEN("stage/") };
+
+LEX_STRING statement_instrument_prefix=
+{ C_STRING_WITH_LEN("statement/") };
+
+LEX_STRING socket_instrument_prefix=
+{ C_STRING_WITH_LEN("wait/io/socket/") };
diff --git a/storage/perfschema/pfs_column_values.h b/storage/perfschema/pfs_column_values.h
index f9e7f90dbc9..204d5230ddf 100644
--- a/storage/perfschema/pfs_column_values.h
+++ b/storage/perfschema/pfs_column_values.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -24,13 +24,24 @@
   performance schema tables (declarations).
 */
 
+/** String, "PERFORMANCE_SCHEMA". */
 extern LEX_STRING PERFORMANCE_SCHEMA_str;
 
+/** String prefix for all mutex instruments. */
 extern LEX_STRING mutex_instrument_prefix;
+/** String prefix for all rwlock instruments. */
 extern LEX_STRING rwlock_instrument_prefix;
+/** String prefix for all cond instruments. */
 extern LEX_STRING cond_instrument_prefix;
+/** String prefix for all thread instruments. */
 extern LEX_STRING thread_instrument_prefix;
+/** String prefix for all file instruments. */
 extern LEX_STRING file_instrument_prefix;
+/** String prefix for all stage instruments. */
+extern LEX_STRING stage_instrument_prefix;
+/** String prefix for all statement instruments. */
+extern LEX_STRING statement_instrument_prefix;
+extern LEX_STRING socket_instrument_prefix;
 
 #endif
 
diff --git a/storage/perfschema/pfs_con_slice.cc b/storage/perfschema/pfs_con_slice.cc
new file mode 100644
index 00000000000..263f25c1c08
--- /dev/null
+++ b/storage/perfschema/pfs_con_slice.cc
@@ -0,0 +1,116 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_con_slice.h"
+#include "pfs_stat.h"
+#include "pfs_global.h"
+#include "pfs_instr_class.h"
+
+/**
+  @file storage/perfschema/pfs_con_slice.cc
+  Performance schema connection slice (implementation).
+*/
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+PFS_single_stat *
+PFS_connection_slice::alloc_waits_slice(uint sizing)
+{
+  PFS_single_stat *slice= NULL;
+  uint index;
+
+  if (sizing > 0)
+  {
+    slice= PFS_MALLOC_ARRAY(sizing, PFS_single_stat, MYF(MY_ZEROFILL));
+    if (unlikely(slice == NULL))
+      return NULL;
+
+    for (index= 0; index < sizing; index++)
+      slice[index].reset();
+  }
+
+  return slice;
+}
+
+PFS_stage_stat *
+PFS_connection_slice::alloc_stages_slice(uint sizing)
+{
+  PFS_stage_stat *slice= NULL;
+  uint index;
+
+  if (sizing > 0)
+  {
+    slice= PFS_MALLOC_ARRAY(sizing, PFS_stage_stat, MYF(MY_ZEROFILL));
+    if (unlikely(slice == NULL))
+      return NULL;
+
+    for (index= 0; index < sizing; index++)
+      slice[index].reset();
+  }
+
+  return slice;
+}
+
+PFS_statement_stat *
+PFS_connection_slice::alloc_statements_slice(uint sizing)
+{
+  PFS_statement_stat *slice= NULL;
+  uint index;
+
+  if (sizing > 0)
+  {
+    slice= PFS_MALLOC_ARRAY(sizing, PFS_statement_stat, MYF(MY_ZEROFILL));
+    if (unlikely(slice == NULL))
+      return NULL;
+
+    for (index= 0; index < sizing; index++)
+      slice[index].reset();
+  }
+
+  return slice;
+}
+
+void PFS_connection_slice::reset_waits_stats()
+{
+  PFS_single_stat *stat= m_instr_class_waits_stats;
+  PFS_single_stat *stat_last= stat + wait_class_max;
+  for ( ; stat < stat_last; stat++)
+    stat->reset();
+}
+
+void PFS_connection_slice::reset_stages_stats()
+{
+  PFS_stage_stat *stat= m_instr_class_stages_stats;
+  PFS_stage_stat *stat_last= stat + stage_class_max;
+  for ( ; stat < stat_last; stat++)
+    stat->reset();
+}
+
+void PFS_connection_slice::reset_statements_stats()
+{
+  PFS_statement_stat *stat= m_instr_class_statements_stats;
+  PFS_statement_stat *stat_last= stat + statement_class_max;
+  for ( ; stat < stat_last; stat++)
+    stat->reset();
+}
+
+/** @} */
+
diff --git a/storage/perfschema/pfs_con_slice.h b/storage/perfschema/pfs_con_slice.h
new file mode 100644
index 00000000000..d82adcb58a3
--- /dev/null
+++ b/storage/perfschema/pfs_con_slice.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef PFS_CON_SLICE_H
+#define PFS_CON_SLICE_H
+
+/**
+  @file storage/perfschema/pfs_con_slice.h
+  Performance schema connection slice (declarations).
+*/
+
+#include "pfs_lock.h"
+#include "lf.h"
+
+struct PFS_single_stat;
+struct PFS_stage_stat;
+struct PFS_statement_stat;
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+/**
+  A connection slice, an arbitrary grouping of several connections.
+  This structure holds statistics for grouping of connections.
+*/
+struct PFS_connection_slice
+{
+  /**
+    Allocate memory for waits statistics.
+    @param sizing the number of wait classes.
+    @return wait statistics for this slice.
+  */
+  static PFS_single_stat *alloc_waits_slice(uint sizing);
+  /**
+    Allocate memory for stages statistics.
+    @param sizing the number of stage classes.
+    @return stage statistics for this slice.
+  */
+  static PFS_stage_stat *alloc_stages_slice(uint sizing);
+  /**
+    Allocate memory for statement statistics.
+    @param sizing the number of statement classes.
+    @return statement statistics for this slice.
+  */
+  static PFS_statement_stat *alloc_statements_slice(uint sizing);
+
+  /** Reset all statistics. */
+  inline void reset_stats()
+  {
+    reset_waits_stats();
+    reset_stages_stats();
+    reset_statements_stats();
+  }
+
+  /** Reset all wait statistics. */
+  void reset_waits_stats();
+  /** Reset all stages statistics. */
+  void reset_stages_stats();
+  /** Reset all statements statistics. */
+  void reset_statements_stats();
+
+  /**
+    Per connection slice waits aggregated statistics.
+    This member holds the data for the table
+    PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_*_BY_EVENT_NAME.
+    Immutable, safe to use without internal lock.
+  */
+  PFS_single_stat *m_instr_class_waits_stats;
+
+  /**
+    Per connection slice stages aggregated statistics.
+    This member holds the data for the table
+    PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_*_BY_EVENT_NAME.
+    Immutable, safe to use without internal lock.
+  */
+  PFS_stage_stat *m_instr_class_stages_stats;
+
+  /**
+    Per connection slice statements aggregated statistics.
+    This member holds the data for the table
+    PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_*_BY_EVENT_NAME.
+    Immutable, safe to use without internal lock.
+  */
+  PFS_statement_stat *m_instr_class_statements_stats;
+};
+
+/** @} */
+#endif
+
diff --git a/storage/perfschema/pfs_defaults.cc b/storage/perfschema/pfs_defaults.cc
new file mode 100644
index 00000000000..4bd24f59ca8
--- /dev/null
+++ b/storage/perfschema/pfs_defaults.cc
@@ -0,0 +1,63 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/pfs_defaults.cc
+  Default setup (implementation).
+*/
+
+#include "pfs.h"
+#include "pfs_defaults.h"
+#include "pfs_instr.h"
+#include "pfs_setup_actor.h"
+#include "pfs_setup_object.h"
+
+static PSI_thread_key key;
+static PSI_thread_info info= { &key, "setup", PSI_FLAG_GLOBAL };
+
+void install_default_setup(PSI_bootstrap *boot)
+{
+  PSI *psi= (PSI*) boot->get_interface(PSI_CURRENT_VERSION);
+  if (psi == NULL)
+    return;
+
+  psi->register_thread("performance_schema", &info, 1);
+  PSI_thread *psi_thread= psi->new_thread(key, NULL, 0);
+  if (psi_thread == NULL)
+    return;
+
+  /* LF_HASH needs a thread, for PINS */
+  psi->set_thread(psi_thread);
+
+  String percent("%", 1, &my_charset_utf8_bin);
+  /* Enable all users on all hosts by default */
+  insert_setup_actor(&percent, &percent, &percent);
+
+  /* Disable system tables by default */
+  String mysql_db("mysql", 5, &my_charset_utf8_bin);
+  insert_setup_object(OBJECT_TYPE_TABLE, &mysql_db, &percent, false, false);
+
+  /* Disable performance/information schema tables. */
+  String PS_db("performance_schema", 18, &my_charset_utf8_bin);
+  String IS_db("information_schema", 18, &my_charset_utf8_bin);
+  insert_setup_object(OBJECT_TYPE_TABLE, &PS_db, &percent, false, false);
+  insert_setup_object(OBJECT_TYPE_TABLE, &IS_db, &percent, false, false);
+
+  /* Enable every other tables */
+  insert_setup_object(OBJECT_TYPE_TABLE, &percent, &percent, true, true);
+
+  psi->delete_current_thread();
+}
+
diff --git a/storage/perfschema/pfs_defaults.h b/storage/perfschema/pfs_defaults.h
new file mode 100644
index 00000000000..bbb041504e2
--- /dev/null
+++ b/storage/perfschema/pfs_defaults.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef PFS_DEFAULTS_H
+#define PFS_DEFAULTS_H
+
+/**
+  @file storage/perfschema/pfs_defaults.h
+  Default setup (declarations).
+*/
+
+/**
+  Configure the performance schema setup tables with default content.
+  The tables populated are:
+  - SETUP_ACTORS
+  - SETUP_OBJECTS
+*/
+void install_default_setup(PSI_bootstrap *boot);
+
+#endif
diff --git a/storage/perfschema/pfs_digest.cc b/storage/perfschema/pfs_digest.cc
new file mode 100644
index 00000000000..92c27b2e85f
--- /dev/null
+++ b/storage/perfschema/pfs_digest.cc
@@ -0,0 +1,676 @@
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/pfs_digest.h
+  Statement Digest data structures (implementation).
+*/
+
+/*
+  This code needs extra visibility in the lexer structures
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs_instr.h"
+#include "pfs_digest.h"
+#include "pfs_global.h"
+#include "table_helper.h"
+#include "my_md5.h"
+#include "sql_lex.h"
+#include <string.h>
+
+/* Generated code */
+#include "../sql/sql_yacc.h"
+#include "../storage/perfschema/pfs_lex_token.h"
+
+/* Name pollution from sql/sql_lex.h */
+#ifdef LEX_YYSTYPE
+#undef LEX_YYSTYPE
+#endif
+
+#define LEX_YYSTYPE YYSTYPE
+
+/**
+  Token array : 
+  Token array is an array of bytes to store tokens recieved during parsing.
+  Following is the way token array is formed.
+     
+      ...<non-id-token><non-id-token><id-token><id_len><id_text>...
+
+  For Ex:
+  SELECT * FROM T1;
+  <SELECT_TOKEN><*><FROM_TOKEN><ID_TOKEN><2><T1>
+*/
+
+ulong digest_max= 0;
+ulong digest_lost= 0;
+
+
+/** EVENTS_STATEMENTS_HISTORY_LONG circular buffer. */
+PFS_statements_digest_stat *statements_digest_stat_array= NULL;
+/** Consumer flag for table EVENTS_STATEMENTS_SUMMARY_BY_DIGEST. */
+bool flag_statements_digest= true;
+/** 
+  Current index in Stat array where new record is to be inserted.
+  index 0 is reserved for "all else" case when entire array is full.
+*/
+volatile uint32 digest_index= 1;
+
+static LF_HASH digest_hash;
+static bool digest_hash_inited= false;
+
+/**
+  Initialize table EVENTS_STATEMENTS_SUMMARY_BY_DIGEST.
+  @param digest_sizing      
+*/
+int init_digest(const PFS_global_param *param)
+{
+  unsigned int index;
+
+  /*
+    Allocate memory for statements_digest_stat_array based on
+    performance_schema_digests_size values
+  */
+  digest_max= param->m_digest_sizing;
+  digest_lost= 0;
+
+  if (digest_max == 0)
+    return 0;
+
+  statements_digest_stat_array=
+    PFS_MALLOC_ARRAY(digest_max, PFS_statements_digest_stat,
+                     MYF(MY_ZEROFILL));
+  if (unlikely(statements_digest_stat_array == NULL))
+    return 1;
+
+  for (index= 0; index < digest_max; index++)
+  {
+    statements_digest_stat_array[index].reset_data();
+  }
+
+  return 0;
+}
+
+/** Cleanup table EVENTS_STATEMENTS_SUMMARY_BY_DIGEST. */
+void cleanup_digest(void)
+{
+  /*  Free memory allocated to statements_digest_stat_array. */
+  pfs_free(statements_digest_stat_array);
+  statements_digest_stat_array= NULL;
+}
+
+C_MODE_START
+static uchar *digest_hash_get_key(const uchar *entry, size_t *length,
+                                  my_bool)
+{
+  const PFS_statements_digest_stat * const *typed_entry;
+  const PFS_statements_digest_stat *digest;
+  const void *result;
+  typed_entry= reinterpret_cast<const PFS_statements_digest_stat*const*>(entry);
+  DBUG_ASSERT(typed_entry != NULL);
+  digest= *typed_entry;
+  DBUG_ASSERT(digest != NULL);
+  *length= PFS_MD5_SIZE; 
+  result= digest->m_digest_hash.m_md5;
+  return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
+}
+C_MODE_END
+
+
+/**
+  Initialize the digest hash.
+  @return 0 on success
+*/
+int init_digest_hash(void)
+{
+  if (! digest_hash_inited)
+  {
+    lf_hash_init(&digest_hash, sizeof(PFS_statements_digest_stat*),
+                 LF_HASH_UNIQUE, 0, 0, digest_hash_get_key,
+                 &my_charset_bin);
+    digest_hash_inited= true;
+  }
+  return 0;
+}
+
+void cleanup_digest_hash(void)
+{
+  if (digest_hash_inited)
+  {
+    lf_hash_destroy(&digest_hash);
+    digest_hash_inited= false;
+  }
+}
+
+static LF_PINS* get_digest_hash_pins(PFS_thread *thread)
+{
+  if (unlikely(thread->m_digest_hash_pins == NULL))
+  {
+    if (!digest_hash_inited)
+      return NULL;
+    thread->m_digest_hash_pins= lf_hash_get_pins(&digest_hash);
+  }
+  return thread->m_digest_hash_pins;
+}
+
+PFS_statement_stat*
+find_or_create_digest(PFS_thread* thread,
+                      PSI_digest_storage* digest_storage)
+{
+  if (statements_digest_stat_array == NULL)
+    return NULL;
+
+  if (digest_storage->m_byte_count <= 0)
+    return NULL;
+
+  LF_PINS *pins= get_digest_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return NULL;
+
+  /* Compute MD5 Hash of the tokens received. */
+  PFS_digest_hash md5;
+  compute_md5_hash((char *) md5.m_md5,
+                   (char *) digest_storage->m_token_array,
+                   digest_storage->m_byte_count);
+
+  unsigned char* hash_key= md5.m_md5;
+
+  int res;
+  ulong safe_index;
+  uint retry_count= 0;
+  const uint retry_max= 3;
+  PFS_statements_digest_stat **entry;
+  PFS_statements_digest_stat *pfs= NULL;
+
+  ulonglong now= microsecond_interval_timer();
+
+search:
+
+  /* Lookup LF_HASH using this new key. */
+  entry= reinterpret_cast<PFS_statements_digest_stat**>
+    (lf_hash_search(&digest_hash, pins,
+                    hash_key, PFS_MD5_SIZE));
+
+  if (entry && (entry != MY_ERRPTR))
+  {
+    /* If digest already exists, update stats and return. */
+    pfs= *entry;
+    pfs->m_last_seen= now;
+    lf_hash_search_unpin(pins);
+    return & pfs->m_stat;
+  }
+
+  lf_hash_search_unpin(pins);
+
+  /* Dirty read of digest_index */
+  if (digest_index == 0)
+  {
+    /*  digest_stat array is full. Add stat at index 0 and return. */
+    pfs= &statements_digest_stat_array[0];
+
+    if (pfs->m_first_seen == 0)
+      pfs->m_first_seen= now;
+    pfs->m_last_seen= now;
+    return & pfs->m_stat;
+  }
+
+  safe_index= PFS_atomic::add_u32(& digest_index, 1);
+  if (safe_index >= digest_max)
+  {
+    /* The digest array is now full. */
+    digest_index= 0;
+    pfs= &statements_digest_stat_array[0];
+
+    if (pfs->m_first_seen == 0)
+      pfs->m_first_seen= now;
+    pfs->m_last_seen= now;
+    return & pfs->m_stat;
+  }
+
+  /* Add a new record in digest stat array. */
+  pfs= &statements_digest_stat_array[safe_index];
+
+  /* Copy digest hash/LF Hash search key. */
+  memcpy(pfs->m_digest_hash.m_md5, md5.m_md5, PFS_MD5_SIZE);
+
+  /*
+    Copy digest storage to statement_digest_stat_array so that it could be
+    used later to generate digest text.
+  */
+  digest_copy(& pfs->m_digest_storage, digest_storage);
+
+  pfs->m_first_seen= now;
+  pfs->m_last_seen= now;
+
+  res= lf_hash_insert(&digest_hash, pins, &pfs);
+  if (likely(res == 0))
+  {
+    return & pfs->m_stat;
+  }
+
+  if (res > 0)
+  {
+    /* Duplicate insert by another thread */
+    if (++retry_count > retry_max)
+    {
+      /* Avoid infinite loops */
+      digest_lost++;
+      return NULL;
+    }
+    goto search;
+  }
+
+  /* OOM in lf_hash_insert */
+  digest_lost++;
+  return NULL;
+}
+
+void purge_digest(PFS_thread* thread, unsigned char* hash_key)
+{
+  LF_PINS *pins= get_digest_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return;
+
+  PFS_statements_digest_stat **entry;
+
+  /* Lookup LF_HASH using this new key. */
+  entry= reinterpret_cast<PFS_statements_digest_stat**>
+    (lf_hash_search(&digest_hash, pins,
+                    hash_key, PFS_MD5_SIZE));
+
+  if (entry && (entry != MY_ERRPTR))
+  { 
+    lf_hash_delete(&digest_hash, pins,
+                   hash_key, PFS_MD5_SIZE);
+  }
+  lf_hash_search_unpin(pins);
+  return;
+}
+
+void PFS_statements_digest_stat::reset_data()
+{
+  digest_reset(& m_digest_storage);
+  m_stat.reset();
+  m_first_seen= 0;
+  m_last_seen= 0;
+}
+
+void PFS_statements_digest_stat::reset_index(PFS_thread *thread)
+{
+  /* Only remove entries that exists in the HASH index. */
+  if (m_digest_storage.m_byte_count > 0)
+  {
+    purge_digest(thread, m_digest_hash.m_md5);
+  }
+}
+
+void reset_esms_by_digest()
+{
+  uint index;
+
+  if (statements_digest_stat_array == NULL)
+    return;
+
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return;
+
+  /* Reset statements_digest_stat_array. */
+  for (index= 0; index < digest_max; index++)
+  {
+    statements_digest_stat_array[index].reset_index(thread);
+    statements_digest_stat_array[index].reset_data();
+  }
+
+  /* 
+    Reset index which indicates where the next calculated digest information
+    to be inserted in statements_digest_stat_array.
+  */
+  digest_index= 1;
+}
+
+/*
+  Iterate token array and updates digest_text.
+*/
+void get_digest_text(char* digest_text, PSI_digest_storage* digest_storage)
+{
+  bool truncated= false;
+  int byte_count= digest_storage->m_byte_count;
+  int need_bytes;
+  uint tok= 0;
+  char *id_string;
+  int id_length;
+  int current_byte= 0;
+  lex_token_string *tok_data;
+  /* -4 is to make sure extra space for '...' and a '\0' at the end. */
+  int available_bytes_to_write= COL_DIGEST_TEXT_SIZE - 4;
+
+  DBUG_ASSERT(byte_count <= PSI_MAX_DIGEST_STORAGE_SIZE);
+
+  while ((current_byte < byte_count) &&
+         (available_bytes_to_write > 0) &&
+         (! truncated))
+  {
+    current_byte= read_token(digest_storage, current_byte, &tok);
+    tok_data= & lex_token_array[tok];
+    
+    switch (tok)
+    {
+    /* All identifiers are printed with their name. */
+    case IDENT:
+      current_byte= read_identifier(digest_storage, current_byte,
+                                    & id_string, & id_length);
+      need_bytes= id_length + 1; /* <id> space */
+      if (need_bytes <= available_bytes_to_write)
+      {
+        if (id_length > 0)
+        {
+          strncpy(digest_text, id_string, id_length);
+          digest_text+= id_length;
+        }
+        *digest_text= ' ';
+        digest_text++;
+        available_bytes_to_write-= need_bytes;
+      }
+      else
+      {
+        truncated= true;
+      }
+      break;
+    case IDENT_QUOTED:
+      current_byte= read_identifier(digest_storage, current_byte,
+                                    & id_string, & id_length);
+      need_bytes= id_length + 3; /* quote <id> quote space  */
+      if (need_bytes <= available_bytes_to_write)
+      {
+        *digest_text= '`';
+        digest_text++;
+        if (id_length > 0)
+        {
+          strncpy(digest_text, id_string, id_length);
+          digest_text+= id_length;
+        }
+        *digest_text= '`';
+        digest_text++;
+        *digest_text= ' ';
+        digest_text++;
+        available_bytes_to_write-= need_bytes;
+      }
+      else
+      {
+        truncated= true;
+      }
+      break;
+
+    /* Everything else is printed as is. */
+    default:
+      /* 
+        Make sure not to overflow digest_text buffer while writing
+        this token string.
+        +1 is to make sure extra space for ' '.
+      */
+      int tok_length= tok_data->m_token_length;
+      need_bytes= tok_length + 1;
+
+      if (need_bytes <= available_bytes_to_write)
+      {
+        strncpy(digest_text,
+                tok_data->m_token_string,
+                tok_length);
+        digest_text+= tok_length;
+        *digest_text= ' ';
+        digest_text++;
+        available_bytes_to_write-= need_bytes;
+      }
+      else
+      {
+        truncated= true;
+      }
+    }
+  }
+
+  /* Truncate digest text in case of long queries. */
+  if (digest_storage->m_full || truncated)
+  {
+    strcpy(digest_text, "...");
+    digest_text+= 3;
+  }
+
+  *digest_text= '\0';
+}
+
+static inline uint peek_token(const PSI_digest_storage *digest, int index)
+{
+  uint token;
+  DBUG_ASSERT(index >= 0);
+  DBUG_ASSERT(index + PFS_SIZE_OF_A_TOKEN <= digest->m_byte_count);
+  DBUG_ASSERT(digest->m_byte_count <=  PSI_MAX_DIGEST_STORAGE_SIZE);
+
+  token= ((digest->m_token_array[index + 1])<<8) | digest->m_token_array[index];
+  return token;
+}
+
+/**
+  Function to read last two tokens from token array. If an identifier
+  is found, do not look for token after that.
+*/
+static inline void peek_last_two_tokens(const PSI_digest_storage* digest_storage,
+                                        int last_id_index, uint *t1, uint *t2)
+{
+  int byte_count= digest_storage->m_byte_count;
+
+  if (last_id_index <= byte_count - PFS_SIZE_OF_A_TOKEN)
+  {
+    /* Take last token. */
+    *t1= peek_token(digest_storage, byte_count - PFS_SIZE_OF_A_TOKEN);
+  }
+  else
+  {
+    *t1= TOK_PFS_UNUSED;
+  }
+
+  if (last_id_index <= byte_count - 2*PFS_SIZE_OF_A_TOKEN)
+  {
+    /* Take 2nd token from last. */
+    *t2= peek_token(digest_storage, byte_count - 2 * PFS_SIZE_OF_A_TOKEN);
+  }
+  else
+  {
+    *t2= TOK_PFS_UNUSED;
+  }
+}
+
+struct PSI_digest_locker* pfs_digest_start_v1(PSI_statement_locker *locker)
+{
+  PSI_statement_locker_state *statement_state;
+  statement_state= reinterpret_cast<PSI_statement_locker_state*> (locker);
+  DBUG_ASSERT(statement_state != NULL);
+
+  if (statement_state->m_discarded)
+    return NULL;
+
+  if (statement_state->m_flags & STATE_FLAG_DIGEST)
+  {
+    PSI_digest_locker_state *digest_state;
+    digest_state= &statement_state->m_digest_state;
+    return reinterpret_cast<PSI_digest_locker*> (digest_state);
+  }
+
+  return NULL;
+}
+
+PSI_digest_locker* pfs_digest_add_token_v1(PSI_digest_locker *locker,
+                                           uint token,
+                                           OPAQUE_LEX_YYSTYPE *yylval)
+{
+  PSI_digest_locker_state *state= NULL;
+  PSI_digest_storage      *digest_storage= NULL;
+
+  state= reinterpret_cast<PSI_digest_locker_state*> (locker);
+  DBUG_ASSERT(state != NULL);
+
+  digest_storage= &state->m_digest_storage;
+
+  if (digest_storage->m_full)
+    return NULL;
+
+  /* 
+    Take last_token 2 tokens collected till now. These tokens will be used
+    in reduce for normalisation. Make sure not to consider ID tokens in reduce.
+  */
+  uint last_token;
+  uint last_token2;
+  
+  peek_last_two_tokens(digest_storage, state->m_last_id_index,
+                       &last_token, &last_token2);
+
+  switch (token)
+  {
+    case BIN_NUM:
+    case DECIMAL_NUM:
+    case FLOAT_NUM:
+    case HEX_NUM:
+    case LEX_HOSTNAME:
+    case LONG_NUM:
+    case NUM:
+    case TEXT_STRING:
+    case NCHAR_STRING:
+    case ULONGLONG_NUM:
+    {
+      /*
+        REDUCE:
+        TOK_PFS_GENERIC_VALUE := BIN_NUM | DECIMAL_NUM | ... | ULONGLONG_NUM
+      */
+      token= TOK_PFS_GENERIC_VALUE;
+
+      if ((last_token2 == TOK_PFS_GENERIC_VALUE ||
+           last_token2 == TOK_PFS_GENERIC_VALUE_LIST) &&
+          (last_token == ','))
+      {
+        /*
+          REDUCE:
+          TOK_PFS_GENERIC_VALUE_LIST :=
+            TOK_PFS_GENERIC_VALUE ',' TOK_PFS_GENERIC_VALUE
+          
+          REDUCE:
+          TOK_PFS_GENERIC_VALUE_LIST :=
+            TOK_PFS_GENERIC_VALUE_LIST ',' TOK_PFS_GENERIC_VALUE
+        */
+        digest_storage->m_byte_count-= 2*PFS_SIZE_OF_A_TOKEN;
+        token= TOK_PFS_GENERIC_VALUE_LIST;
+      }
+      /*
+        Add this token or the resulting reduce to digest storage.
+      */
+      store_token(digest_storage, token);
+      break;
+    }
+    case ')':
+    {
+      if (last_token == TOK_PFS_GENERIC_VALUE &&
+          last_token2 == '(') 
+      { 
+        /*
+          REDUCE:
+          TOK_PFS_ROW_SINGLE_VALUE :=
+            '(' TOK_PFS_GENERIC_VALUE ')' 
+        */
+        digest_storage->m_byte_count-= 2*PFS_SIZE_OF_A_TOKEN;
+        token= TOK_PFS_ROW_SINGLE_VALUE;
+      
+        /* Read last two tokens again */
+        peek_last_two_tokens(digest_storage, state->m_last_id_index,
+                             &last_token, &last_token2);
+
+        if ((last_token2 == TOK_PFS_ROW_SINGLE_VALUE ||
+             last_token2 == TOK_PFS_ROW_SINGLE_VALUE_LIST) &&
+            (last_token == ','))
+        {
+          /*
+            REDUCE:
+            TOK_PFS_ROW_SINGLE_VALUE_LIST := 
+              TOK_PFS_ROW_SINGLE_VALUE ',' TOK_PFS_ROW_SINGLE_VALUE
+
+            REDUCE:
+            TOK_PFS_ROW_SINGLE_VALUE_LIST := 
+              TOK_PFS_ROW_SINGLE_VALUE_LIST ',' TOK_PFS_ROW_SINGLE_VALUE
+          */
+          digest_storage->m_byte_count-= 2*PFS_SIZE_OF_A_TOKEN;
+          token= TOK_PFS_ROW_SINGLE_VALUE_LIST;
+        }
+      }
+      else if (last_token == TOK_PFS_GENERIC_VALUE_LIST &&
+               last_token2 == '(') 
+      {
+        /*
+          REDUCE:
+          TOK_PFS_ROW_MULTIPLE_VALUE :=
+            '(' TOK_PFS_GENERIC_VALUE_LIST ')'
+        */
+        digest_storage->m_byte_count-= 2*PFS_SIZE_OF_A_TOKEN;
+        token= TOK_PFS_ROW_MULTIPLE_VALUE;
+
+        /* Read last two tokens again */
+        peek_last_two_tokens(digest_storage, state->m_last_id_index,
+                             &last_token, &last_token2);
+
+        if ((last_token2 == TOK_PFS_ROW_MULTIPLE_VALUE ||
+             last_token2 == TOK_PFS_ROW_MULTIPLE_VALUE_LIST) &&
+            (last_token == ','))
+        {
+          /*
+            REDUCE:
+            TOK_PFS_ROW_MULTIPLE_VALUE_LIST :=
+              TOK_PFS_ROW_MULTIPLE_VALUE ',' TOK_PFS_ROW_MULTIPLE_VALUE
+
+            REDUCE:
+            TOK_PFS_ROW_MULTIPLE_VALUE_LIST :=
+              TOK_PFS_ROW_MULTIPLE_VALUE_LIST ',' TOK_PFS_ROW_MULTIPLE_VALUE
+          */
+          digest_storage->m_byte_count-= 2*PFS_SIZE_OF_A_TOKEN;
+          token= TOK_PFS_ROW_MULTIPLE_VALUE_LIST;
+        }
+      }
+      /*
+        Add this token or the resulting reduce to digest storage.
+      */
+      store_token(digest_storage, token);
+      break;
+    }
+    case IDENT:
+    case IDENT_QUOTED:
+    {
+      LEX_YYSTYPE *lex_token= (LEX_YYSTYPE*) yylval;
+      char *yytext= lex_token->lex_str.str;
+      int yylen= lex_token->lex_str.length;
+
+      /* Add this token and identifier string to digest storage. */
+      store_token_identifier(digest_storage, token, yylen, yytext);
+
+      /* Update the index of last identifier found. */
+      state->m_last_id_index= digest_storage->m_byte_count;
+      break;
+    }
+    default:
+    {
+      /* Add this token to digest storage. */
+      store_token(digest_storage, token);
+      break;
+    }
+  }
+
+  return locker;
+}
diff --git a/storage/perfschema/pfs_digest.h b/storage/perfschema/pfs_digest.h
new file mode 100644
index 00000000000..2646596171c
--- /dev/null
+++ b/storage/perfschema/pfs_digest.h
@@ -0,0 +1,221 @@
+/* Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef PFS_DIGEST_H
+#define PFS_DIGEST_H
+
+/**
+  @file storage/perfschema/pfs_digest.h
+  Statement Digest data structures (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "lf.h"
+#include "pfs_stat.h"
+
+#define PFS_SIZE_OF_A_TOKEN 2
+
+extern bool flag_statements_digest;
+extern ulong digest_max;
+extern ulong digest_lost;
+struct PFS_thread;
+
+/* Fixed, per MD5 hash. */
+#define PFS_MD5_SIZE 16
+
+/**
+  Structure to store a MD5 hash value (digest) for a statement.
+*/
+struct PFS_digest_hash
+{
+  unsigned char m_md5[PFS_MD5_SIZE];
+};
+
+/** A statement digest stat record. */
+struct PFS_statements_digest_stat
+{
+  /**
+    Digest MD5 Hash.
+  */
+  PFS_digest_hash m_digest_hash;
+
+  /**
+    Digest Storage.
+  */
+  PSI_digest_storage m_digest_storage;
+
+  /**
+    Statement stat.
+  */
+  PFS_statement_stat m_stat;
+
+  /**
+    First Seen/last seen.
+  */
+  ulonglong m_first_seen;
+  ulonglong m_last_seen;
+
+  /** Reset data for this record. */
+  void reset_data();
+  /** Reset data and remove index for this record. */
+  void reset_index(PFS_thread *thread);
+};
+
+int init_digest(const PFS_global_param *param);
+void cleanup_digest();
+
+int init_digest_hash(void);
+void cleanup_digest_hash(void);
+PFS_statement_stat* find_or_create_digest(PFS_thread*,
+                                          PSI_digest_storage*);
+
+void get_digest_text(char* digest_text, PSI_digest_storage*);
+
+void reset_esms_by_digest();
+
+/* Exposing the data directly, for iterators. */
+extern PFS_statements_digest_stat *statements_digest_stat_array;
+
+/* Instrumentation callbacks for pfs.cc */
+
+struct PSI_digest_locker* pfs_digest_start_v1(PSI_statement_locker *locker);
+PSI_digest_locker* pfs_digest_add_token_v1(PSI_digest_locker *locker,
+                                           uint token,
+                                           OPAQUE_LEX_YYSTYPE *yylval);
+
+static inline void digest_reset(PSI_digest_storage *digest)
+{
+  digest->m_full= false;
+  digest->m_byte_count= 0;
+}
+
+static inline void digest_copy(PSI_digest_storage *to, const PSI_digest_storage *from)
+{
+  if (from->m_byte_count > 0)
+  {
+    to->m_full= from->m_full;
+    to->m_byte_count= from->m_byte_count;
+    DBUG_ASSERT(to->m_byte_count <= PSI_MAX_DIGEST_STORAGE_SIZE);
+    memcpy(to->m_token_array, from->m_token_array, to->m_byte_count);
+  }
+  else
+  {
+    DBUG_ASSERT(! from->m_full);
+    DBUG_ASSERT(from->m_byte_count == 0);
+    to->m_full= false;
+    to->m_byte_count= 0;
+  }
+}
+
+/** 
+  Function to read a single token from token array.
+*/
+inline int read_token(PSI_digest_storage *digest_storage,
+                      int index, uint *tok)
+{
+  DBUG_ASSERT(index <= digest_storage->m_byte_count);
+  DBUG_ASSERT(digest_storage->m_byte_count <= PSI_MAX_DIGEST_STORAGE_SIZE);
+
+  if (index + PFS_SIZE_OF_A_TOKEN <= digest_storage->m_byte_count)
+  {
+    unsigned char *src= & digest_storage->m_token_array[index];
+    *tok= src[0] | (src[1] << 8);
+    return index + PFS_SIZE_OF_A_TOKEN;
+  }
+
+  /* The input byte stream is exhausted. */
+  *tok= 0;
+  return PSI_MAX_DIGEST_STORAGE_SIZE + 1;
+}
+
+/**
+  Function to store a single token in token array.
+*/
+inline void store_token(PSI_digest_storage* digest_storage, uint token)
+{
+  DBUG_ASSERT(digest_storage->m_byte_count >= 0);
+  DBUG_ASSERT(digest_storage->m_byte_count <= PSI_MAX_DIGEST_STORAGE_SIZE);
+
+  if (digest_storage->m_byte_count + PFS_SIZE_OF_A_TOKEN <= PSI_MAX_DIGEST_STORAGE_SIZE)
+  {
+    unsigned char* dest= & digest_storage->m_token_array[digest_storage->m_byte_count];
+    dest[0]= token & 0xff;
+    dest[1]= (token >> 8) & 0xff;
+    digest_storage->m_byte_count+= PFS_SIZE_OF_A_TOKEN; 
+  }
+  else
+  {
+    digest_storage->m_full= true;
+  }
+}
+
+/**
+  Function to read an identifier from token array.
+*/
+inline int read_identifier(PSI_digest_storage* digest_storage,
+                           int index, char ** id_string, int *id_length)
+{
+  int new_index;
+  DBUG_ASSERT(index <= digest_storage->m_byte_count);
+  DBUG_ASSERT(digest_storage->m_byte_count <= PSI_MAX_DIGEST_STORAGE_SIZE);
+
+  /*
+    token + length + string are written in an atomic way,
+    so we do always expect a length + string here
+  */
+  unsigned char *src= & digest_storage->m_token_array[index];
+  uint length= src[0] | (src[1] << 8);
+  *id_string= (char *) (src + 2);
+  *id_length= length;
+
+  new_index= index + PFS_SIZE_OF_A_TOKEN + length;
+  DBUG_ASSERT(new_index <= digest_storage->m_byte_count);
+  return new_index;
+}
+
+/**
+  Function to store an identifier in token array.
+*/
+inline void store_token_identifier(PSI_digest_storage* digest_storage,
+                                   uint token,
+                                   uint id_length, const char *id_name)
+{
+  DBUG_ASSERT(digest_storage->m_byte_count >= 0);
+  DBUG_ASSERT(digest_storage->m_byte_count <= PSI_MAX_DIGEST_STORAGE_SIZE);
+
+  uint bytes_needed= 2 * PFS_SIZE_OF_A_TOKEN + id_length;
+  if (digest_storage->m_byte_count + bytes_needed <= PSI_MAX_DIGEST_STORAGE_SIZE)
+  {
+    unsigned char* dest= & digest_storage->m_token_array[digest_storage->m_byte_count];
+    /* Write the token */
+    dest[0]= token & 0xff;
+    dest[1]= (token >> 8) & 0xff;
+    /* Write the string length */
+    dest[2]= id_length & 0xff;
+    dest[3]= (id_length >> 8) & 0xff;
+    /* Write the string data */
+    if (id_length > 0)
+    {
+      strncpy((char *)(dest + 4), id_name, id_length);
+    }
+    digest_storage->m_byte_count+= bytes_needed; 
+  }
+  else
+  {
+    digest_storage->m_full= true;
+  }
+}
+
+#endif
diff --git a/storage/perfschema/pfs_engine_table.cc b/storage/perfschema/pfs_engine_table.cc
index 38f6df3003d..884a0f9e848 100644
--- a/storage/perfschema/pfs_engine_table.cc
+++ b/storage/perfschema/pfs_engine_table.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -18,24 +18,66 @@
   Performance schema tables (implementation).
 */
 
+#include "my_global.h"
+#include "my_pthread.h"
 #include "pfs_engine_table.h"
 
 #include "table_events_waits.h"
+#include "table_setup_actors.h"
 #include "table_setup_consumers.h"
 #include "table_setup_instruments.h"
+#include "table_setup_objects.h"
 #include "table_setup_timers.h"
 #include "table_performance_timers.h"
-#include "table_threads.h"
 #include "table_events_waits_summary.h"
+#include "table_ews_by_thread_by_event_name.h"
 #include "table_ews_global_by_event_name.h"
+#include "table_host_cache.h"
+#include "table_os_global_by_type.h"
 #include "table_sync_instances.h"
 #include "table_file_instances.h"
-#include "table_file_summary.h"
+#include "table_file_summary_by_instance.h"
+#include "table_file_summary_by_event_name.h"
+#include "table_threads.h"
+
+#include "table_ews_by_host_by_event_name.h"
+#include "table_ews_by_user_by_event_name.h"
+#include "table_ews_by_account_by_event_name.h"
+#include "table_tiws_by_index_usage.h"
+#include "table_tiws_by_table.h"
+#include "table_tlws_by_table.h"
+
+#include "table_events_stages.h"
+#include "table_esgs_by_thread_by_event_name.h"
+#include "table_esgs_by_host_by_event_name.h"
+#include "table_esgs_by_user_by_event_name.h"
+#include "table_esgs_by_account_by_event_name.h"
+#include "table_esgs_global_by_event_name.h"
+
+#include "table_events_statements.h"
+#include "table_esms_by_thread_by_event_name.h"
+#include "table_esms_by_host_by_event_name.h"
+#include "table_esms_by_user_by_event_name.h"
+#include "table_esms_by_account_by_event_name.h"
+#include "table_esms_global_by_event_name.h"
+#include "table_esms_by_digest.h"
+
+#include "table_users.h"
+#include "table_accounts.h"
+#include "table_hosts.h"
+
+#include "table_socket_instances.h"
+#include "table_socket_summary_by_instance.h"
+#include "table_socket_summary_by_event_name.h"
 
 /* For show status */
 #include "pfs_column_values.h"
+#include "pfs_instr_class.h"
 #include "pfs_instr.h"
+#include "pfs_setup_actor.h"
+#include "pfs_setup_object.h"
 #include "pfs_global.h"
+#include "pfs_digest.h"
 
 #include "sql_base.h"                           // close_thread_tables
 #include "lock.h"                               // MYSQL_LOCK_IGNORE_TIMEOUT
@@ -47,23 +89,62 @@
 
 static PFS_engine_table_share *all_shares[]=
 {
+  &table_cond_instances::m_share,
   &table_events_waits_current::m_share,
   &table_events_waits_history::m_share,
   &table_events_waits_history_long::m_share,
-  &table_setup_consumers::m_share,
-  &table_setup_instruments::m_share,
-  &table_setup_timers::m_share,
-  &table_performance_timers::m_share,
-  &table_threads::m_share,
-  &table_events_waits_summary_by_thread_by_event_name::m_share,
+  &table_ews_by_host_by_event_name::m_share,
   &table_events_waits_summary_by_instance::m_share,
+  &table_ews_by_thread_by_event_name::m_share,
+  &table_ews_by_user_by_event_name::m_share,
+  &table_ews_by_account_by_event_name::m_share,
   &table_ews_global_by_event_name::m_share,
+  &table_file_instances::m_share,
   &table_file_summary_by_event_name::m_share,
   &table_file_summary_by_instance::m_share,
+#ifdef QQ_NOT_YET
+  &table_host_cache::m_share,
+#endif
   &table_mutex_instances::m_share,
+  &table_os_global_by_type::m_share,
+  &table_performance_timers::m_share,
   &table_rwlock_instances::m_share,
-  &table_cond_instances::m_share,
-  &table_file_instances::m_share,
+  &table_setup_actors::m_share,
+  &table_setup_consumers::m_share,
+  &table_setup_instruments::m_share,
+  &table_setup_objects::m_share,
+  &table_setup_timers::m_share,
+  &table_tiws_by_index_usage::m_share,
+  &table_tiws_by_table::m_share,
+  &table_tlws_by_table::m_share,
+  &table_threads::m_share,
+
+  &table_events_stages_current::m_share,
+  &table_events_stages_history::m_share,
+  &table_events_stages_history_long::m_share,
+  &table_esgs_by_thread_by_event_name::m_share,
+  &table_esgs_by_account_by_event_name::m_share,
+  &table_esgs_by_user_by_event_name::m_share,
+  &table_esgs_by_host_by_event_name::m_share,
+  &table_esgs_global_by_event_name::m_share,
+
+  &table_events_statements_current::m_share,
+  &table_events_statements_history::m_share,
+  &table_events_statements_history_long::m_share,
+  &table_esms_by_thread_by_event_name::m_share,
+  &table_esms_by_account_by_event_name::m_share,
+  &table_esms_by_user_by_event_name::m_share,
+  &table_esms_by_host_by_event_name::m_share,
+  &table_esms_global_by_event_name::m_share,
+  &table_esms_by_digest::m_share,
+
+  &table_users::m_share,
+  &table_accounts::m_share,
+  &table_hosts::m_share,
+
+  &table_socket_instances::m_share,
+  &table_socket_summary_by_instance::m_share,
+  &table_socket_summary_by_event_name::m_share,
   NULL
 };
 
@@ -78,13 +159,14 @@ void PFS_engine_table_share::check_all_tables(THD *thd)
   DBUG_EXECUTE_IF("tampered_perfschema_table1",
                   {
                     /* Hack SETUP_INSTRUMENT, incompatible change. */
-                    all_shares[4]->m_field_def->count++;
+                    all_shares[20]->m_field_def->count++;
                   });
 
   for (current= &all_shares[0]; (*current) != NULL; current++)
     (*current)->check_one_table(thd);
 }
 
+/** Error reporting for schema integrity checks. */
 class PFS_check_intact : public Table_check_intact
 {
 protected:
@@ -169,6 +251,42 @@ void PFS_engine_table_share::delete_all_locks(void)
     thr_lock_delete((*current)->m_thr_lock_ptr);
 }
 
+ha_rows PFS_engine_table_share::get_row_count(void) const
+{
+  /* If available, count the exact number or records */
+  if (m_get_row_count)
+    return m_get_row_count();
+  /* Otherwise, return an estimate */
+  return m_records;
+}
+
+int PFS_engine_table_share::write_row(TABLE *table, unsigned char *buf,
+                                      Field **fields) const
+{
+  my_bitmap_map *org_bitmap;
+
+  /*
+    Make sure the table structure is as expected before mapping
+    hard wired columns in m_write_row.
+  */
+  if (! m_checked)
+  {
+    return HA_ERR_TABLE_NEEDS_UPGRADE;
+  }
+
+  if (m_write_row == NULL)
+  {
+    return HA_ERR_WRONG_COMMAND;
+  }
+
+  /* We internally read from Fields to support the write interface */
+  org_bitmap= dbug_tmp_use_all_columns(table, table->read_set);
+  int result= m_write_row(table, buf, fields);
+  dbug_tmp_restore_column_map(table->read_set, org_bitmap);
+
+  return result;
+}
+
 static int compare_table_names(const char *name1, const char *name2)
 {
   /*
@@ -288,6 +406,36 @@ int PFS_engine_table::update_row(TABLE *table,
   return result;
 }
 
+int PFS_engine_table::delete_row(TABLE *table,
+                                 const unsigned char *buf,
+                                 Field **fields)
+{
+  my_bitmap_map *org_bitmap;
+
+  /*
+    Make sure the table structure is as expected before mapping
+    hard wired columns in delete_row_values.
+  */
+  if (! m_share_ptr->m_checked)
+  {
+    return HA_ERR_TABLE_NEEDS_UPGRADE;
+  }
+
+  /* We internally read from Fields to support the delete interface */
+  org_bitmap= dbug_tmp_use_all_columns(table, table->read_set);
+  int result= delete_row_values(table, buf, fields);
+  dbug_tmp_restore_column_map(table->read_set, org_bitmap);
+
+  return result;
+}
+
+int PFS_engine_table::delete_row_values(TABLE *,
+                                        const unsigned char *,
+                                        Field **)
+{
+  return HA_ERR_WRONG_COMMAND;
+}
+
 /**
   Get the position of the current row.
   @param [out] ref        position
@@ -306,6 +454,19 @@ void PFS_engine_table::set_position(const void *ref)
   memcpy(m_pos_ptr, ref, m_share_ptr->m_ref_length);
 }
 
+/**
+  Get the timer normalizer and class type for the current row.
+  @param [in] instr_class    class
+*/
+void PFS_engine_table::get_normalizer(PFS_instr_class *instr_class)
+{
+  if (instr_class->m_type != m_class_type)
+  {
+    m_normalizer= time_normalizer::get(*instr_class->m_timer);
+    m_class_type= instr_class->m_type;
+  }
+}
+
 void PFS_engine_table::set_field_ulong(Field *f, ulong value)
 {
   DBUG_ASSERT(f->real_type() == MYSQL_TYPE_LONG);
@@ -320,6 +481,14 @@ void PFS_engine_table::set_field_ulonglong(Field *f, ulonglong value)
   f2->store(value, true);
 }
 
+void PFS_engine_table::set_field_char_utf8(Field *f, const char* str,
+                                           uint len)
+{
+  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_STRING);
+  Field_string *f2= (Field_string*) f;
+  f2->store(str, len, &my_charset_utf8_bin);
+}
+
 void PFS_engine_table::set_field_varchar_utf8(Field *f, const char* str,
                                               uint len)
 {
@@ -328,6 +497,14 @@ void PFS_engine_table::set_field_varchar_utf8(Field *f, const char* str,
   f2->store(str, len, &my_charset_utf8_bin);
 }
 
+void PFS_engine_table::set_field_longtext_utf8(Field *f, const char* str,
+                                               uint len)
+{
+  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_BLOB);
+  Field_blob *f2= (Field_blob*) f;
+  f2->store(str, len, &my_charset_utf8_bin);
+}
+
 void PFS_engine_table::set_field_enum(Field *f, ulonglong value)
 {
   DBUG_ASSERT(f->real_type() == MYSQL_TYPE_ENUM);
@@ -335,6 +512,13 @@ void PFS_engine_table::set_field_enum(Field *f, ulonglong value)
   f2->store_type(value);
 }
 
+void PFS_engine_table::set_field_timestamp(Field *f, ulonglong value)
+{
+  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_TIMESTAMP);
+  Field_timestamp *f2= (Field_timestamp*) f;
+  f2->store_TIME((long)(value / 1000000), (value % 1000000));
+}
+
 ulonglong PFS_engine_table::get_field_enum(Field *f)
 {
   DBUG_ASSERT(f->real_type() == MYSQL_TYPE_ENUM);
@@ -342,6 +526,24 @@ ulonglong PFS_engine_table::get_field_enum(Field *f)
   return f2->val_int();
 }
 
+String*
+PFS_engine_table::get_field_char_utf8(Field *f, String *val)
+{
+  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_STRING);
+  Field_string *f2= (Field_string*) f;
+  val= f2->val_str(NULL, val);
+  return val;
+}
+
+String*
+PFS_engine_table::get_field_varchar_utf8(Field *f, String *val)
+{
+  DBUG_ASSERT(f->real_type() == MYSQL_TYPE_VARCHAR);
+  Field_varstring *f2= (Field_varstring*) f;
+  val= f2->val_str(NULL, val);
+  return val;
+}
+
 int PFS_engine_table::update_row_values(TABLE *,
                                         const unsigned char *,
                                         unsigned char *,
@@ -350,6 +552,7 @@ int PFS_engine_table::update_row_values(TABLE *,
   return HA_ERR_WRONG_COMMAND;
 }
 
+/** Implementation of internal ACL checks, for the performance schema. */
 class PFS_internal_schema_access : public ACL_internal_schema_access
 {
 public:
@@ -533,11 +736,11 @@ bool pfs_show_status(handlerton *hton, THD *thd,
     switch (i){
     case 0:
       name= "events_waits_current.row_size";
-      size= sizeof(PFS_wait_locker);
+      size= sizeof(PFS_events_waits);
       break;
     case 1:
       name= "events_waits_current.row_count";
-      size= LOCKER_STACK_SIZE * thread_max;
+      size= WAIT_STACK_SIZE * thread_max;
       break;
     case 2:
       name= "events_waits_history.row_size";
@@ -711,15 +914,15 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       break;
     case 41:
       name= "events_waits_summary_by_thread_by_event_name.row_size";
-      size= sizeof(PFS_single_stat_chain);
+      size= sizeof(PFS_single_stat);
       break;
     case 42:
       name= "events_waits_summary_by_thread_by_event_name.row_count";
-      size= thread_max * instr_class_per_thread;
+      size= thread_max * wait_class_max;
       break;
     case 43:
       name= "events_waits_summary_by_thread_by_event_name.memory";
-      size= thread_max * instr_class_per_thread * sizeof(PFS_single_stat_chain);
+      size= thread_max * wait_class_max * sizeof(PFS_single_stat);
       total_memory+= size;
       break;
     case 44:
@@ -748,11 +951,390 @@ bool pfs_show_status(handlerton *hton, THD *thd,
       size= table_max * sizeof(PFS_table);
       total_memory+= size;
       break;
+    case 50:
+      name= "setup_actors.row_size";
+      size= sizeof(PFS_setup_actor);
+      break;
+    case 51:
+      name= "setup_actors.row_count";
+      size= setup_actor_max;
+      break;
+    case 52:
+      name= "setup_actors.memory";
+      size= setup_actor_max * sizeof(PFS_setup_actor);
+      total_memory+= size;
+      break;
+    case 53:
+      name= "setup_objects.row_size";
+      size= sizeof(PFS_setup_object);
+      break;
+    case 54:
+      name= "setup_objects.row_count";
+      size= setup_object_max;
+      break;
+    case 55:
+      name= "setup_objects.memory";
+      size= setup_object_max * sizeof(PFS_setup_object);
+      total_memory+= size;
+      break;
+    case 56:
+      name= "events_waits_summary_global_by_event_name.row_size";
+      size= sizeof(PFS_single_stat);
+      break;
+    case 57:
+      name= "events_waits_summary_global_by_event_name.row_count";
+      size= wait_class_max;
+      break;
+    case 58:
+      name= "events_waits_summary_global_by_event_name.memory";
+      size= wait_class_max * sizeof(PFS_single_stat);
+      total_memory+= size;
+      break;
+    case 59:
+      name= "(pfs_account).row_size";
+      size= sizeof(PFS_account);
+      break;
+    case 60:
+      name= "(pfs_account).row_count";
+      size= account_max;
+      break;
+    case 61:
+      name= "(pfs_account).memory";
+      size= account_max * sizeof(PFS_account);
+      total_memory+= size;
+      break;
+    case 62:
+      name= "events_waits_summary_by_account_by_event_name.row_size";
+      size= sizeof(PFS_single_stat);
+      break;
+    case 63:
+      name= "events_waits_summary_by_account_by_event_name.row_count";
+      size= account_max * wait_class_max;
+      break;
+    case 64:
+      name= "events_waits_summary_by_account_by_event_name.memory";
+      size= account_max * wait_class_max * sizeof(PFS_single_stat);
+      total_memory+= size;
+      break;
+    case 65:
+      name= "events_waits_summary_by_user_by_event_name.row_size";
+      size= sizeof(PFS_single_stat);
+      break;
+    case 66:
+      name= "events_waits_summary_by_user_by_event_name.row_count";
+      size= user_max * wait_class_max;
+      break;
+    case 67:
+      name= "events_waits_summary_by_user_by_event_name.memory";
+      size= user_max * wait_class_max * sizeof(PFS_single_stat);
+      total_memory+= size;
+      break;
+    case 68:
+      name= "events_waits_summary_by_host_by_event_name.row_size";
+      size= sizeof(PFS_single_stat);
+      break;
+    case 69:
+      name= "events_waits_summary_by_host_by_event_name.row_count";
+      size= host_max * wait_class_max;
+      break;
+    case 70:
+      name= "events_waits_summary_by_host_by_event_name.memory";
+      size= host_max * wait_class_max * sizeof(PFS_single_stat);
+      total_memory+= size;
+      break;
+    case 71:
+      name= "(pfs_user).row_size";
+      size= sizeof(PFS_user);
+      break;
+    case 72:
+      name= "(pfs_user).row_count";
+      size= user_max;
+      break;
+    case 73:
+      name= "(pfs_user).memory";
+      size= user_max * sizeof(PFS_user);
+      total_memory+= size;
+      break;
+    case 74:
+      name= "(pfs_host).row_size";
+      size= sizeof(PFS_host);
+      break;
+    case 75:
+      name= "(pfs_host).row_count";
+      size= host_max;
+      break;
+    case 76:
+      name= "(pfs_host).memory";
+      size= host_max * sizeof(PFS_host);
+      total_memory+= size;
+      break;
+    case 77:
+      name= "(pfs_stage_class).row_size";
+      size= sizeof(PFS_stage_class);
+      break;
+    case 78:
+      name= "(pfs_stage_class).row_count";
+      size= stage_class_max;
+      break;
+    case 79:
+      name= "(pfs_stage_class).memory";
+      size= stage_class_max * sizeof(PFS_stage_class);
+      total_memory+= size;
+      break;
+    case 80:
+      name= "events_stages_history.row_size";
+      size= sizeof(PFS_events_stages);
+      break;
+    case 81:
+      name= "events_stages_history.row_count";
+      size= events_stages_history_per_thread * thread_max;
+      break;
+    case 82:
+      name= "events_stages_history.memory";
+      size= events_stages_history_per_thread * thread_max
+        * sizeof(PFS_events_stages);
+      total_memory+= size;
+      break;
+    case 83:
+      name= "events_stages_history_long.row_size";
+      size= sizeof(PFS_events_stages);
+      break;
+    case 84:
+      name= "events_stages_history_long.row_count";
+      size= events_stages_history_long_size;
+      break;
+    case 85:
+      name= "events_stages_history_long.memory";
+      size= events_stages_history_long_size * sizeof(PFS_events_stages);
+      total_memory+= size;
+      break;
+    case 86:
+      name= "events_stages_summary_by_thread_by_event_name.row_size";
+      size= sizeof(PFS_stage_stat);
+      break;
+    case 87:
+      name= "events_stages_summary_by_thread_by_event_name.row_count";
+      size= thread_max * stage_class_max;
+      break;
+    case 88:
+      name= "events_stages_summary_by_thread_by_event_name.memory";
+      size= thread_max * stage_class_max * sizeof(PFS_stage_stat);
+      total_memory+= size;
+      break;
+    case 89:
+      name= "events_stages_summary_global_by_event_name.row_size";
+      size= sizeof(PFS_stage_stat);
+      break;
+    case 90:
+      name= "events_stages_summary_global_by_event_name.row_count";
+      size= stage_class_max;
+      break;
+    case 91:
+      name= "events_stages_summary_global_by_event_name.memory";
+      size= stage_class_max * sizeof(PFS_stage_stat);
+      total_memory+= size;
+      break;
+    case 92:
+      name= "events_stages_summary_by_account_by_event_name.row_size";
+      size= sizeof(PFS_stage_stat);
+      break;
+    case 93:
+      name= "events_stages_summary_by_account_by_event_name.row_count";
+      size= account_max * stage_class_max;
+      break;
+    case 94:
+      name= "events_stages_summary_by_account_by_event_name.memory";
+      size= account_max * stage_class_max * sizeof(PFS_stage_stat);
+      total_memory+= size;
+      break;
+    case 95:
+      name= "events_stages_summary_by_user_by_event_name.row_size";
+      size= sizeof(PFS_stage_stat);
+      break;
+    case 96:
+      name= "events_stages_summary_by_user_by_event_name.row_count";
+      size= user_max * stage_class_max;
+      break;
+    case 97:
+      name= "events_stages_summary_by_user_by_event_name.memory";
+      size= user_max * stage_class_max * sizeof(PFS_stage_stat);
+      total_memory+= size;
+      break;
+    case 98:
+      name= "events_stages_summary_by_host_by_event_name.row_size";
+      size= sizeof(PFS_stage_stat);
+      break;
+    case 99:
+      name= "events_stages_summary_by_host_by_event_name.row_count";
+      size= host_max * stage_class_max;
+      break;
+    case 100:
+      name= "events_stages_summary_by_host_by_event_name.memory";
+      size= host_max * stage_class_max * sizeof(PFS_stage_stat);
+      total_memory+= size;
+      break;
+    case 101:
+      name= "(pfs_statement_class).row_size";
+      size= sizeof(PFS_statement_class);
+      break;
+    case 102:
+      name= "(pfs_statement_class).row_count";
+      size= statement_class_max;
+      break;
+    case 103:
+      name= "(pfs_statement_class).memory";
+      size= statement_class_max * sizeof(PFS_statement_class);
+      total_memory+= size;
+      break;
+    case 104:
+      name= "events_statements_history.row_size";
+      size= sizeof(PFS_events_statements);
+      break;
+    case 105:
+      name= "events_statements_history.row_count";
+      size= events_statements_history_per_thread * thread_max;
+      break;
+    case 106:
+      name= "events_statements_history.memory";
+      size= events_statements_history_per_thread * thread_max
+        * sizeof(PFS_events_statements);
+      total_memory+= size;
+      break;
+    case 107:
+      name= "events_statements_history_long.row_size";
+      size= sizeof(PFS_events_statements);
+      break;
+    case 108:
+      name= "events_statements_history_long.row_count";
+      size= events_statements_history_long_size;
+      break;
+    case 109:
+      name= "events_statements_history_long.memory";
+      size= events_statements_history_long_size * sizeof(PFS_events_statements);
+      total_memory+= size;
+      break;
+    case 110:
+      name= "events_statements_summary_by_thread_by_event_name.row_size";
+      size= sizeof(PFS_statement_stat);
+      break;
+    case 111:
+      name= "events_statements_summary_by_thread_by_event_name.row_count";
+      size= thread_max * statement_class_max;
+      break;
+    case 112:
+      name= "events_statements_summary_by_thread_by_event_name.memory";
+      size= thread_max * statement_class_max * sizeof(PFS_statement_stat);
+      total_memory+= size;
+      break;
+    case 113:
+      name= "events_statements_summary_global_by_event_name.row_size";
+      size= sizeof(PFS_statement_stat);
+      break;
+    case 114:
+      name= "events_statements_summary_global_by_event_name.row_count";
+      size= statement_class_max;
+      break;
+    case 115:
+      name= "events_statements_summary_global_by_event_name.memory";
+      size= statement_class_max * sizeof(PFS_statement_stat);
+      total_memory+= size;
+      break;
+    case 116:
+      name= "events_statements_summary_by_account_by_event_name.row_size";
+      size= sizeof(PFS_statement_stat);
+      break;
+    case 117:
+      name= "events_statements_summary_by_account_by_event_name.row_count";
+      size= account_max * statement_class_max;
+      break;
+    case 118:
+      name= "events_statements_summary_by_account_by_event_name.memory";
+      size= account_max * statement_class_max * sizeof(PFS_statement_stat);
+      total_memory+= size;
+      break;
+    case 119:
+      name= "events_statements_summary_by_user_by_event_name.row_size";
+      size= sizeof(PFS_statement_stat);
+      break;
+    case 120:
+      name= "events_statements_summary_by_user_by_event_name.row_count";
+      size= user_max * statement_class_max;
+      break;
+    case 121:
+      name= "events_statements_summary_by_user_by_event_name.memory";
+      size= user_max * statement_class_max * sizeof(PFS_statement_stat);
+      total_memory+= size;
+      break;
+    case 122:
+      name= "events_statements_summary_by_host_by_event_name.row_size";
+      size= sizeof(PFS_statement_stat);
+      break;
+    case 123:
+      name= "events_statements_summary_by_host_by_event_name.row_count";
+      size= host_max * statement_class_max;
+      break;
+    case 124:
+      name= "events_statements_summary_by_host_by_event_name.memory";
+      size= host_max * statement_class_max * sizeof(PFS_statement_stat);
+      total_memory+= size;
+      break;
+    case 125:
+      name= "events_statements_current.row_size";
+      size= sizeof(PFS_events_statements);
+      break;
+    case 126:
+      name= "events_statements_current.row_count";
+      size= thread_max * statement_stack_max;
+      break;
+    case 127:
+      name= "events_statements_current.memory";
+      size= thread_max * statement_stack_max * sizeof(PFS_events_statements);
+      total_memory+= size;
+      break;
+    case 128:
+      name= "(pfs_socket_class).row_size";
+      size= sizeof(PFS_socket_class);
+      break;
+    case 129:
+      name= "(pfs_socket_class).row_count";
+      size= socket_class_max;
+      break;
+    case 130:
+      name= "(pfs_socket_class).memory";
+      size= socket_class_max * sizeof(PFS_socket_class);
+      total_memory+= size;
+      break;
+    case 131:
+      name= "socket_instances.row_size";
+      size= sizeof(PFS_socket);
+      break;
+    case 132:
+      name= "socket_instances.row_count";
+      size= socket_max;
+      break;
+    case 133:
+      name= "socket_instances.memory";
+      size= socket_max * sizeof(PFS_socket);
+      total_memory+= size;
+      break;
+    case 134:
+      name= "events_statements_summary_by_digest.row_size";
+      size= sizeof(PFS_statements_digest_stat);
+      break;
+    case 135:
+      name= "events_statements_summary_by_digest.row_count";
+      size= digest_max;
+      break;
+    case 136:
+      name= "events_statements_summary_by_digest.memory";
+      size= digest_max * sizeof(PFS_statements_digest_stat);
+      total_memory+= size;
+      break;    
     /*
       This case must be last,
       for aggregation in total_memory.
     */
-    case 50:
+    case 137:
       name= "performance_schema.memory";
       size= total_memory;
       /* This will fail if something is not advertised here */
@@ -777,4 +1359,3 @@ end:
 
 /** @} */
 
-
diff --git a/storage/perfschema/pfs_engine_table.h b/storage/perfschema/pfs_engine_table.h
index ec73c5a3688..40f5404d0b7 100644
--- a/storage/perfschema/pfs_engine_table.h
+++ b/storage/perfschema/pfs_engine_table.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -22,8 +22,10 @@
   Performance schema tables (declarations).
 */
 
+#include "pfs_instr_class.h"
 class Field;
 struct PFS_engine_table_share;
+struct time_normalizer;
 
 /**
   @addtogroup Performance_schema_engine
@@ -46,6 +48,18 @@ public:
   int update_row(TABLE *table, const unsigned char *old_buf,
                  unsigned char *new_buf, Field **fields);
 
+  /**
+    Delete a row from this table.
+    @param table Table handle
+    @param buf the row buffer to delete
+    @param fields Table fields
+    @return 0 on success
+  */
+  int delete_row(TABLE *table, const unsigned char *buf, Field **fields);
+
+  /** Initialize table scan. */
+  virtual int rnd_init(bool scan){return 0;};
+
   /** Fetch the next row in this cursor. */
   virtual int rnd_next(void)= 0;
   /**
@@ -56,12 +70,82 @@ public:
 
   void get_position(void *ref);
   void set_position(const void *ref);
+  /** Reset the cursor position to the beginning of the table. */
   virtual void reset_position(void)= 0;
 
+  /** Get the normalizer and class type for the current row. */
+  void get_normalizer(PFS_instr_class *instr_class);
+
   /** Destructor. */
   virtual ~PFS_engine_table()
   {}
 
+  /**
+    Helper, assign a value to a ulong field.
+    @param f the field to set
+    @param value the value to assign
+  */
+  static void set_field_ulong(Field *f, ulong value);
+  /**
+    Helper, assign a value to a ulonglong field.
+    @param f the field to set
+    @param value the value to assign
+  */
+  static void set_field_ulonglong(Field *f, ulonglong value);
+  /**
+    Helper, assign a value to a char utf8 field.
+    @param f the field to set
+    @param str the string to assign
+    @param len the length of the string to assign
+  */
+  static void set_field_char_utf8(Field *f, const char *str, uint len);
+  /**
+    Helper, assign a value to a varchar utf8 field.
+    @param f the field to set
+    @param str the string to assign
+    @param len the length of the string to assign
+  */
+  static void set_field_varchar_utf8(Field *f, const char *str, uint len);
+  /**
+    Helper, assign a value to a longtext utf8 field.
+    @param f the field to set
+    @param str the string to assign
+    @param len the length of the string to assign
+  */
+  static void set_field_longtext_utf8(Field *f, const char *str, uint len);
+  /**
+    Helper, assign a value to an enum field.
+    @param f the field to set
+    @param value the value to assign
+  */
+  static void set_field_enum(Field *f, ulonglong value);
+  /**
+    Helper, assign a value to a timestamp field.
+    @param f the field to set
+    @param value the value to assign
+  */
+  static void set_field_timestamp(Field *f, ulonglong value);
+  /**
+    Helper, read a value from an enum field.
+    @param f the field to read
+    @return the field value
+  */
+  static ulonglong get_field_enum(Field *f);
+  /**
+    Helper, read a value from a char utf8 field.
+    @param f the field to read
+    @param[out] val the field value
+    @return the field value
+  */
+  static String *get_field_char_utf8(Field *f, String *val);
+  /**
+    Helper, read a value from a varchar utf8 field.
+    @param f the field to read
+    @param[out] val the field value
+    @return the field value
+  */
+  static String *get_field_varchar_utf8(Field *f, String *val);
+
 protected:
   /**
     Read the current row values.
@@ -84,25 +168,32 @@ protected:
                                 unsigned char *new_buf, Field **fields);
 
   /**
+    Delete a row.
+    @param table            Table handle
+    @param buf              Row buffer
+    @param fields           Table fields
+  */
+  virtual int delete_row_values(TABLE *table, const unsigned char *buf,
+                                Field **fields);
+
+  /**
     Constructor.
     @param share            table share
     @param pos              address of the m_pos position member
   */
   PFS_engine_table(const PFS_engine_table_share *share, void *pos)
-    : m_share_ptr(share), m_pos_ptr(pos)
+    : m_share_ptr(share), m_pos_ptr(pos),
+      m_normalizer(NULL), m_class_type(PFS_CLASS_NONE)
   {}
 
-  void set_field_ulong(Field *f, ulong value);
-  void set_field_ulonglong(Field *f, ulonglong value);
-  void set_field_varchar_utf8(Field *f, const char* str, uint len);
-  void set_field_enum(Field *f, ulonglong value);
-
-  ulonglong get_field_enum(Field *f);
-
   /** Table share. */
   const PFS_engine_table_share *m_share_ptr;
   /** Opaque pointer to the m_pos position of this cursor. */
   void *m_pos_ptr;
+  /** Current normalizer */
+  time_normalizer *m_normalizer;
+  /** Current class type */
+  enum PFS_class_type m_class_type;
 };
 
 /** Callback to open a table. */
@@ -112,6 +203,8 @@ typedef int (*pfs_write_row_t)(TABLE *table,
                                unsigned char *buf, Field **fields);
 /** Callback to delete all rows. */
 typedef int (*pfs_delete_all_rows_t)(void);
+/** Callback to get a row count. */
+typedef ha_rows (*pfs_get_row_count_t)(void);
 
 /**
   A PERFORMANCE_SCHEMA table share.
@@ -123,6 +216,10 @@ struct PFS_engine_table_share
   void check_one_table(THD *thd);
   static void init_all_locks(void);
   static void delete_all_locks(void);
+  /** Get the row count. */
+  ha_rows get_row_count(void) const;
+  /** Write a row. */
+  int write_row(TABLE *table, unsigned char *buf, Field **fields) const;
 
   /** Table name. */
   LEX_STRING m_name;
@@ -134,6 +231,8 @@ struct PFS_engine_table_share
   pfs_write_row_t m_write_row;
   /** Delete all rows function. */
   pfs_delete_all_rows_t m_delete_all_rows;
+  /** Get rows count function. */
+  pfs_get_row_count_t m_get_row_count;
   /**
     Number or records.
     This number does not need to be precise,
@@ -151,6 +250,10 @@ struct PFS_engine_table_share
   bool m_checked;
 };
 
+/**
+  Privileges for read only tables.
+  The only operation allowed is SELECT.
+*/
 class PFS_readonly_acl : public ACL_internal_table_access
 {
 public:
@@ -163,8 +266,13 @@ public:
   ACL_internal_access_result check(ulong want_access, ulong *save_priv) const;
 };
 
+/** Singleton instance of PFS_readonly_acl. */
 extern PFS_readonly_acl pfs_readonly_acl;
 
+/**
+  Privileges for truncatable tables.
+  Operations allowed are SELECT and TRUNCATE.
+*/
 class PFS_truncatable_acl : public ACL_internal_table_access
 {
 public:
@@ -177,8 +285,13 @@ public:
   ACL_internal_access_result check(ulong want_access, ulong *save_priv) const;
 };
 
+/** Singleton instance of PFS_truncatable_acl. */
 extern PFS_truncatable_acl pfs_truncatable_acl;
 
+/**
+  Privileges for updatable tables.
+  Operations allowed are SELECT and UPDATE.
+*/
 class PFS_updatable_acl : public ACL_internal_table_access
 {
 public:
@@ -191,8 +304,13 @@ public:
   ACL_internal_access_result check(ulong want_access, ulong *save_priv) const;
 };
 
+/** Singleton instance of PFS_updatable_acl. */
 extern PFS_updatable_acl pfs_updatable_acl;
 
+/**
+  Privileges for editable tables.
+  Operations allowed are SELECT, INSERT, UPDATE, DELETE and TRUNCATE.
+*/
 class PFS_editable_acl : public ACL_internal_table_access
 {
 public:
@@ -205,8 +323,12 @@ public:
   ACL_internal_access_result check(ulong want_access, ulong *save_priv) const;
 };
 
+/** Singleton instance of PFS_editable_acl. */
 extern PFS_editable_acl pfs_editable_acl;
 
+/**
+  Privileges for unknown tables.
+*/
 class PFS_unknown_acl : public ACL_internal_table_access
 {
 public:
@@ -219,6 +341,7 @@ public:
   ACL_internal_access_result check(ulong want_access, ulong *save_priv) const;
 };
 
+/** Singleton instance of PFS_unknown_acl. */
 extern PFS_unknown_acl pfs_unknown_acl;
 
 /** Position of a cursor, for simple iterations. */
@@ -227,20 +350,34 @@ struct PFS_simple_index
   /** Current row index. */
   uint m_index;
 
+  /**
+    Constructor.
+    @param index the index initial value.
+  */
   PFS_simple_index(uint index)
     : m_index(index)
   {}
 
+  /**
+    Set this index at a given position.
+    @param other a position
+  */
   void set_at(const struct PFS_simple_index *other)
   { m_index= other->m_index; }
 
+  /**
+    Set this index after a given position.
+    @param other a position
+  */
   void set_after(const struct PFS_simple_index *other)
   { m_index= other->m_index + 1; }
 
+  /** Set this index to the next record. */
   void next(void)
   { m_index++; }
 };
 
+/** Position of a double cursor, for iterations using 2 nested loops. */
 struct PFS_double_index
 {
   /** Outer index. */
@@ -248,16 +385,29 @@ struct PFS_double_index
   /** Current index within index_1. */
   uint m_index_2;
 
+  /**
+    Constructor.
+    @param index_1 the first index initial value.
+    @param index_2 the second index initial value.
+  */
   PFS_double_index(uint index_1, uint index_2)
     : m_index_1(index_1), m_index_2(index_2)
   {}
 
+  /**
+    Set this index at a given position.
+    @param other a position
+  */
   void set_at(const struct PFS_double_index *other)
   {
     m_index_1= other->m_index_1;
     m_index_2= other->m_index_2;
   }
 
+  /**
+    Set this index after a given position.
+    @param other a position
+  */
   void set_after(const struct PFS_double_index *other)
   {
     m_index_1= other->m_index_1;
@@ -265,6 +415,7 @@ struct PFS_double_index
   }
 };
 
+/** Position of a triple cursor, for iterations using 3 nested loops. */
 struct PFS_triple_index
 {
   /** Outer index. */
@@ -274,10 +425,20 @@ struct PFS_triple_index
   /** Current index within index_2. */
   uint m_index_3;
 
+  /**
+    Constructor.
+    @param index_1 the first index initial value.
+    @param index_2 the second index initial value.
+    @param index_3 the third index initial value.
+  */
   PFS_triple_index(uint index_1, uint index_2, uint index_3)
     : m_index_1(index_1), m_index_2(index_2), m_index_3(index_3)
   {}
 
+  /**
+    Set this index at a given position.
+    @param other a position
+  */
   void set_at(const struct PFS_triple_index *other)
   {
     m_index_1= other->m_index_1;
@@ -285,6 +446,10 @@ struct PFS_triple_index
     m_index_3= other->m_index_3;
   }
 
+  /**
+    Set this index after a given position.
+    @param other a position
+  */
   void set_after(const struct PFS_triple_index *other)
   {
     m_index_1= other->m_index_1;
@@ -293,22 +458,6 @@ struct PFS_triple_index
   }
 };
 
-struct PFS_instrument_view_constants
-{
-  static const uint VIEW_MUTEX= 1;
-  static const uint VIEW_RWLOCK= 2;
-  static const uint VIEW_COND= 3;
-  static const uint VIEW_FILE= 4;
-};
-
-struct PFS_object_view_constants
-{
-  static const uint VIEW_TABLE= 1;
-  static const uint VIEW_EVENT= 2;
-  static const uint VIEW_PROCEDURE= 3;
-  static const uint VIEW_FUNCTION= 4;
-};
-
 bool pfs_show_status(handlerton *hton, THD *thd,
                      stat_print_fn *print, enum ha_stat_type stat);
 
diff --git a/storage/perfschema/pfs_events.h b/storage/perfschema/pfs_events.h
new file mode 100644
index 00000000000..c9586df11bd
--- /dev/null
+++ b/storage/perfschema/pfs_events.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef PFS_EVENTS_H
+#define PFS_EVENTS_H
+
+/**
+  @file storage/perfschema/pfs_events.h
+  Events data structures (declarations).
+*/
+
+#include "pfs_column_types.h"
+
+struct PFS_instr_class;
+
+/** An event record. */
+struct PFS_events
+{
+  /** THREAD_ID. */
+  ulong m_thread_internal_id;
+  /** EVENT_ID. */
+  ulonglong m_event_id;
+  /** END_EVENT_ID. */
+  ulonglong m_end_event_id;
+  /** (EVENT_TYPE) */
+  enum_event_type m_event_type;
+  /** NESTING_EVENT_ID. */
+  ulonglong m_nesting_event_id;
+  /** NESTING_EVENT_TYPE */
+  enum_event_type m_nesting_event_type;
+  /** Instrument metadata. */
+  PFS_instr_class *m_class;
+  /**
+    Timer start.
+    This member is populated only if m_class->m_timed is true.
+  */
+  ulonglong m_timer_start;
+  /**
+    Timer end.
+    This member is populated only if m_class->m_timed is true.
+  */
+  ulonglong m_timer_end;
+  /** Location of the instrumentation in the source code (file name). */
+  const char *m_source_file;
+  /** Location of the instrumentation in the source code (line number). */
+  uint m_source_line;
+};
+
+#endif
+
diff --git a/storage/perfschema/pfs_events_stages.cc b/storage/perfschema/pfs_events_stages.cc
new file mode 100644
index 00000000000..dbdfa6068ed
--- /dev/null
+++ b/storage/perfschema/pfs_events_stages.cc
@@ -0,0 +1,238 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/pfs_events_stages.cc
+  Events stages data structures (implementation).
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs_global.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "pfs_host.h"
+#include "pfs_user.h"
+#include "pfs_events_stages.h"
+#include "pfs_atomic.h"
+#include "m_string.h"
+
+ulong events_stages_history_long_size= 0;
+/** Consumer flag for table EVENTS_STAGES_CURRENT. */
+bool flag_events_stages_current= false;
+/** Consumer flag for table EVENTS_STAGES_HISTORY. */
+bool flag_events_stages_history= false;
+/** Consumer flag for table EVENTS_STAGES_HISTORY_LONG. */
+bool flag_events_stages_history_long= false;
+
+/** True if EVENTS_STAGES_HISTORY_LONG circular buffer is full. */
+bool events_stages_history_long_full= false;
+/** Index in EVENTS_STAGES_HISTORY_LONG circular buffer. */
+volatile uint32 events_stages_history_long_index= 0;
+/** EVENTS_STAGES_HISTORY_LONG circular buffer. */
+PFS_events_stages *events_stages_history_long_array= NULL;
+
+/**
+  Initialize table EVENTS_STAGES_HISTORY_LONG.
+  @param events_stages_history_long_sizing       table sizing
+*/
+int init_events_stages_history_long(uint events_stages_history_long_sizing)
+{
+  events_stages_history_long_size= events_stages_history_long_sizing;
+  events_stages_history_long_full= false;
+  PFS_atomic::store_u32(&events_stages_history_long_index, 0);
+
+  if (events_stages_history_long_size == 0)
+    return 0;
+
+  events_stages_history_long_array=
+    PFS_MALLOC_ARRAY(events_stages_history_long_size, PFS_events_stages,
+                     MYF(MY_ZEROFILL));
+
+  return (events_stages_history_long_array ? 0 : 1);
+}
+
+/** Cleanup table EVENTS_STAGES_HISTORY_LONG. */
+void cleanup_events_stages_history_long(void)
+{
+  pfs_free(events_stages_history_long_array);
+  events_stages_history_long_array= NULL;
+}
+
+static inline void copy_events_stages(PFS_events_stages *dest,
+                                      const PFS_events_stages *source)
+{
+  memcpy(dest, source, sizeof(PFS_events_stages));
+}
+
+/**
+  Insert a stage record in table EVENTS_STAGES_HISTORY.
+  @param thread             thread that executed the wait
+  @param stage              record to insert
+*/
+void insert_events_stages_history(PFS_thread *thread, PFS_events_stages *stage)
+{
+  if (unlikely(events_stages_history_per_thread == 0))
+    return;
+
+  DBUG_ASSERT(thread->m_stages_history != NULL);
+
+  uint index= thread->m_stages_history_index;
+
+  /*
+    A concurrent thread executing TRUNCATE TABLE EVENTS_STAGES_CURRENT
+    could alter the data that this thread is inserting,
+    causing a potential race condition.
+    We are not testing for this and insert a possibly empty record,
+    to make this thread (the writer) faster.
+    This is ok, the readers of m_stages_history will filter this out.
+  */
+  copy_events_stages(&thread->m_stages_history[index], stage);
+
+  index++;
+  if (index >= events_stages_history_per_thread)
+  {
+    index= 0;
+    thread->m_stages_history_full= true;
+  }
+  thread->m_stages_history_index= index;
+}
+
+/**
+  Insert a stage record in table EVENTS_STAGES_HISTORY_LONG.
+  @param stage              record to insert
+*/
+void insert_events_stages_history_long(PFS_events_stages *stage)
+{
+  if (unlikely(events_stages_history_long_size == 0))
+    return;
+
+  DBUG_ASSERT(events_stages_history_long_array != NULL);
+
+  uint index= PFS_atomic::add_u32(&events_stages_history_long_index, 1);
+
+  index= index % events_stages_history_long_size;
+  if (index == 0)
+    events_stages_history_long_full= true;
+
+  /* See related comment in insert_events_stages_history. */
+  copy_events_stages(&events_stages_history_long_array[index], stage);
+}
+
+/** Reset table EVENTS_STAGES_CURRENT data. */
+void reset_events_stages_current(void)
+{
+  PFS_thread *pfs_thread= thread_array;
+  PFS_thread *pfs_thread_last= thread_array + thread_max;
+
+  for ( ; pfs_thread < pfs_thread_last; pfs_thread++)
+  {
+    pfs_thread->m_stage_current.m_class= NULL;
+  }
+}
+
+/** Reset table EVENTS_STAGES_HISTORY data. */
+void reset_events_stages_history(void)
+{
+  PFS_thread *pfs_thread= thread_array;
+  PFS_thread *pfs_thread_last= thread_array + thread_max;
+
+  for ( ; pfs_thread < pfs_thread_last; pfs_thread++)
+  {
+    PFS_events_stages *pfs= pfs_thread->m_stages_history;
+    PFS_events_stages *pfs_last= pfs + events_stages_history_per_thread;
+
+    pfs_thread->m_stages_history_index= 0;
+    pfs_thread->m_stages_history_full= false;
+    for ( ; pfs < pfs_last; pfs++)
+      pfs->m_class= NULL;
+  }
+}
+
+/** Reset table EVENTS_STAGES_HISTORY_LONG data. */
+void reset_events_stages_history_long(void)
+{
+  PFS_atomic::store_u32(&events_stages_history_long_index, 0);
+  events_stages_history_long_full= false;
+
+  PFS_events_stages *pfs= events_stages_history_long_array;
+  PFS_events_stages *pfs_last= pfs + events_stages_history_long_size;
+  for ( ; pfs < pfs_last; pfs++)
+    pfs->m_class= NULL;
+}
+
+/** Reset table EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME data. */
+void reset_events_stages_by_thread()
+{
+  PFS_thread *thread= thread_array;
+  PFS_thread *thread_last= thread_array + thread_max;
+
+  for ( ; thread < thread_last; thread++)
+  {
+    if (thread->m_lock.is_populated())
+      aggregate_thread_stages(thread);
+  }
+}
+
+/** Reset table EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME data. */
+void reset_events_stages_by_account()
+{
+  PFS_account *pfs= account_array;
+  PFS_account *pfs_last= account_array + account_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate_stages();
+  }
+}
+
+/** Reset table EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME data. */
+void reset_events_stages_by_user()
+{
+  PFS_user *pfs= user_array;
+  PFS_user *pfs_last= user_array + user_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate_stages();
+  }
+}
+
+/** Reset table EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME data. */
+void reset_events_stages_by_host()
+{
+  PFS_host *pfs= host_array;
+  PFS_host *pfs_last= host_array + host_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate_stages();
+  }
+}
+
+/** Reset table EVENTS_STAGES_GLOBAL_BY_EVENT_NAME data. */
+void reset_events_stages_global()
+{
+  PFS_stage_stat *stat= global_instr_class_stages_array;
+  PFS_stage_stat *stat_last= global_instr_class_stages_array + stage_class_max;
+
+  for ( ; stat < stat_last; stat++)
+    stat->reset();
+}
+
diff --git a/storage/perfschema/pfs_events_stages.h b/storage/perfschema/pfs_events_stages.h
new file mode 100644
index 00000000000..43231796be5
--- /dev/null
+++ b/storage/perfschema/pfs_events_stages.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef PFS_EVENTS_STAGES_H
+#define PFS_EVENTS_STAGES_H
+
+/**
+  @file storage/perfschema/pfs_events_stages.h
+  Events waits data structures (declarations).
+*/
+
+#include "pfs_events.h"
+
+struct PFS_thread;
+struct PFS_account;
+struct PFS_user;
+struct PFS_host;
+
+/** A stage record. */
+struct PFS_events_stages : public PFS_events
+{
+  /* No specific attributes */
+};
+
+void insert_events_stages_history(PFS_thread *thread, PFS_events_stages *stage);
+void insert_events_stages_history_long(PFS_events_stages *stage);
+
+extern bool flag_events_stages_current;
+extern bool flag_events_stages_history;
+extern bool flag_events_stages_history_long;
+
+extern bool events_stages_history_long_full;
+extern volatile uint32 events_stages_history_long_index;
+extern PFS_events_stages *events_stages_history_long_array;
+extern ulong events_stages_history_long_size;
+
+int init_events_stages_history_long(uint events_stages_history_long_sizing);
+void cleanup_events_stages_history_long();
+
+void reset_events_stages_current();
+void reset_events_stages_history();
+void reset_events_stages_history_long();
+void reset_events_stages_by_thread();
+void reset_events_stages_by_account();
+void reset_events_stages_by_user();
+void reset_events_stages_by_host();
+void reset_events_stages_global();
+void aggregate_account_stages(PFS_account *account);
+void aggregate_user_stages(PFS_user *user);
+void aggregate_host_stages(PFS_host *host);
+
+#endif
+
diff --git a/storage/perfschema/pfs_events_statements.cc b/storage/perfschema/pfs_events_statements.cc
new file mode 100644
index 00000000000..66def924d80
--- /dev/null
+++ b/storage/perfschema/pfs_events_statements.cc
@@ -0,0 +1,242 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/pfs_events_statements.cc
+  Events statements data structures (implementation).
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs_global.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "pfs_host.h"
+#include "pfs_user.h"
+#include "pfs_events_statements.h"
+#include "pfs_atomic.h"
+#include "m_string.h"
+
+ulong events_statements_history_long_size= 0;
+/** Consumer flag for table EVENTS_STATEMENTS_CURRENT. */
+bool flag_events_statements_current= false;
+/** Consumer flag for table EVENTS_STATEMENTS_HISTORY. */
+bool flag_events_statements_history= false;
+/** Consumer flag for table EVENTS_STATEMENTS_HISTORY_LONG. */
+bool flag_events_statements_history_long= false;
+
+/** True if EVENTS_STATEMENTS_HISTORY_LONG circular buffer is full. */
+bool events_statements_history_long_full= false;
+/** Index in EVENTS_STATEMENTS_HISTORY_LONG circular buffer. */
+volatile uint32 events_statements_history_long_index= 0;
+/** EVENTS_STATEMENTS_HISTORY_LONG circular buffer. */
+PFS_events_statements *events_statements_history_long_array= NULL;
+
+/**
+  Initialize table EVENTS_STATEMENTS_HISTORY_LONG.
+  @param events_statements_history_long_sizing       table sizing
+*/
+int init_events_statements_history_long(uint events_statements_history_long_sizing)
+{
+  events_statements_history_long_size= events_statements_history_long_sizing;
+  events_statements_history_long_full= false;
+  PFS_atomic::store_u32(&events_statements_history_long_index, 0);
+
+  if (events_statements_history_long_size == 0)
+    return 0;
+
+  events_statements_history_long_array=
+    PFS_MALLOC_ARRAY(events_statements_history_long_size, PFS_events_statements,
+                     MYF(MY_ZEROFILL));
+
+  return (events_statements_history_long_array ? 0 : 1);
+}
+
+/** Cleanup table EVENTS_STATEMENTS_HISTORY_LONG. */
+void cleanup_events_statements_history_long(void)
+{
+  pfs_free(events_statements_history_long_array);
+  events_statements_history_long_array= NULL;
+}
+
+static inline void copy_events_statements(PFS_events_statements *dest,
+                                      const PFS_events_statements *source)
+{
+  memcpy(dest, source, sizeof(PFS_events_statements));
+}
+
+/**
+  Insert a statement record in table EVENTS_STATEMENTS_HISTORY.
+  @param thread             thread that executed the wait
+  @param statement          record to insert
+*/
+void insert_events_statements_history(PFS_thread *thread, PFS_events_statements *statement)
+{
+  if (unlikely(events_statements_history_per_thread == 0))
+    return;
+
+  DBUG_ASSERT(thread->m_statements_history != NULL);
+
+  uint index= thread->m_statements_history_index;
+
+  /*
+    A concurrent thread executing TRUNCATE TABLE EVENTS_STATEMENTS_CURRENT
+    could alter the data that this thread is inserting,
+    causing a potential race condition.
+    We are not testing for this and insert a possibly empty record,
+    to make this thread (the writer) faster.
+    This is ok, the readers of m_statements_history will filter this out.
+  */
+  copy_events_statements(&thread->m_statements_history[index], statement);
+
+  index++;
+  if (index >= events_statements_history_per_thread)
+  {
+    index= 0;
+    thread->m_statements_history_full= true;
+  }
+  thread->m_statements_history_index= index;
+}
+
+/**
+  Insert a statement record in table EVENTS_STATEMENTS_HISTORY_LONG.
+  @param statement              record to insert
+*/
+void insert_events_statements_history_long(PFS_events_statements *statement)
+{
+  if (unlikely(events_statements_history_long_size == 0))
+    return ;
+
+  DBUG_ASSERT(events_statements_history_long_array != NULL);
+
+  uint index= PFS_atomic::add_u32(&events_statements_history_long_index, 1);
+
+  index= index % events_statements_history_long_size;
+  if (index == 0)
+    events_statements_history_long_full= true;
+
+  /* See related comment in insert_events_statements_history. */
+  copy_events_statements(&events_statements_history_long_array[index], statement);
+}
+
+/** Reset table EVENTS_STATEMENTS_CURRENT data. */
+void reset_events_statements_current(void)
+{
+  PFS_thread *pfs_thread= thread_array;
+  PFS_thread *pfs_thread_last= thread_array + thread_max;
+
+  for ( ; pfs_thread < pfs_thread_last; pfs_thread++)
+  {
+    PFS_events_statements *pfs_stmt= & pfs_thread->m_statement_stack[0];
+    PFS_events_statements *pfs_stmt_last= pfs_stmt + statement_stack_max;
+
+    for ( ; pfs_stmt < pfs_stmt_last; pfs_stmt++)
+      pfs_stmt->m_class= NULL;
+  }
+}
+
+/** Reset table EVENTS_STATEMENTS_HISTORY data. */
+void reset_events_statements_history(void)
+{
+  PFS_thread *pfs_thread= thread_array;
+  PFS_thread *pfs_thread_last= thread_array + thread_max;
+
+  for ( ; pfs_thread < pfs_thread_last; pfs_thread++)
+  {
+    PFS_events_statements *pfs= pfs_thread->m_statements_history;
+    PFS_events_statements *pfs_last= pfs + events_statements_history_per_thread;
+
+    pfs_thread->m_statements_history_index= 0;
+    pfs_thread->m_statements_history_full= false;
+    for ( ; pfs < pfs_last; pfs++)
+      pfs->m_class= NULL;
+  }
+}
+
+/** Reset table EVENTS_STATEMENTS_HISTORY_LONG data. */
+void reset_events_statements_history_long(void)
+{
+  PFS_atomic::store_u32(&events_statements_history_long_index, 0);
+  events_statements_history_long_full= false;
+
+  PFS_events_statements *pfs= events_statements_history_long_array;
+  PFS_events_statements *pfs_last= pfs + events_statements_history_long_size;
+  for ( ; pfs < pfs_last; pfs++)
+    pfs->m_class= NULL;
+}
+
+/** Reset table EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME data. */
+void reset_events_statements_by_thread()
+{
+  PFS_thread *thread= thread_array;
+  PFS_thread *thread_last= thread_array + thread_max;
+
+  for ( ; thread < thread_last; thread++)
+  {
+    if (thread->m_lock.is_populated())
+      aggregate_thread_statements(thread);
+  }
+}
+
+/** Reset table EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME data. */
+void reset_events_statements_by_account()
+{
+  PFS_account *pfs= account_array;
+  PFS_account *pfs_last= account_array + account_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate_statements();
+  }
+}
+
+/** Reset table EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME data. */
+void reset_events_statements_by_user()
+{
+  PFS_user *pfs= user_array;
+  PFS_user *pfs_last= user_array + user_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate_statements();
+  }
+}
+
+/** Reset table EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME data. */
+void reset_events_statements_by_host()
+{
+  PFS_host *pfs= host_array;
+  PFS_host *pfs_last= host_array + host_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate_statements();
+  }
+}
+
+/** Reset table EVENTS_STATEMENTS_GLOBAL_BY_EVENT_NAME data. */
+void reset_events_statements_global()
+{
+  PFS_statement_stat *stat= global_instr_class_statements_array;
+  PFS_statement_stat *stat_last= global_instr_class_statements_array + statement_class_max;
+
+  for ( ; stat < stat_last; stat++)
+    stat->reset();
+}
+
diff --git a/storage/perfschema/pfs_events_statements.h b/storage/perfschema/pfs_events_statements.h
new file mode 100644
index 00000000000..5d90250c618
--- /dev/null
+++ b/storage/perfschema/pfs_events_statements.h
@@ -0,0 +1,123 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef PFS_EVENTS_STATEMENTS_H
+#define PFS_EVENTS_STATEMENTS_H
+
+/**
+  @file storage/perfschema/pfs_events_statements.h
+  Events statements data structures (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_events.h"
+#include "pfs_digest.h"
+
+struct PFS_thread;
+struct PFS_account;
+struct PFS_user;
+struct PFS_host;
+
+/** A statement record. */
+struct PFS_events_statements : public PFS_events
+{
+  /** Database name. */
+  char m_current_schema_name[NAME_LEN];
+  /** Length of @c m_current_schema_name. */
+  uint m_current_schema_name_length;
+  /** SQL_TEXT */
+  char m_sqltext[COL_INFO_SIZE];
+  /** Length of @ m_info. */
+  uint m_sqltext_length;
+
+  /** Locked time. */
+  ulonglong m_lock_time;
+  
+  /** Diagnostics area, message text. */
+  char m_message_text[MYSQL_ERRMSG_SIZE+1];
+  /** Diagnostics area, error number. */
+  uint m_sql_errno;
+  /** Diagnostics area, SQLSTATE. */
+  char m_sqlstate[SQLSTATE_LENGTH];
+  /** Diagnostics area, error count. */
+  uint m_error_count;
+  /** Diagnostics area, warning count. */
+  uint m_warning_count;
+  /** Diagnostics area, rows affected. */
+  ulonglong m_rows_affected;
+
+  /** Optimizer metric, number of rows sent. */
+  ulonglong m_rows_sent;
+  /** Optimizer metric, number of rows examined. */
+  ulonglong m_rows_examined;
+  /** Optimizer metric, number of temporary tables created on disk. */
+  ulonglong m_created_tmp_disk_tables;
+  /** Optimizer metric, number of temporary tables created. */
+  ulonglong m_created_tmp_tables;
+  /** Optimizer metric, number of full join. */
+  ulonglong m_select_full_join;
+  /** Optimizer metric, number of full range join. */
+  ulonglong m_select_full_range_join;
+  /** Optimizer metric, number of select range. */
+  ulonglong m_select_range;
+  /** Optimizer metric, number of select range checks. */
+  ulonglong m_select_range_check;
+  /** Optimizer metric, number of select scans. */
+  ulonglong m_select_scan;
+  /** Optimizer metric, number of sort merge passes. */
+  ulonglong m_sort_merge_passes;
+  /** Optimizer metric, number of sort ranges. */
+  ulonglong m_sort_range;
+  /** Optimizer metric, number of sort rows. */
+  ulonglong m_sort_rows;
+  /** Optimizer metric, number of sort scans. */
+  ulonglong m_sort_scan;
+  /** Optimizer metric, number of 'no index used'. */
+  ulonglong m_no_index_used;
+  /** Optimizer metric, number of 'no good index used'. */
+  ulonglong m_no_good_index_used;
+  /** Statement digest. */
+  PSI_digest_storage m_digest_storage;
+};
+
+void insert_events_statements_history(PFS_thread *thread, PFS_events_statements *statement);
+void insert_events_statements_history_long(PFS_events_statements *statement);
+
+extern bool flag_events_statements_current;
+extern bool flag_events_statements_history;
+extern bool flag_events_statements_history_long;
+
+extern bool events_statements_history_long_full;
+extern volatile uint32 events_statements_history_long_index;
+extern PFS_events_statements *events_statements_history_long_array;
+extern ulong events_statements_history_long_size;
+
+int init_events_statements_history_long(uint events_statements_history_long_sizing);
+void cleanup_events_statements_history_long();
+
+void reset_events_statements_current();
+void reset_events_statements_history();
+void reset_events_statements_history_long();
+void reset_events_statements_by_thread();
+void reset_events_statements_by_account();
+void reset_events_statements_by_user();
+void reset_events_statements_by_host();
+void reset_events_statements_global();
+void aggregate_account_statements(PFS_account *account);
+void aggregate_user_statements(PFS_user *user);
+void aggregate_host_statements(PFS_host *host);
+
+#endif
+
diff --git a/storage/perfschema/pfs_events_waits.cc b/storage/perfschema/pfs_events_waits.cc
index b6cadf9e61c..2ee9ec292a2 100644
--- a/storage/perfschema/pfs_events_waits.cc
+++ b/storage/perfschema/pfs_events_waits.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -21,30 +21,26 @@
 #include "my_global.h"
 #include "my_sys.h"
 #include "pfs_global.h"
+#include "pfs_instr_class.h"
 #include "pfs_instr.h"
+#include "pfs_user.h"
+#include "pfs_host.h"
+#include "pfs_account.h"
 #include "pfs_events_waits.h"
 #include "pfs_atomic.h"
 #include "m_string.h"
 
 ulong events_waits_history_long_size= 0;
 /** Consumer flag for table EVENTS_WAITS_CURRENT. */
-bool flag_events_waits_current= true;
+bool flag_events_waits_current= false;
 /** Consumer flag for table EVENTS_WAITS_HISTORY. */
-bool flag_events_waits_history= true;
+bool flag_events_waits_history= false;
 /** Consumer flag for table EVENTS_WAITS_HISTORY_LONG. */
-bool flag_events_waits_history_long= true;
-/** Consumer flag for table EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME. */
-bool flag_events_waits_summary_by_thread_by_event_name= true;
-/** Consumer flag for table EVENTS_WAITS_SUMMARY_BY_EVENT_NAME. */
-bool flag_events_waits_summary_by_event_name= true;
-/** Consumer flag for table EVENTS_WAITS_SUMMARY_BY_INSTANCE. */
-bool flag_events_waits_summary_by_instance= true;
-bool flag_events_locks_summary_by_event_name= true;
-bool flag_events_locks_summary_by_instance= true;
-/** Consumer flag for table FILE_SUMMARY_BY_EVENT_NAME. */
-bool flag_file_summary_by_event_name= true;
-/** Consumer flag for table FILE_SUMMARY_BY_INSTANCE. */
-bool flag_file_summary_by_instance= true;
+bool flag_events_waits_history_long= false;
+/** Consumer flag for the global instrumentation. */
+bool flag_global_instrumentation= false;
+/** Consumer flag for the per thread instrumentation. */
+bool flag_thread_instrumentation= false;
 
 /** True if EVENTS_WAITS_HISTORY_LONG circular buffer is full. */
 bool events_waits_history_long_full= false;
@@ -93,6 +89,9 @@ static inline void copy_events_waits(PFS_events_waits *dest,
 */
 void insert_events_waits_history(PFS_thread *thread, PFS_events_waits *wait)
 {
+  if (unlikely(events_waits_history_per_thread == 0))
+    return;
+
   uint index= thread->m_waits_history_index;
 
   /*
@@ -120,6 +119,9 @@ void insert_events_waits_history(PFS_thread *thread, PFS_events_waits *wait)
 */
 void insert_events_waits_history_long(PFS_events_waits *wait)
 {
+  if (unlikely(events_waits_history_long_size == 0))
+    return;
+
   uint index= PFS_atomic::add_u32(&events_waits_history_long_index, 1);
 
   index= index % events_waits_history_long_size;
@@ -138,11 +140,11 @@ void reset_events_waits_current(void)
 
   for ( ; pfs_thread < pfs_thread_last; pfs_thread++)
   {
-    PFS_wait_locker *locker= pfs_thread->m_wait_locker_stack;
-    PFS_wait_locker *locker_last= locker + LOCKER_STACK_SIZE;
+    PFS_events_waits *pfs_wait= pfs_thread->m_events_waits_stack;
+    PFS_events_waits *pfs_wait_last= pfs_wait + WAIT_STACK_SIZE;
 
-    for ( ; locker < locker_last; locker++)
-      locker->m_waits_current.m_wait_class= NO_WAIT_CLASS;
+    for ( ; pfs_wait < pfs_wait_last; pfs_wait++)
+      pfs_wait->m_wait_class= NO_WAIT_CLASS;
   }
 }
 
@@ -176,3 +178,137 @@ void reset_events_waits_history_long(void)
     wait->m_wait_class= NO_WAIT_CLASS;
 }
 
+/** Reset table EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME data. */
+void reset_events_waits_by_thread()
+{
+  PFS_thread *thread= thread_array;
+  PFS_thread *thread_last= thread_array + thread_max;
+
+  for ( ; thread < thread_last; thread++)
+  {
+    if (thread->m_lock.is_populated())
+      aggregate_thread_waits(thread);
+  }
+}
+
+/** Reset table EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME data. */
+void reset_events_waits_by_account()
+{
+  PFS_account *pfs= account_array;
+  PFS_account *pfs_last= account_array + account_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate_waits();
+  }
+}
+
+/** Reset table EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME data. */
+void reset_events_waits_by_user()
+{
+  PFS_user *pfs= user_array;
+  PFS_user *pfs_last= user_array + user_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate_waits();
+  }
+}
+
+/** Reset table EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME data. */
+void reset_events_waits_by_host()
+{
+  PFS_host *pfs= host_array;
+  PFS_host *pfs_last= host_array + host_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate_waits();
+  }
+}
+
+/** Reset table EVENTS_WAITS_GLOBAL_BY_EVENT_NAME data. */
+void reset_events_waits_global()
+{
+  PFS_single_stat *stat= global_instr_class_waits_array;
+  PFS_single_stat *stat_last= global_instr_class_waits_array + wait_class_max;
+
+  for ( ; stat < stat_last; stat++)
+    stat->reset();
+}
+
+void reset_table_waits_by_table()
+{
+  PFS_table_share *pfs= table_share_array;
+  PFS_table_share *pfs_last= pfs + table_share_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate();
+  }
+}
+
+void reset_table_io_waits_by_table()
+{
+  PFS_table_share *pfs= table_share_array;
+  PFS_table_share *pfs_last= pfs + table_share_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate_io();
+  }
+}
+
+void reset_table_lock_waits_by_table()
+{
+  PFS_table_share *pfs= table_share_array;
+  PFS_table_share *pfs_last= pfs + table_share_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->aggregate_lock();
+  }
+}
+
+void reset_table_waits_by_table_handle()
+{
+  PFS_table *pfs= table_array;
+  PFS_table *pfs_last= pfs + table_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->sanitized_aggregate();
+  }
+}
+
+void reset_table_io_waits_by_table_handle()
+{
+  PFS_table *pfs= table_array;
+  PFS_table *pfs_last= pfs + table_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->sanitized_aggregate_io();
+  }
+}
+
+void reset_table_lock_waits_by_table_handle()
+{
+  PFS_table *pfs= table_array;
+  PFS_table *pfs_last= pfs + table_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->sanitized_aggregate_lock();
+  }
+}
+
diff --git a/storage/perfschema/pfs_events_waits.h b/storage/perfschema/pfs_events_waits.h
index d277db39d8d..a7f7a095b9f 100644
--- a/storage/perfschema/pfs_events_waits.h
+++ b/storage/perfschema/pfs_events_waits.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -23,6 +23,7 @@
 
 #include "pfs_column_types.h"
 #include "pfs_lock.h"
+#include "pfs_events.h"
 
 struct PFS_mutex;
 struct PFS_rwlock;
@@ -30,7 +31,12 @@ struct PFS_cond;
 struct PFS_table;
 struct PFS_file;
 struct PFS_thread;
+struct PFS_socket;
 struct PFS_instr_class;
+struct PFS_table_share;
+struct PFS_account;
+struct PFS_user;
+struct PFS_host;
 
 /** Class of a wait event. */
 enum events_waits_class
@@ -40,51 +46,13 @@ enum events_waits_class
   WAIT_CLASS_RWLOCK,
   WAIT_CLASS_COND,
   WAIT_CLASS_TABLE,
-  WAIT_CLASS_FILE
-};
-
-/** State of a timer. */
-enum timer_state
-{
-  /**
-    Not timed.
-    In this state, TIMER_START, TIMER_END and TIMER_WAIT are NULL.
-  */
-  TIMER_STATE_UNTIMED,
-  /**
-    About to start.
-    In this state, TIMER_START, TIMER_END and TIMER_WAIT are NULL.
-  */
-  TIMER_STATE_STARTING,
-  /**
-    Started, but not yet ended.
-    In this state, TIMER_START has a value, TIMER_END and TIMER_WAIT are NULL.
-  */
-  TIMER_STATE_STARTED,
-  /**
-    Ended.
-    In this state, TIMER_START, TIMER_END and TIMER_WAIT have a value.
-  */
-  TIMER_STATE_TIMED
-};
-
-/** Target object a wait event is waiting on. */
-union events_waits_target
-{
-  /** Mutex waited on. */
-  PFS_mutex *m_mutex;
-  /** RWLock waited on. */
-  PFS_rwlock *m_rwlock;
-  /** Condition waited on. */
-  PFS_cond *m_cond;
-  /** Table waited on. */
-  PFS_table *m_table;
-  /** File waited on. */
-  PFS_file *m_file;
+  WAIT_CLASS_FILE,
+  WAIT_CLASS_SOCKET,
+  WAIT_CLASS_IDLE
 };
 
 /** A wait event record. */
-struct PFS_events_waits
+struct PFS_events_waits : public PFS_events
 {
   /**
     The type of wait.
@@ -100,36 +68,18 @@ struct PFS_events_waits
   events_waits_class m_wait_class;
   /** Executing thread. */
   PFS_thread *m_thread;
-  /** Instrument metadata. */
-  PFS_instr_class *m_class;
-  /** Timer state. */
-  enum timer_state m_timer_state;
-  /** Event id. */
-  ulonglong m_event_id;
-  /**
-    Timer start.
-    This member is populated only if m_timed is true.
-  */
-  ulonglong m_timer_start;
-  /**
-    Timer end.
-    This member is populated only if m_timed is true.
-  */
-  ulonglong m_timer_end;
-  /** Schema name. */
-  const char *m_schema_name;
-  /** Length in bytes of @c m_schema_name. */
-  uint m_schema_name_length;
-  /** Object name. */
-  const char *m_object_name;
-  /** Length in bytes of @c m_object_name. */
-  uint m_object_name_length;
+  /** Object type */
+  enum_object_type m_object_type;
+  /** Table share, for table operations only. */
+  PFS_table_share *m_weak_table_share;
+  /** File, for file operations only. */
+  PFS_file *m_weak_file;
+  /** Socket, for socket operations only. */
+  PFS_socket *m_weak_socket;
+  /** For weak pointers, target object version. */
+  uint32 m_weak_version;
   /** Address in memory of the object instance waited on. */
   const void *m_object_instance_addr;
-  /** Location of the instrumentation in the source code (file name). */
-  const char *m_source_file;
-  /** Location of the instrumentation in the source code (line number). */
-  uint m_source_line;
   /** Operation performed. */
   enum_operation_type m_operation;
   /**
@@ -137,22 +87,23 @@ struct PFS_events_waits
     This member is populated for file READ/WRITE operations only.
   */
   size_t m_number_of_bytes;
+  /**
+    Index used.
+    This member is populated for TABLE IO operations only.
+  */
+  uint m_index;
+  /** Flags */
+  ulong m_flags;
 };
 
-/**
-  A wait locker.
-  A locker is a transient helper structure used by the instrumentation
-  during the recording of a wait.
-*/
-struct PFS_wait_locker
-{
-  /** The timer used to measure the wait. */
-  enum_timer_name m_timer_name;
-  /** The object waited on. */
-  events_waits_target m_target;
-  /** The wait data recorded. */
-  PFS_events_waits m_waits_current;
-};
+/** TIMED bit in the state flags bitfield. */
+#define STATE_FLAG_TIMED (1<<0)
+/** THREAD bit in the state flags bitfield. */
+#define STATE_FLAG_THREAD (1<<1)
+/** EVENT bit in the state flags bitfield. */
+#define STATE_FLAG_EVENT (1<<2)
+/** DIGEST bit in the state flags bitfield. */
+#define STATE_FLAG_DIGEST (1<<3)
 
 void insert_events_waits_history(PFS_thread *thread, PFS_events_waits *wait);
 
@@ -161,14 +112,9 @@ void insert_events_waits_history_long(PFS_events_waits *wait);
 extern bool flag_events_waits_current;
 extern bool flag_events_waits_history;
 extern bool flag_events_waits_history_long;
-extern bool flag_events_waits_summary_by_thread_by_event_name;
-extern bool flag_events_waits_summary_by_event_name;
-extern bool flag_events_waits_summary_by_instance;
-extern bool flag_events_locks_summary_by_thread_by_name;
-extern bool flag_events_locks_summary_by_event_name;
-extern bool flag_events_locks_summary_by_instance;
-extern bool flag_file_summary_by_event_name;
-extern bool flag_file_summary_by_instance;
+extern bool flag_global_instrumentation;
+extern bool flag_thread_instrumentation;
+
 extern bool events_waits_history_long_full;
 extern volatile uint32 events_waits_history_long_index;
 extern PFS_events_waits *events_waits_history_long_array;
@@ -180,6 +126,21 @@ void cleanup_events_waits_history_long();
 void reset_events_waits_current();
 void reset_events_waits_history();
 void reset_events_waits_history_long();
+void reset_events_waits_by_thread();
+void reset_events_waits_by_account();
+void reset_events_waits_by_user();
+void reset_events_waits_by_host();
+void reset_events_waits_global();
+void aggregate_account_waits(PFS_account *account);
+void aggregate_user_waits(PFS_user *user);
+void aggregate_host_waits(PFS_host *host);
+
+void reset_table_waits_by_table();
+void reset_table_io_waits_by_table();
+void reset_table_lock_waits_by_table();
+void reset_table_waits_by_table_handle();
+void reset_table_io_waits_by_table_handle();
+void reset_table_lock_waits_by_table_handle();
 
 #endif
 
diff --git a/storage/perfschema/pfs_global.cc b/storage/perfschema/pfs_global.cc
index fa57f335325..6c3b79a3e1f 100644
--- a/storage/perfschema/pfs_global.cc
+++ b/storage/perfschema/pfs_global.cc
@@ -21,10 +21,17 @@
 #include "my_global.h"
 #include "my_sys.h"
 #include "pfs_global.h"
+#include "my_net.h"
 
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __WIN__
+  #include <winsock2.h>
+#else
+  #include <arpa/inet.h>
+#endif
+
 bool pfs_initialized= false;
 ulonglong pfs_allocated_memory= 0;
 
@@ -67,3 +74,62 @@ void pfs_print_error(const char *format, ...)
   fflush(stderr);
 }
 
+/** Convert raw ip address into readable format. Do not do a reverse DNS lookup. */
+
+uint pfs_get_socket_address(char *host,
+                            uint host_len,
+                            uint *port,
+                            const struct sockaddr_storage *src_addr,
+                            socklen_t src_len)
+{
+  DBUG_ASSERT(host);
+  DBUG_ASSERT(src_addr);
+  DBUG_ASSERT(port);
+
+  memset(host, 0, host_len);
+  *port= 0;
+
+  switch (src_addr->ss_family)
+  {
+    case AF_INET:
+    {
+      if (host_len < INET_ADDRSTRLEN+1)
+        return 0;
+      struct sockaddr_in *sa4= (struct sockaddr_in *)(src_addr);
+    #ifdef __WIN__
+      /* Older versions of Windows do not support inet_ntop() */
+      getnameinfo((struct sockaddr *)sa4, sizeof(struct sockaddr_in),
+                  host, host_len, NULL, 0, NI_NUMERICHOST);
+    #else
+      inet_ntop(AF_INET, &(sa4->sin_addr), host, INET_ADDRSTRLEN);
+    #endif
+      *port= ntohs(sa4->sin_port);
+    }
+    break;
+
+#ifdef HAVE_IPV6
+    case AF_INET6:
+    {
+      if (host_len < INET6_ADDRSTRLEN+1)
+        return 0;
+      struct sockaddr_in6 *sa6= (struct sockaddr_in6 *)(src_addr);
+    #ifdef __WIN__
+      /* Older versions of Windows do not support inet_ntop() */
+      getnameinfo((struct sockaddr *)sa6, sizeof(struct sockaddr_in6),
+                  host, host_len, NULL, 0, NI_NUMERICHOST);
+    #else
+      inet_ntop(AF_INET6, &(sa6->sin6_addr), host, INET6_ADDRSTRLEN);
+    #endif
+      *port= ntohs(sa6->sin6_port);
+    }
+    break;
+#endif
+
+    default:
+      break;
+  }
+
+  /* Return actual IP address string length */
+  return (strlen((const char*)host));
+}
+
diff --git a/storage/perfschema/pfs_global.h b/storage/perfschema/pfs_global.h
index c0c0490a380..693153cb097 100644
--- a/storage/perfschema/pfs_global.h
+++ b/storage/perfschema/pfs_global.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -21,14 +21,38 @@
   Miscellaneous global dependencies (declarations).
 */
 
+/** True when the performance schema is initialized. */
 extern bool pfs_initialized;
+/** Total memory allocated by the performance schema, in bytes. */
 extern ulonglong pfs_allocated_memory;
 
 void *pfs_malloc(size_t size, myf flags);
+
+/**
+  Helper, to allocate an array of structures.
+  @param n number of elements in the array.
+  @param T type of an element.
+  @param f flags to use when allocating memory
+*/
 #define PFS_MALLOC_ARRAY(n, T, f) \
   reinterpret_cast<T*> (pfs_malloc((n) * sizeof(T), (f)))
+
+/** Free memory allocated with @sa pfs_malloc. */
 void pfs_free(void *ptr);
 
+
+uint pfs_get_socket_address(char *host,
+                            uint host_len,
+                            uint *port,
+                            const struct sockaddr_storage *src_addr,
+                            socklen_t src_len);
+
+/**
+  Compute a random index value in an interval.
+  @param ptr seed address
+  @param max_size maximun size of the interval
+  @return a random value in [0, max_size-1]
+*/
 inline uint randomized_index(const void *ptr, uint max_size)
 {
   static uint seed1= 0;
diff --git a/storage/perfschema/pfs_host.cc b/storage/perfschema/pfs_host.cc
new file mode 100644
index 00000000000..82b78e19ce8
--- /dev/null
+++ b/storage/perfschema/pfs_host.cc
@@ -0,0 +1,380 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/pfs_host.cc
+  Performance schema host (implementation).
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs.h"
+#include "pfs_stat.h"
+#include "pfs_instr.h"
+#include "pfs_setup_actor.h"
+#include "pfs_host.h"
+#include "pfs_global.h"
+#include "pfs_instr_class.h"
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+ulong host_max;
+ulong host_lost;
+
+PFS_host *host_array= NULL;
+
+static PFS_single_stat *host_instr_class_waits_array= NULL;
+static PFS_stage_stat *host_instr_class_stages_array= NULL;
+static PFS_statement_stat *host_instr_class_statements_array= NULL;
+
+static LF_HASH host_hash;
+static bool host_hash_inited= false;
+
+/**
+  Initialize the host buffers.
+  @param param                        sizing parameters
+  @return 0 on success
+*/
+int init_host(const PFS_global_param *param)
+{
+  uint index;
+
+  host_max= param->m_host_sizing;
+
+  host_array= NULL;
+  host_instr_class_waits_array= NULL;
+  host_instr_class_stages_array= NULL;
+  host_instr_class_statements_array= NULL;
+  uint waits_sizing= host_max * wait_class_max;
+  uint stages_sizing= host_max * stage_class_max;
+  uint statements_sizing= host_max * statement_class_max;
+
+  if (host_max > 0)
+  {
+    host_array= PFS_MALLOC_ARRAY(host_max, PFS_host,
+                                 MYF(MY_ZEROFILL));
+    if (unlikely(host_array == NULL))
+      return 1;
+  }
+
+  if (waits_sizing > 0)
+  {
+    host_instr_class_waits_array=
+      PFS_connection_slice::alloc_waits_slice(waits_sizing);
+    if (unlikely(host_instr_class_waits_array == NULL))
+      return 1;
+  }
+
+  if (stages_sizing > 0)
+  {
+    host_instr_class_stages_array=
+      PFS_connection_slice::alloc_stages_slice(stages_sizing);
+    if (unlikely(host_instr_class_stages_array == NULL))
+      return 1;
+  }
+
+  if (statements_sizing > 0)
+  {
+    host_instr_class_statements_array=
+      PFS_connection_slice::alloc_statements_slice(statements_sizing);
+    if (unlikely(host_instr_class_statements_array == NULL))
+      return 1;
+  }
+
+  for (index= 0; index < host_max; index++)
+  {
+    host_array[index].m_instr_class_waits_stats=
+      &host_instr_class_waits_array[index * wait_class_max];
+    host_array[index].m_instr_class_stages_stats=
+      &host_instr_class_stages_array[index * stage_class_max];
+    host_array[index].m_instr_class_statements_stats=
+      &host_instr_class_statements_array[index * statement_class_max];
+  }
+
+  return 0;
+}
+
+/** Cleanup all the host buffers. */
+void cleanup_host(void)
+{
+  pfs_free(host_array);
+  host_array= NULL;
+  pfs_free(host_instr_class_waits_array);
+  host_instr_class_waits_array= NULL;
+  pfs_free(host_instr_class_stages_array);
+  host_instr_class_stages_array= NULL;
+  pfs_free(host_instr_class_statements_array);
+  host_instr_class_statements_array= NULL;
+  host_max= 0;
+}
+
+C_MODE_START
+static uchar *host_hash_get_key(const uchar *entry, size_t *length,
+                                my_bool)
+{
+  const PFS_host * const *typed_entry;
+  const PFS_host *host;
+  const void *result;
+  typed_entry= reinterpret_cast<const PFS_host* const *> (entry);
+  DBUG_ASSERT(typed_entry != NULL);
+  host= *typed_entry;
+  DBUG_ASSERT(host != NULL);
+  *length= host->m_key.m_key_length;
+  result= host->m_key.m_hash_key;
+  return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
+}
+C_MODE_END
+
+/**
+  Initialize the host hash.
+  @return 0 on success
+*/
+int init_host_hash(void)
+{
+  if (! host_hash_inited)
+  {
+    lf_hash_init(&host_hash, sizeof(PFS_host*), LF_HASH_UNIQUE,
+                 0, 0, host_hash_get_key, &my_charset_bin);
+    host_hash_inited= true;
+  }
+  return 0;
+}
+
+/** Cleanup the host hash. */
+void cleanup_host_hash(void)
+{
+  if (host_hash_inited)
+  {
+    lf_hash_destroy(&host_hash);
+    host_hash_inited= false;
+  }
+}
+
+static LF_PINS* get_host_hash_pins(PFS_thread *thread)
+{
+  if (unlikely(thread->m_host_hash_pins == NULL))
+  {
+    if (! host_hash_inited)
+      return NULL;
+    thread->m_host_hash_pins= lf_hash_get_pins(&host_hash);
+  }
+  return thread->m_host_hash_pins;
+}
+
+static void set_host_key(PFS_host_key *key,
+                         const char *host, uint host_length)
+{
+  DBUG_ASSERT(host_length <= HOSTNAME_LENGTH);
+
+  char *ptr= &key->m_hash_key[0];
+  if (host_length > 0)
+  {
+    memcpy(ptr, host, host_length);
+    ptr+= host_length;
+  }
+  ptr[0]= 0;
+  ptr++;
+  key->m_key_length= ptr - &key->m_hash_key[0];
+}
+
+PFS_host *find_or_create_host(PFS_thread *thread,
+                              const char *hostname, uint hostname_length)
+{
+  if (host_max == 0)
+  {
+    host_lost++;
+    return NULL;
+  }
+
+  LF_PINS *pins= get_host_hash_pins(thread);
+  if (unlikely(pins == NULL))
+  {
+    host_lost++;
+    return NULL;
+  }
+
+  PFS_host_key key;
+  set_host_key(&key, hostname, hostname_length);
+
+  PFS_host **entry;
+  uint retry_count= 0;
+  const uint retry_max= 3;
+
+search:
+  entry= reinterpret_cast<PFS_host**>
+    (lf_hash_search(&host_hash, pins,
+                    key.m_hash_key, key.m_key_length));
+  if (entry && (entry != MY_ERRPTR))
+  {
+    PFS_host *pfs;
+    pfs= *entry;
+    pfs->inc_refcount();
+    lf_hash_search_unpin(pins);
+    return pfs;
+  }
+
+  lf_hash_search_unpin(pins);
+
+  PFS_scan scan;
+  uint random= randomized_index(hostname, host_max);
+
+  for (scan.init(random, host_max);
+       scan.has_pass();
+       scan.next_pass())
+  {
+    PFS_host *pfs= host_array + scan.first();
+    PFS_host *pfs_last= host_array + scan.last();
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if (pfs->m_lock.is_free())
+      {
+        if (pfs->m_lock.free_to_dirty())
+        {
+          pfs->m_key= key;
+          if (hostname_length > 0)
+            pfs->m_hostname= &pfs->m_key.m_hash_key[0];
+          else
+            pfs->m_hostname= NULL;
+          pfs->m_hostname_length= hostname_length;
+
+          pfs->init_refcount();
+          pfs->reset_stats();
+          pfs->m_disconnected_count= 0;
+
+          int res;
+          res= lf_hash_insert(&host_hash, pins, &pfs);
+          if (likely(res == 0))
+          {
+            pfs->m_lock.dirty_to_allocated();
+            return pfs;
+          }
+
+          pfs->m_lock.dirty_to_free();
+
+          if (res > 0)
+          {
+            if (++retry_count > retry_max)
+            {
+              host_lost++;
+              return NULL;
+            }
+            goto search;
+          }
+
+          host_lost++;
+          return NULL;
+        }
+      }
+    }
+  }
+
+  host_lost++;
+  return NULL;
+}
+
+void PFS_host::aggregate()
+{
+  aggregate_waits();
+  aggregate_stages();
+  aggregate_statements();
+  aggregate_stats();
+}
+
+void PFS_host::aggregate_waits()
+{
+  /* No parent to aggregate to, clean the stats */
+  reset_waits_stats();
+}
+
+void PFS_host::aggregate_stages()
+{
+  /*
+    Aggregate EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME to:
+    -  EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME
+  */
+  aggregate_all_stages(m_instr_class_stages_stats,
+                       global_instr_class_stages_array);
+}
+
+void PFS_host::aggregate_statements()
+{
+  /*
+    Aggregate EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME to:
+    -  EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME
+  */
+  aggregate_all_statements(m_instr_class_statements_stats,
+                           global_instr_class_statements_array);
+}
+
+void PFS_host::aggregate_stats()
+{
+  /* No parent to aggregate to, clean the stats */
+  m_disconnected_count= 0;
+}
+
+void PFS_host::release()
+{
+  dec_refcount();
+}
+
+void purge_host(PFS_thread *thread, PFS_host *host)
+{
+  LF_PINS *pins= get_host_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return;
+
+  PFS_host **entry;
+  entry= reinterpret_cast<PFS_host**>
+    (lf_hash_search(&host_hash, pins,
+                    host->m_key.m_hash_key, host->m_key.m_key_length));
+  if (entry && (entry != MY_ERRPTR))
+  {
+    DBUG_ASSERT(*entry == host);
+    if (host->get_refcount() == 0)
+    {
+      lf_hash_delete(&host_hash, pins,
+                     host->m_key.m_hash_key, host->m_key.m_key_length);
+      host->m_lock.allocated_to_free();
+    }
+  }
+
+  lf_hash_search_unpin(pins);
+}
+
+/** Purge non connected hosts, reset stats of connected hosts. */
+void purge_all_host(void)
+{
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return;
+
+  PFS_host *pfs= host_array;
+  PFS_host *pfs_last= host_array + host_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+    {
+      pfs->aggregate();
+      if (pfs->get_refcount() == 0)
+        purge_host(thread, pfs);
+    }
+  }
+}
+
+/** @} */
diff --git a/storage/perfschema/pfs_host.h b/storage/perfschema/pfs_host.h
new file mode 100644
index 00000000000..d04b88e62f3
--- /dev/null
+++ b/storage/perfschema/pfs_host.h
@@ -0,0 +1,110 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef PFS_HOST_H
+#define PFS_HOST_H
+
+/**
+  @file storage/perfschema/pfs_host.h
+  Performance schema host (declarations).
+*/
+
+#include "pfs_lock.h"
+#include "lf.h"
+#include "pfs_con_slice.h"
+
+struct PFS_global_param;
+struct PFS_thread;
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+struct PFS_host_key
+{
+  /**
+    Hash search key.
+    This has to be a string for LF_HASH,
+    the format is "<hostname><0x00>"
+  */
+  char m_hash_key[HOSTNAME_LENGTH + 1];
+  uint m_key_length;
+};
+
+struct PFS_host : PFS_connection_slice
+{
+public:
+  inline void init_refcount(void)
+  {
+    PFS_atomic::store_32(& m_refcount, 1);
+  }
+
+  inline int get_refcount(void)
+  {
+    return PFS_atomic::load_32(& m_refcount);
+  }
+
+  inline void inc_refcount(void)
+  {
+    PFS_atomic::add_32(& m_refcount, 1);
+  }
+
+  inline void dec_refcount(void)
+  {
+    PFS_atomic::add_32(& m_refcount, -1);
+  }
+
+  void aggregate(void);
+  void aggregate_waits(void);
+  void aggregate_stages(void);
+  void aggregate_statements(void);
+  void aggregate_stats(void);
+  void release(void);
+
+  /* Internal lock. */
+  pfs_lock m_lock;
+  PFS_host_key m_key;
+  const char *m_hostname;
+  uint m_hostname_length;
+
+  ulonglong m_disconnected_count;
+
+private:
+  int m_refcount;
+};
+
+int init_host(const PFS_global_param *param);
+void cleanup_host(void);
+int init_host_hash(void);
+void cleanup_host_hash(void);
+
+PFS_host *find_or_create_host(PFS_thread *thread,
+                              const char *hostname, uint hostname_length);
+
+void purge_all_host(void);
+
+/* For iterators and show status. */
+
+extern ulong host_max;
+extern ulong host_lost;
+
+/* Exposing the data directly, for iterators. */
+
+extern PFS_host *host_array;
+
+/** @} */
+#endif
+
diff --git a/storage/perfschema/pfs_instr.cc b/storage/perfschema/pfs_instr.cc
index 8da1a9862e1..82e768c9be2 100644
--- a/storage/perfschema/pfs_instr.cc
+++ b/storage/perfschema/pfs_instr.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -25,7 +25,11 @@
 #include "pfs.h"
 #include "pfs_stat.h"
 #include "pfs_instr.h"
+#include "pfs_host.h"
+#include "pfs_user.h"
+#include "pfs_account.h"
 #include "pfs_global.h"
+#include "pfs_instr_class.h"
 
 /**
   @addtogroup Performance_schema_buffers
@@ -63,12 +67,21 @@ ulong file_handle_lost;
 ulong table_max;
 /** Number of table instances lost. @sa table_array */
 ulong table_lost;
+/** Size of the socket instances array. @sa socket_array */
+ulong socket_max;
+/** Number of socket instances lost. @sa socket_array */
+ulong socket_lost;
 /** Number of EVENTS_WAITS_HISTORY records per thread. */
 ulong events_waits_history_per_thread;
-/** Number of instruments class per thread. */
-ulong instr_class_per_thread;
+/** Number of EVENTS_STAGES_HISTORY records per thread. */
+ulong events_stages_history_per_thread;
+/** Number of EVENTS_STATEMENTS_HISTORY records per thread. */
+ulong events_statements_history_per_thread;
+uint statement_stack_max;
 /** Number of locker lost. @sa LOCKER_STACK_SIZE. */
 ulong locker_lost= 0;
+/** Number of statement lost. @sa STATEMENT_STACK_SIZE. */
+ulong statement_lost= 0;
 
 /**
   Mutex instrumentation instances array.
@@ -120,24 +133,35 @@ PFS_file **file_handle_array= NULL;
 */
 PFS_table *table_array= NULL;
 
+/**
+  Socket instrumentation instances array.
+  @sa socket_max
+  @sa socket_lost
+*/
+PFS_socket *socket_array= NULL;
+
+PFS_single_stat *global_instr_class_waits_array= NULL;
+PFS_stage_stat *global_instr_class_stages_array= NULL;
+PFS_statement_stat *global_instr_class_statements_array= NULL;
+
 static volatile uint32 thread_internal_id_counter= 0;
 
-static uint per_thread_rwlock_class_start;
-static uint per_thread_cond_class_start;
-static uint per_thread_file_class_start;
 static uint thread_instr_class_waits_sizing;
-static PFS_single_stat_chain *thread_instr_class_waits_array= NULL;
+static uint thread_instr_class_stages_sizing;
+static uint thread_instr_class_statements_sizing;
+static PFS_single_stat *thread_instr_class_waits_array= NULL;
+static PFS_stage_stat *thread_instr_class_stages_array= NULL;
+static PFS_statement_stat *thread_instr_class_statements_array= NULL;
 
-static PFS_events_waits *thread_history_array= NULL;
+static PFS_events_waits *thread_waits_history_array= NULL;
+static PFS_events_stages *thread_stages_history_array= NULL;
+static PFS_events_statements *thread_statements_history_array= NULL;
+static PFS_events_statements *thread_statements_stack_array= NULL;
 
 /** Hash table for instrumented files. */
 static LF_HASH filename_hash;
 /** True if filename_hash is initialized. */
 static bool filename_hash_inited= false;
-C_MODE_START
-/** Get hash table key for instrumented files. */
-static uchar *filename_hash_get_key(const uchar *, size_t *, my_bool);
-C_MODE_END
 
 /**
   Initialize all the instruments instance buffers.
@@ -146,10 +170,16 @@ C_MODE_END
 */
 int init_instruments(const PFS_global_param *param)
 {
-  uint thread_history_sizing;
+  uint thread_waits_history_sizing;
+  uint thread_stages_history_sizing;
+  uint thread_statements_history_sizing;
+  uint thread_statements_stack_sizing;
   uint index;
   DBUG_ENTER("init_instruments");
 
+  /* Make sure init_event_name_sizing is called */
+  DBUG_ASSERT(wait_class_max != 0);
+
   mutex_max= param->m_mutex_sizing;
   mutex_lost= 0;
   rwlock_max= param->m_rwlock_sizing;
@@ -164,21 +194,32 @@ int init_instruments(const PFS_global_param *param)
   table_lost= 0;
   thread_max= param->m_thread_sizing;
   thread_lost= 0;
+  socket_max= param->m_socket_sizing;
+  socket_lost= 0;
 
   events_waits_history_per_thread= param->m_events_waits_history_sizing;
-  thread_history_sizing= param->m_thread_sizing
+  thread_waits_history_sizing= param->m_thread_sizing
     * events_waits_history_per_thread;
 
-  per_thread_rwlock_class_start= param->m_mutex_class_sizing;
-  per_thread_cond_class_start= per_thread_rwlock_class_start
-    + param->m_rwlock_class_sizing;
-  per_thread_file_class_start= per_thread_cond_class_start
-    + param->m_cond_class_sizing;
-  instr_class_per_thread= per_thread_file_class_start
-    + param->m_file_class_sizing;
-
   thread_instr_class_waits_sizing= param->m_thread_sizing
-    * instr_class_per_thread;
+    * wait_class_max;
+
+  events_stages_history_per_thread= param->m_events_stages_history_sizing;
+  thread_stages_history_sizing= param->m_thread_sizing
+    * events_stages_history_per_thread;
+
+  events_statements_history_per_thread= param->m_events_statements_history_sizing;
+  thread_statements_history_sizing= param->m_thread_sizing
+    * events_statements_history_per_thread;
+
+  statement_stack_max= 1;
+  thread_statements_stack_sizing= param->m_thread_sizing * statement_stack_max;
+
+  thread_instr_class_stages_sizing= param->m_thread_sizing
+    * param->m_stage_class_sizing;
+
+  thread_instr_class_statements_sizing= param->m_thread_sizing
+    * param->m_statement_class_sizing;
 
   mutex_array= NULL;
   rwlock_array= NULL;
@@ -186,9 +227,15 @@ int init_instruments(const PFS_global_param *param)
   file_array= NULL;
   file_handle_array= NULL;
   table_array= NULL;
+  socket_array= NULL;
   thread_array= NULL;
-  thread_history_array= NULL;
+  thread_waits_history_array= NULL;
+  thread_stages_history_array= NULL;
+  thread_statements_history_array= NULL;
+  thread_statements_stack_array= NULL;
   thread_instr_class_waits_array= NULL;
+  thread_instr_class_stages_array= NULL;
+  thread_instr_class_statements_array= NULL;
   thread_internal_id_counter= 0;
 
   if (mutex_max > 0)
@@ -233,6 +280,13 @@ int init_instruments(const PFS_global_param *param)
       DBUG_RETURN(1);
   }
 
+  if (socket_max > 0)
+  {
+    socket_array= PFS_MALLOC_ARRAY(socket_max, PFS_socket, MYF(MY_ZEROFILL));
+    if (unlikely(socket_array == NULL))
+      DBUG_RETURN(1);
+  }
+
   if (thread_max > 0)
   {
     thread_array= PFS_MALLOC_ARRAY(thread_max, PFS_thread, MYF(MY_ZEROFILL));
@@ -240,12 +294,12 @@ int init_instruments(const PFS_global_param *param)
       DBUG_RETURN(1);
   }
 
-  if (thread_history_sizing > 0)
+  if (thread_waits_history_sizing > 0)
   {
-    thread_history_array=
-      PFS_MALLOC_ARRAY(thread_history_sizing, PFS_events_waits,
+    thread_waits_history_array=
+      PFS_MALLOC_ARRAY(thread_waits_history_sizing, PFS_events_waits,
                        MYF(MY_ZEROFILL));
-    if (unlikely(thread_history_array == NULL))
+    if (unlikely(thread_waits_history_array == NULL))
       DBUG_RETURN(1);
   }
 
@@ -253,146 +307,126 @@ int init_instruments(const PFS_global_param *param)
   {
     thread_instr_class_waits_array=
       PFS_MALLOC_ARRAY(thread_instr_class_waits_sizing,
-                       PFS_single_stat_chain, MYF(MY_ZEROFILL));
+                       PFS_single_stat, MYF(MY_ZEROFILL));
     if (unlikely(thread_instr_class_waits_array == NULL))
       DBUG_RETURN(1);
-  }
 
-  for (index= 0; index < thread_instr_class_waits_sizing; index++)
-  {
-    /*
-      Currently, this chain is of length 1,
-      but it's still implemented as a stat chain,
-      since more aggregations are planned to be implemented in m_parent.
-    */
-    thread_instr_class_waits_array[index].m_control_flag=
-      &flag_events_waits_summary_by_thread_by_event_name;
-    thread_instr_class_waits_array[index].m_parent= NULL;
+    for (index= 0; index < thread_instr_class_waits_sizing; index++)
+      thread_instr_class_waits_array[index].reset();
   }
 
-  for (index= 0; index < thread_max; index++)
+  if (thread_stages_history_sizing > 0)
   {
-    thread_array[index].m_waits_history=
-      &thread_history_array[index * events_waits_history_per_thread];
-    thread_array[index].m_instr_class_wait_stats=
-      &thread_instr_class_waits_array[index * instr_class_per_thread];
+    thread_stages_history_array=
+      PFS_MALLOC_ARRAY(thread_stages_history_sizing, PFS_events_stages,
+                       MYF(MY_ZEROFILL));
+    if (unlikely(thread_stages_history_array == NULL))
+      DBUG_RETURN(1);
   }
 
-  DBUG_RETURN(0);
-}
-
-/**
-  Find the per-thread wait statistics for a mutex class.
-  @param thread                       input thread
-  @param klass                        mutex class
-  @return the per thread per mutex class wait stat
-*/
-PFS_single_stat_chain *
-find_per_thread_mutex_class_wait_stat(PFS_thread *thread,
-                                      PFS_mutex_class *klass)
-{
-  PFS_single_stat_chain *stat;
-  uint index;
-  DBUG_ENTER("find_per_thread_mutex_class_wait_stat");
+  if (thread_instr_class_stages_sizing > 0)
+  {
+    thread_instr_class_stages_array=
+      PFS_MALLOC_ARRAY(thread_instr_class_stages_sizing,
+                       PFS_stage_stat, MYF(MY_ZEROFILL));
+    if (unlikely(thread_instr_class_stages_array == NULL))
+      DBUG_RETURN(1);
 
-  DBUG_ASSERT(thread != NULL);
-  DBUG_ASSERT(klass != NULL);
-  index= klass->m_index;
-  DBUG_ASSERT(index < mutex_class_max);
+    for (index= 0; index < thread_instr_class_stages_sizing; index++)
+      thread_instr_class_stages_array[index].reset();
+  }
 
-  stat= &(thread->m_instr_class_wait_stats[index]);
-  DBUG_RETURN(stat);
-}
+  if (thread_statements_history_sizing > 0)
+  {
+    thread_statements_history_array=
+      PFS_MALLOC_ARRAY(thread_statements_history_sizing, PFS_events_statements,
+                       MYF(MY_ZEROFILL));
+    if (unlikely(thread_statements_history_array == NULL))
+      DBUG_RETURN(1);
+  }
 
-/**
-  Find the per-thread wait statistics for a rwlock class.
-  @param thread                       input thread
-  @param klass                        rwlock class
-  @return the per thread per rwlock class wait stat
-*/
-PFS_single_stat_chain *
-find_per_thread_rwlock_class_wait_stat(PFS_thread *thread,
-                                       PFS_rwlock_class *klass)
-{
-  PFS_single_stat_chain *stat;
-  uint index;
-  DBUG_ENTER("find_per_thread_rwlock_class_wait_stat");
+  if (thread_statements_stack_sizing > 0)
+  {
+    thread_statements_stack_array=
+      PFS_MALLOC_ARRAY(thread_statements_stack_sizing, PFS_events_statements,
+                       MYF(MY_ZEROFILL));
+    if (unlikely(thread_statements_stack_array == NULL))
+      DBUG_RETURN(1);
+  }
 
-  DBUG_ASSERT(thread != NULL);
-  DBUG_ASSERT(klass != NULL);
-  index= klass->m_index;
-  DBUG_ASSERT(index < rwlock_class_max);
+  if (thread_instr_class_statements_sizing > 0)
+  {
+    thread_instr_class_statements_array=
+      PFS_MALLOC_ARRAY(thread_instr_class_statements_sizing,
+                       PFS_statement_stat, MYF(MY_ZEROFILL));
+    if (unlikely(thread_instr_class_statements_array == NULL))
+      DBUG_RETURN(1);
 
-  stat= &(thread->m_instr_class_wait_stats
-          [per_thread_rwlock_class_start + index]);
-  DBUG_RETURN(stat);
-}
+    for (index= 0; index < thread_instr_class_statements_sizing; index++)
+      thread_instr_class_statements_array[index].reset();
+  }
 
-/**
-  Find the per-thread wait statistics for a condition class.
-  @param thread                       input thread
-  @param klass                        condition class
-  @return the per thread per condition class wait stat
-*/
-PFS_single_stat_chain *
-find_per_thread_cond_class_wait_stat(PFS_thread *thread,
-                                     PFS_cond_class *klass)
-{
-  PFS_single_stat_chain *stat;
-  uint index;
-  DBUG_ENTER("find_per_thread_cond_class_wait_stat");
+  for (index= 0; index < thread_max; index++)
+  {
+    thread_array[index].m_waits_history=
+      &thread_waits_history_array[index * events_waits_history_per_thread];
+    thread_array[index].m_instr_class_waits_stats=
+      &thread_instr_class_waits_array[index * wait_class_max];
+    thread_array[index].m_stages_history=
+      &thread_stages_history_array[index * events_stages_history_per_thread];
+    thread_array[index].m_instr_class_stages_stats=
+      &thread_instr_class_stages_array[index * stage_class_max];
+    thread_array[index].m_statements_history=
+      &thread_statements_history_array[index * events_statements_history_per_thread];
+    thread_array[index].m_statement_stack=
+      &thread_statements_stack_array[index * statement_stack_max];
+    thread_array[index].m_instr_class_statements_stats=
+      &thread_instr_class_statements_array[index * statement_class_max];
+  }
 
-  DBUG_ASSERT(thread != NULL);
-  DBUG_ASSERT(klass != NULL);
-  index= klass->m_index;
-  DBUG_ASSERT(index < cond_class_max);
+  if (wait_class_max > 0)
+  {
+    global_instr_class_waits_array=
+      PFS_MALLOC_ARRAY(wait_class_max,
+                       PFS_single_stat, MYF(MY_ZEROFILL));
+    if (unlikely(global_instr_class_waits_array == NULL))
+      DBUG_RETURN(1);
 
-  stat= &(thread->m_instr_class_wait_stats
-          [per_thread_cond_class_start + index]);
-  DBUG_RETURN(stat);
-}
+    for (index= 0; index < wait_class_max; index++)
+      global_instr_class_waits_array[index].reset();
+  }
 
-/**
-  Find the per-thread wait statistics for a file class.
-  @param thread                       input thread
-  @param klass                        file class
-  @return the per thread per file class wait stat
-*/
-PFS_single_stat_chain *
-find_per_thread_file_class_wait_stat(PFS_thread *thread,
-                                     PFS_file_class *klass)
-{
-  PFS_single_stat_chain *stat;
-  uint index;
-  DBUG_ENTER("find_per_thread_file_class_wait_stat");
+  if (stage_class_max > 0)
+  {
+    global_instr_class_stages_array=
+      PFS_MALLOC_ARRAY(stage_class_max,
+                       PFS_stage_stat, MYF(MY_ZEROFILL));
+    if (unlikely(global_instr_class_stages_array == NULL))
+      DBUG_RETURN(1);
 
-  DBUG_ASSERT(thread != NULL);
-  DBUG_ASSERT(klass != NULL);
-  index= klass->m_index;
-  DBUG_ASSERT(index < file_class_max);
+    for (index= 0; index < stage_class_max; index++)
+      global_instr_class_stages_array[index].reset();
+  }
 
-  stat= &(thread->m_instr_class_wait_stats
-          [per_thread_file_class_start + index]);
-  DBUG_RETURN(stat);
-}
+  if (statement_class_max > 0)
+  {
+    global_instr_class_statements_array=
+      PFS_MALLOC_ARRAY(statement_class_max,
+                       PFS_statement_stat, MYF(MY_ZEROFILL));
+    if (unlikely(global_instr_class_statements_array == NULL))
+      DBUG_RETURN(1);
 
-/** Reset the wait statistics per thread. */
-void reset_per_thread_wait_stat(void)
-{
-  PFS_single_stat_chain *stat= thread_instr_class_waits_array;
-  PFS_single_stat_chain *stat_last= stat + thread_instr_class_waits_sizing;
-  DBUG_ENTER("reset_per_thread_wait_stat");
+    for (index= 0; index < statement_class_max; index++)
+      global_instr_class_statements_array[index].reset();
+  }
 
-  for ( ; stat < stat_last; stat++)
-    reset_single_stat_link(stat);
-  DBUG_VOID_RETURN;
+  DBUG_RETURN(0);
 }
 
 /** Cleanup all the instruments buffers. */
 void cleanup_instruments(void)
 {
   DBUG_ENTER("cleanup_instruments");
-
   pfs_free(mutex_array);
   mutex_array= NULL;
   mutex_max= 0;
@@ -411,18 +445,33 @@ void cleanup_instruments(void)
   pfs_free(table_array);
   table_array= NULL;
   table_max= 0;
+  pfs_free(socket_array);
+  socket_array= NULL;
+  socket_max= 0;
   pfs_free(thread_array);
   thread_array= NULL;
   thread_max= 0;
-  pfs_free(thread_history_array);
-  thread_history_array= NULL;
+  pfs_free(thread_waits_history_array);
+  thread_waits_history_array= NULL;
+  pfs_free(thread_stages_history_array);
+  thread_stages_history_array= NULL;
+  pfs_free(thread_statements_history_array);
+  thread_statements_history_array= NULL;
+  pfs_free(thread_statements_stack_array);
+  thread_statements_stack_array= NULL;
   pfs_free(thread_instr_class_waits_array);
   thread_instr_class_waits_array= NULL;
+  pfs_free(global_instr_class_waits_array);
+  global_instr_class_waits_array= NULL;
+  pfs_free(global_instr_class_stages_array);
+  global_instr_class_stages_array= NULL;
+  pfs_free(global_instr_class_statements_array);
+  global_instr_class_statements_array= NULL;
   DBUG_VOID_RETURN;
 }
 
-extern "C"
-{
+C_MODE_START
+/** Get hash table key for instrumented files. */
 static uchar *filename_hash_get_key(const uchar *entry, size_t *length,
                                     my_bool)
 {
@@ -439,7 +488,7 @@ static uchar *filename_hash_get_key(const uchar *entry, size_t *length,
   result= file->m_filename;
   DBUG_RETURN(const_cast<uchar*> (reinterpret_cast<const uchar*> (result)));
 }
-}
+C_MODE_END
 
 /**
   Initialize the file name hash.
@@ -551,37 +600,52 @@ void PFS_scan::init(uint random, uint max_size)
 */
 PFS_mutex* create_mutex(PFS_mutex_class *klass, const void *identity)
 {
-  PFS_scan scan;
-  uint random= randomized_index(identity, mutex_max);
+  static uint mutex_monotonic_index= 0;
+  uint index;
+  uint attempts= 0;
+  PFS_mutex *pfs;
   DBUG_ENTER("create_mutex");
 
-  for (scan.init(random, mutex_max);
-       scan.has_pass();
-       scan.next_pass())
+  while (++attempts <= mutex_max)
   {
-    PFS_mutex *pfs= mutex_array + scan.first();
-    PFS_mutex *pfs_last= mutex_array + scan.last();
-    for ( ; pfs < pfs_last; pfs++)
+    /*
+      Problem:
+      Multiple threads running concurrently may need to create a new
+      instrumented mutex, and find an empty slot in mutex_array[].
+      With N1 threads running on a N2 core hardware:
+      - up to N2 hardware threads can run concurrently,
+      causing contention if looking at the same array[i] slot.
+      - up to N1 threads can run almost concurrently (with thread scheduling),
+      scanning maybe overlapping regions in the [0-mutex_max] array.
+
+      Solution:
+      Instead of letting different threads compete on the same array[i] entry,
+      this code forces all threads to cooperate with the monotonic_index.
+      Only one thread will be allowed to test a given array[i] slot.
+      All threads do scan from the same region, starting at monotonic_index.
+      Serializing on monotonic_index ensures that when a slot is found occupied
+      in a given loop by a given thread, other threads will not attempt this
+      slot.
+    */
+    index= PFS_atomic::add_u32(& mutex_monotonic_index, 1) % mutex_max;
+    pfs= mutex_array + index;
+
+    if (pfs->m_lock.is_free())
     {
-      if (pfs->m_lock.is_free())
+      if (pfs->m_lock.free_to_dirty())
       {
-        if (pfs->m_lock.free_to_dirty())
-        {
-          pfs->m_identity= identity;
-          pfs->m_class= klass;
-          pfs->m_wait_stat.m_control_flag=
-            &flag_events_waits_summary_by_instance;
-          pfs->m_wait_stat.m_parent= &klass->m_wait_stat;
-          reset_single_stat_link(&pfs->m_wait_stat);
-          pfs->m_lock_stat.m_control_flag=
-            &flag_events_locks_summary_by_instance;
-          pfs->m_lock_stat.m_parent= &klass->m_lock_stat;
-          reset_single_stat_link(&pfs->m_lock_stat);
-          pfs->m_owner= NULL;
-          pfs->m_last_locked= 0;
-          pfs->m_lock.dirty_to_allocated();
-          DBUG_RETURN(pfs);
-        }
+        pfs->m_identity= identity;
+        pfs->m_class= klass;
+        pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+        pfs->m_timed= klass->m_timed;
+        pfs->m_wait_stat.reset();
+        pfs->m_lock_stat.reset();
+        pfs->m_owner= NULL;
+        pfs->m_last_locked= 0;
+        pfs->m_lock.dirty_to_allocated();
+        if (klass->is_singleton())
+          klass->m_singleton= pfs;
+        DBUG_RETURN(pfs);
       }
     }
   }
@@ -597,8 +661,14 @@ PFS_mutex* create_mutex(PFS_mutex_class *klass, const void *identity)
 void destroy_mutex(PFS_mutex *pfs)
 {
   DBUG_ENTER("destroy_mutex");
-
   DBUG_ASSERT(pfs != NULL);
+  PFS_mutex_class *klass= pfs->m_class;
+  /* Aggregate to EVENTS_WAITS_SUMMARY_BY_EVENT_NAME */
+  uint index= klass->m_event_name_index;
+  global_instr_class_waits_array[index].aggregate(& pfs->m_wait_stat);
+  pfs->m_wait_stat.reset();
+  if (klass->is_singleton())
+    klass->m_singleton= NULL;
   pfs->m_lock.allocated_to_free();
   DBUG_VOID_RETURN;
 }
@@ -611,43 +681,37 @@ void destroy_mutex(PFS_mutex *pfs)
 */
 PFS_rwlock* create_rwlock(PFS_rwlock_class *klass, const void *identity)
 {
-  PFS_scan scan;
-  uint random= randomized_index(identity, rwlock_max);
+  static uint rwlock_monotonic_index= 0;
+  uint index;
+  uint attempts= 0;
+  PFS_rwlock *pfs;
   DBUG_ENTER("create_rwlock");
 
-  for (scan.init(random, rwlock_max);
-       scan.has_pass();
-       scan.next_pass())
+  while (++attempts <= rwlock_max)
   {
-    PFS_rwlock *pfs= rwlock_array + scan.first();
-    PFS_rwlock *pfs_last= rwlock_array + scan.last();
-    for ( ; pfs < pfs_last; pfs++)
+    /* See create_mutex() */
+    index= PFS_atomic::add_u32(& rwlock_monotonic_index, 1) % rwlock_max;
+    pfs= rwlock_array + index;
+
+    if (pfs->m_lock.is_free())
     {
-      if (pfs->m_lock.is_free())
+      if (pfs->m_lock.free_to_dirty())
       {
-        if (pfs->m_lock.free_to_dirty())
-        {
-          pfs->m_identity= identity;
-          pfs->m_class= klass;
-          pfs->m_wait_stat.m_control_flag=
-            &flag_events_waits_summary_by_instance;
-          pfs->m_wait_stat.m_parent= &klass->m_wait_stat;
-          reset_single_stat_link(&pfs->m_wait_stat);
-          pfs->m_lock.dirty_to_allocated();
-          pfs->m_read_lock_stat.m_control_flag=
-            &flag_events_locks_summary_by_instance;
-          pfs->m_read_lock_stat.m_parent= &klass->m_read_lock_stat;
-          reset_single_stat_link(&pfs->m_read_lock_stat);
-          pfs->m_write_lock_stat.m_control_flag=
-            &flag_events_locks_summary_by_instance;
-          pfs->m_write_lock_stat.m_parent= &klass->m_write_lock_stat;
-          reset_single_stat_link(&pfs->m_write_lock_stat);
-          pfs->m_writer= NULL;
-          pfs->m_readers= 0;
-          pfs->m_last_written= 0;
-          pfs->m_last_read= 0;
-          DBUG_RETURN(pfs);
-        }
+        pfs->m_identity= identity;
+        pfs->m_class= klass;
+        pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+        pfs->m_timed= klass->m_timed;
+        pfs->m_wait_stat.reset();
+        pfs->m_lock.dirty_to_allocated();
+        pfs->m_read_lock_stat.reset();
+        pfs->m_write_lock_stat.reset();
+        pfs->m_writer= NULL;
+        pfs->m_readers= 0;
+        pfs->m_last_written= 0;
+        pfs->m_last_read= 0;
+        if (klass->is_singleton())
+          klass->m_singleton= pfs;
+        DBUG_RETURN(pfs);
       }
     }
   }
@@ -663,8 +727,14 @@ PFS_rwlock* create_rwlock(PFS_rwlock_class *klass, const void *identity)
 void destroy_rwlock(PFS_rwlock *pfs)
 {
   DBUG_ENTER("destroy_rwlock");
-
   DBUG_ASSERT(pfs != NULL);
+  PFS_rwlock_class *klass= pfs->m_class;
+  /* Aggregate to EVENTS_WAITS_SUMMARY_BY_EVENT_NAME */
+  uint index= klass->m_event_name_index;
+  global_instr_class_waits_array[index].aggregate(& pfs->m_wait_stat);
+  pfs->m_wait_stat.reset();
+  if (klass->is_singleton())
+    klass->m_singleton= NULL;
   pfs->m_lock.allocated_to_free();
   DBUG_VOID_RETURN;
 }
@@ -677,33 +747,33 @@ void destroy_rwlock(PFS_rwlock *pfs)
 */
 PFS_cond* create_cond(PFS_cond_class *klass, const void *identity)
 {
-  PFS_scan scan;
-  uint random= randomized_index(identity, cond_max);
+  static uint cond_monotonic_index= 0;
+  uint index;
+  uint attempts= 0;
+  PFS_cond *pfs;
   DBUG_ENTER("create_cond");
 
-  for (scan.init(random, cond_max);
-       scan.has_pass();
-       scan.next_pass())
+  while (++attempts <= cond_max)
   {
-    PFS_cond *pfs= cond_array + scan.first();
-    PFS_cond *pfs_last= cond_array + scan.last();
-    for ( ; pfs < pfs_last; pfs++)
+    /* See create_mutex() */
+    index= PFS_atomic::add_u32(& cond_monotonic_index, 1) % cond_max;
+    pfs= cond_array + index;
+
+    if (pfs->m_lock.is_free())
     {
-      if (pfs->m_lock.is_free())
+      if (pfs->m_lock.free_to_dirty())
       {
-        if (pfs->m_lock.free_to_dirty())
-        {
-          pfs->m_identity= identity;
-          pfs->m_class= klass;
-          pfs->m_cond_stat.m_signal_count= 0;
-          pfs->m_cond_stat.m_broadcast_count= 0;
-          pfs->m_wait_stat.m_control_flag=
-            &flag_events_waits_summary_by_instance;
-          pfs->m_wait_stat.m_parent= &klass->m_wait_stat;
-          reset_single_stat_link(&pfs->m_wait_stat);
-          pfs->m_lock.dirty_to_allocated();
-          DBUG_RETURN(pfs);
-        }
+        pfs->m_identity= identity;
+        pfs->m_class= klass;
+        pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+        pfs->m_timed= klass->m_timed;
+        pfs->m_cond_stat.m_signal_count= 0;
+        pfs->m_cond_stat.m_broadcast_count= 0;
+        pfs->m_wait_stat.reset();
+        pfs->m_lock.dirty_to_allocated();
+        if (klass->is_singleton())
+          klass->m_singleton= pfs;
+        DBUG_RETURN(pfs);
       }
     }
   }
@@ -718,13 +788,26 @@ PFS_cond* create_cond(PFS_cond_class *klass, const void *identity)
 */
 void destroy_cond(PFS_cond *pfs)
 {
-  DBUG_ENTER("destroy_cond");
+  DBUG_ENTER("destroy_thread");
 
   DBUG_ASSERT(pfs != NULL);
+  PFS_cond_class *klass= pfs->m_class;
+  /* Aggregate to EVENTS_WAITS_SUMMARY_BY_EVENT_NAME */
+  uint index= klass->m_event_name_index;
+  global_instr_class_waits_array[index].aggregate(& pfs->m_wait_stat);
+  pfs->m_wait_stat.reset();
+  if (klass->is_singleton())
+    klass->m_singleton= NULL;
   pfs->m_lock.allocated_to_free();
   DBUG_VOID_RETURN;
 }
 
+PFS_thread* PFS_thread::get_current_thread()
+{
+  PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
+  return pfs;
+}
+
 /**
   Create instrumentation for a thread instance.
   @param klass                        the thread class
@@ -737,41 +820,127 @@ void destroy_cond(PFS_cond *pfs)
 PFS_thread* create_thread(PFS_thread_class *klass, const void *identity,
                           ulong thread_id)
 {
-  PFS_scan scan;
-  uint random= randomized_index(identity, thread_max);
+  static uint thread_monotonic_index= 0;
+  uint index;
+  uint attempts= 0;
+  PFS_thread *pfs;
   DBUG_ENTER("create_thread");
 
-  for (scan.init(random, thread_max);
-       scan.has_pass();
-       scan.next_pass())
+  while (++attempts <= thread_max)
   {
-    PFS_thread *pfs= thread_array + scan.first();
-    PFS_thread *pfs_last= thread_array + scan.last();
-    for ( ; pfs < pfs_last; pfs++)
+    /* See create_mutex() */
+    index= PFS_atomic::add_u32(& thread_monotonic_index, 1) % thread_max;
+    pfs= thread_array + index;
+
+    if (pfs->m_lock.is_free())
     {
-      if (pfs->m_lock.is_free())
+      if (pfs->m_lock.free_to_dirty())
       {
-        if (pfs->m_lock.free_to_dirty())
+        pfs->m_thread_internal_id=
+          PFS_atomic::add_u32(&thread_internal_id_counter, 1);
+        pfs->m_parent_thread_internal_id= 0;
+        pfs->m_thread_id= thread_id;
+        pfs->m_event_id= 1;
+        pfs->m_enabled= true;
+        pfs->m_class= klass;
+        pfs->m_events_waits_current= & pfs->m_events_waits_stack[WAIT_STACK_BOTTOM];
+        pfs->m_waits_history_full= false;
+        pfs->m_waits_history_index= 0;
+        pfs->m_stages_history_full= false;
+        pfs->m_stages_history_index= 0;
+        pfs->m_statements_history_full= false;
+        pfs->m_statements_history_index= 0;
+
+        pfs->reset_stats();
+
+        pfs->m_filename_hash_pins= NULL;
+        pfs->m_table_share_hash_pins= NULL;
+        pfs->m_setup_actor_hash_pins= NULL;
+        pfs->m_setup_object_hash_pins= NULL;
+        pfs->m_user_hash_pins= NULL;
+        pfs->m_account_hash_pins= NULL;
+        pfs->m_host_hash_pins= NULL;
+        pfs->m_digest_hash_pins= NULL;
+
+        pfs->m_username_length= 0;
+        pfs->m_hostname_length= 0;
+        pfs->m_dbname_length= 0;
+        pfs->m_command= 0;
+        pfs->m_start_time= 0;
+        pfs->m_processlist_state_length= 0;
+        pfs->m_processlist_info_length= 0;
+
+        pfs->m_host= NULL;
+        pfs->m_user= NULL;
+        pfs->m_account= NULL;
+        set_thread_account(pfs);
+
+        PFS_events_waits *child_wait;
+        for (index= 0; index < WAIT_STACK_SIZE; index++)
         {
-          pfs->m_thread_internal_id=
-            PFS_atomic::add_u32(&thread_internal_id_counter, 1);
-          pfs->m_thread_id= thread_id;
-          pfs->m_event_id= 1;
-          pfs->m_enabled= true;
-          pfs->m_class= klass;
-          pfs->m_wait_locker_count= 0;
-          pfs->m_waits_history_full= false;
-          pfs->m_waits_history_index= 0;
-
-          PFS_single_stat_chain *stat= pfs->m_instr_class_wait_stats;
-          PFS_single_stat_chain *stat_last= stat + instr_class_per_thread;
-          for ( ; stat < stat_last; stat++)
-            reset_single_stat_link(stat);
-          pfs->m_filename_hash_pins= NULL;
-          pfs->m_table_share_hash_pins= NULL;
-          pfs->m_lock.dirty_to_allocated();
-          DBUG_RETURN(pfs);
+          child_wait= & pfs->m_events_waits_stack[index];
+          child_wait->m_thread_internal_id= pfs->m_thread_internal_id;
+          child_wait->m_event_id= 0;
+          child_wait->m_end_event_id= 0;
+          child_wait->m_event_type= EVENT_TYPE_STATEMENT;
+          child_wait->m_wait_class= NO_WAIT_CLASS;
         }
+
+        PFS_events_stages *child_stage= & pfs->m_stage_current;
+        child_stage->m_thread_internal_id= pfs->m_thread_internal_id;
+        child_stage->m_event_id= 0;
+        child_stage->m_end_event_id= 0;
+        child_stage->m_event_type= EVENT_TYPE_STATEMENT;
+        child_stage->m_class= NULL;
+        child_stage->m_timer_start= 0;
+        child_stage->m_timer_end= 0;
+        child_stage->m_source_file= NULL;
+        child_stage->m_source_line= 0;
+
+        PFS_events_statements *child_statement;
+        for (index= 0; index < statement_stack_max; index++)
+        {
+          child_statement= & pfs->m_statement_stack[index];
+          child_statement->m_thread_internal_id= pfs->m_thread_internal_id;
+          child_statement->m_event_id= 0;
+          child_statement->m_end_event_id= 0;
+          child_statement->m_event_type= EVENT_TYPE_STATEMENT;
+          child_statement->m_class= NULL;
+          child_statement->m_timer_start= 0;
+          child_statement->m_timer_end= 0;
+          child_statement->m_lock_time= 0;
+          child_statement->m_source_file= NULL;
+          child_statement->m_source_line= 0;
+          child_statement->m_current_schema_name_length= 0;
+          child_statement->m_sqltext_length= 0;
+
+          child_statement->m_message_text[0]= '\0';
+          child_statement->m_sql_errno= 0;
+          child_statement->m_sqlstate[0]= '\0';
+          child_statement->m_error_count= 0;
+          child_statement->m_warning_count= 0;
+          child_statement->m_rows_affected= 0;
+
+          child_statement->m_rows_sent= 0;
+          child_statement->m_rows_examined= 0;
+          child_statement->m_created_tmp_disk_tables= 0;
+          child_statement->m_created_tmp_tables= 0;
+          child_statement->m_select_full_join= 0;
+          child_statement->m_select_full_range_join= 0;
+          child_statement->m_select_range= 0;
+          child_statement->m_select_range_check= 0;
+          child_statement->m_select_scan= 0;
+          child_statement->m_sort_merge_passes= 0;
+          child_statement->m_sort_range= 0;
+          child_statement->m_sort_rows= 0;
+          child_statement->m_sort_scan= 0;
+          child_statement->m_no_index_used= 0;
+          child_statement->m_no_good_index_used= 0;
+        }
+        pfs->m_events_statements_count= 0;
+
+        pfs->m_lock.dirty_to_allocated();
+        DBUG_RETURN(pfs);
       }
     }
   }
@@ -780,6 +949,21 @@ PFS_thread* create_thread(PFS_thread_class *klass, const void *identity,
   DBUG_RETURN(NULL);
 }
 
+PFS_mutex *sanitize_mutex(PFS_mutex *unsafe)
+{
+  SANITIZE_ARRAY_BODY(PFS_mutex, mutex_array, mutex_max, unsafe);
+}
+
+PFS_rwlock *sanitize_rwlock(PFS_rwlock *unsafe)
+{
+  SANITIZE_ARRAY_BODY(PFS_rwlock, rwlock_array, rwlock_max, unsafe);
+}
+
+PFS_cond *sanitize_cond(PFS_cond *unsafe)
+{
+  SANITIZE_ARRAY_BODY(PFS_cond, cond_array, cond_max, unsafe);
+}
+
 /**
   Sanitize a PFS_thread pointer.
   Validate that the PFS_thread is part of thread_array.
@@ -794,25 +978,14 @@ PFS_thread *sanitize_thread(PFS_thread *unsafe)
   SANITIZE_ARRAY_BODY(PFS_thread, thread_array, thread_max, unsafe);
 }
 
-const char *sanitize_file_name(const char *unsafe)
+PFS_file *sanitize_file(PFS_file *unsafe)
 {
-  intptr ptr= (intptr) unsafe;
-  intptr first= (intptr) &file_array[0];
-  intptr last= (intptr) &file_array[file_max];
-  DBUG_ENTER("sanitize_file_name");
+  SANITIZE_ARRAY_BODY(PFS_file, file_array, file_max, unsafe);
+}
 
-  /* Check if unsafe points inside file_array[] */
-  if (likely((first <= ptr) && (ptr < last)))
-  {
-    /* Check if unsafe points to PFS_file::m_filename */
-    intptr offset= (ptr - first) % sizeof(PFS_file);
-    intptr valid_offset= my_offsetof(PFS_file, m_filename[0]);
-    if (likely(offset == valid_offset))
-    {   
-      DBUG_RETURN(unsafe);
-    }   
-  }
-  DBUG_RETURN(NULL);
+PFS_socket *sanitize_socket(PFS_socket *unsafe)
+{
+  SANITIZE_ARRAY_BODY(PFS_socket, socket_array, socket_max, unsafe);
 }
 
 /**
@@ -824,6 +997,26 @@ void destroy_thread(PFS_thread *pfs)
   DBUG_ENTER("destroy_thread");
 
   DBUG_ASSERT(pfs != NULL);
+  if (pfs->m_account != NULL)
+  {
+    pfs->m_account->release();
+    pfs->m_account= NULL;
+    DBUG_ASSERT(pfs->m_user == NULL);
+    DBUG_ASSERT(pfs->m_host == NULL);
+  }
+  else
+  {
+    if (pfs->m_user != NULL)
+    {
+      pfs->m_user->release();
+      pfs->m_user= NULL;
+    }
+    if (pfs->m_host != NULL)
+    {
+      pfs->m_host->release();
+      pfs->m_host= NULL;
+    }
+  }
   if (pfs->m_filename_hash_pins)
   {
     lf_hash_put_pins(pfs->m_filename_hash_pins);
@@ -834,6 +1027,36 @@ void destroy_thread(PFS_thread *pfs)
     lf_hash_put_pins(pfs->m_table_share_hash_pins);
     pfs->m_table_share_hash_pins= NULL;
   }
+  if (pfs->m_setup_actor_hash_pins)
+  {
+    lf_hash_put_pins(pfs->m_setup_actor_hash_pins);
+    pfs->m_setup_actor_hash_pins= NULL;
+  }
+  if (pfs->m_setup_object_hash_pins)
+  {
+    lf_hash_put_pins(pfs->m_setup_object_hash_pins);
+    pfs->m_setup_object_hash_pins= NULL;
+  }
+  if (pfs->m_user_hash_pins)
+  {
+    lf_hash_put_pins(pfs->m_user_hash_pins);
+    pfs->m_user_hash_pins= NULL;
+  }
+  if (pfs->m_account_hash_pins)
+  {
+    lf_hash_put_pins(pfs->m_account_hash_pins);
+    pfs->m_account_hash_pins= NULL;
+  }
+  if (pfs->m_host_hash_pins)
+  {
+    lf_hash_put_pins(pfs->m_host_hash_pins);
+    pfs->m_host_hash_pins= NULL;
+  }
+  if (pfs->m_digest_hash_pins)
+  {
+    lf_hash_put_pins(pfs->m_digest_hash_pins);
+    pfs->m_digest_hash_pins= NULL;
+  }
   pfs->m_lock.allocated_to_free();
   DBUG_VOID_RETURN;
 }
@@ -867,19 +1090,18 @@ find_or_create_file(PFS_thread *thread, PFS_file_class *klass,
                     const char *filename, uint len)
 {
   PFS_file *pfs;
-  PFS_scan scan;
+  LF_PINS *pins;
+  char safe_buffer[FN_REFLEN];
+  const char *safe_filename;
   DBUG_ENTER("find_or_create_file");
 
-  LF_PINS *pins= get_filename_hash_pins(thread);
+  pins= get_filename_hash_pins(thread);
   if (unlikely(pins == NULL))
   {
     file_lost++;
     DBUG_RETURN(NULL);
   }
 
-  char safe_buffer[FN_REFLEN];
-  const char *safe_filename;
-
   if (len >= FN_REFLEN)
   {
     /*
@@ -948,7 +1170,7 @@ find_or_create_file(PFS_thread *thread, PFS_file_class *klass,
   /* Append the unresolved file name to the resolved path */
   char *ptr= buffer + strlen(buffer);
   char *buf_end= &buffer[sizeof(buffer)-1];
-  if ((buf_end > ptr) && (*(ptr-1) != FN_LIBCHAR))
+  if (buf_end > ptr)
     *ptr++= FN_LIBCHAR;
   if (buf_end > ptr)
     strncpy(ptr, safe_filename + dirlen, buf_end - ptr);
@@ -960,7 +1182,12 @@ find_or_create_file(PFS_thread *thread, PFS_file_class *klass,
   PFS_file **entry;
   uint retry_count= 0;
   const uint retry_max= 3;
+  static uint file_monotonic_index= 0;
+  uint index;
+  uint attempts= 0;
+
 search:
+
   entry= reinterpret_cast<PFS_file**>
     (lf_hash_search(&filename_hash, pins,
                     normalized_filename, normalized_length));
@@ -974,58 +1201,55 @@ search:
 
   lf_hash_search_unpin(pins);
 
-  /* filename is not constant, just using it for noise on create */
-  uint random= randomized_index(filename, file_max);
-
-  for (scan.init(random, file_max);
-       scan.has_pass();
-       scan.next_pass())
+  while (++attempts <= file_max)
   {
-    pfs= file_array + scan.first();
-    PFS_file *pfs_last= file_array + scan.last();
-    for ( ; pfs < pfs_last; pfs++)
+    /* See create_mutex() */
+    index= PFS_atomic::add_u32(& file_monotonic_index, 1) % file_max;
+    pfs= file_array + index;
+
+    if (pfs->m_lock.is_free())
     {
-      if (pfs->m_lock.is_free())
+      if (pfs->m_lock.free_to_dirty())
       {
-        if (pfs->m_lock.free_to_dirty())
+        pfs->m_class= klass;
+        pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+        pfs->m_timed= klass->m_timed;
+        strncpy(pfs->m_filename, normalized_filename, normalized_length);
+        pfs->m_filename[normalized_length]= '\0';
+        pfs->m_filename_length= normalized_length;
+        pfs->m_wait_stat.reset();
+        pfs->m_file_stat.m_open_count= 1;
+        pfs->m_file_stat.m_io_stat.reset();
+        pfs->m_identity= (const void *)pfs;
+
+        int res;
+        res= lf_hash_insert(&filename_hash, thread->m_filename_hash_pins,
+                            &pfs);
+        if (likely(res == 0))
         {
-          pfs->m_class= klass;
-          strncpy(pfs->m_filename, normalized_filename, normalized_length);
-          pfs->m_filename[normalized_length]= '\0';
-          pfs->m_filename_length= normalized_length;
-          pfs->m_file_stat.m_open_count= 1;
-          pfs->m_wait_stat.m_control_flag=
-            &flag_events_waits_summary_by_instance;
-          pfs->m_wait_stat.m_parent= &klass->m_wait_stat;
-          reset_single_stat_link(&pfs->m_wait_stat);
-
-          int res;
-          res= lf_hash_insert(&filename_hash, pins,
-                              &pfs);
-          if (likely(res == 0))
-          {
-            pfs->m_lock.dirty_to_allocated();
-            DBUG_RETURN(pfs);
-          }
+          pfs->m_lock.dirty_to_allocated();
+          if (klass->is_singleton())
+            klass->m_singleton= pfs;
+          DBUG_RETURN(pfs);
+        }
 
-          pfs->m_lock.dirty_to_free();
+        pfs->m_lock.dirty_to_free();
 
-          if (res > 0)
+        if (res > 0)
+        {
+          /* Duplicate insert by another thread */
+          if (++retry_count > retry_max)
           {
-            /* Duplicate insert by another thread */
-            if (++retry_count > retry_max)
-            {
-              /* Avoid infinite loops */
-              file_lost++;
-              DBUG_RETURN(NULL);
-            }
-            goto search;
+            /* Avoid infinite loops */
+            file_lost++;
+            DBUG_RETURN(NULL);
           }
-
-          /* OOM in lf_hash_insert */
-          file_lost++;
-          DBUG_RETURN(NULL);
+          goto search;
         }
+
+        /* OOM in lf_hash_insert */
+        file_lost++;
+        return NULL;
       }
     }
   }
@@ -1058,12 +1282,27 @@ void destroy_file(PFS_thread *thread, PFS_file *pfs)
 
   DBUG_ASSERT(thread != NULL);
   DBUG_ASSERT(pfs != NULL);
+  PFS_file_class *klass= pfs->m_class;
+
+  /* Aggregate to EVENTS_WAITS_SUMMARY_BY_EVENT_NAME */
+  uint index= klass->m_event_name_index;
+  global_instr_class_waits_array[index].aggregate(& pfs->m_wait_stat);
+  pfs->m_wait_stat.reset();
+
+  /* Aggregate to FILE_SUMMARY_BY_EVENT_NAME */
+  klass->m_file_stat.m_io_stat.aggregate(& pfs->m_file_stat.m_io_stat);
+  pfs->m_file_stat.m_io_stat.reset();
+
+  if (klass->is_singleton())
+    klass->m_singleton= NULL;
 
   LF_PINS *pins= get_filename_hash_pins(thread);
   DBUG_ASSERT(pins != NULL);
 
   lf_hash_delete(&filename_hash, pins,
                  pfs->m_filename, pfs->m_filename_length);
+  if (klass->is_singleton())
+    klass->m_singleton= NULL;
   pfs->m_lock.allocated_to_free();
   DBUG_VOID_RETURN;
 }
@@ -1071,53 +1310,282 @@ void destroy_file(PFS_thread *thread, PFS_file *pfs)
 /**
   Create instrumentation for a table instance.
   @param share                        the table share
+  @param opening_thread               the opening thread
   @param identity                     the table address
   @return a table instance, or NULL
 */
-PFS_table* create_table(PFS_table_share *share, const void *identity)
+PFS_table* create_table(PFS_table_share *share, PFS_thread *opening_thread,
+                        const void *identity)
 {
-  PFS_scan scan;
-  uint random= randomized_index(identity, table_max);
+  static uint table_monotonic_index= 0;
+  uint index;
+  uint attempts= 0;
+  PFS_table *pfs;
   DBUG_ENTER("create_table");
 
-  for (scan.init(random, table_max);
+  while (++attempts <= table_max)
+  {
+    /* See create_mutex() */
+    index= PFS_atomic::add_u32(& table_monotonic_index, 1) % table_max;
+    pfs= table_array + index;
+
+    if (pfs->m_lock.is_free())
+    {
+      if (pfs->m_lock.free_to_dirty())
+      {
+        pfs->m_identity= identity;
+        pfs->m_share= share;
+        pfs->m_io_enabled= share->m_enabled &&
+          flag_global_instrumentation && global_table_io_class.m_enabled;
+        pfs->m_io_timed= share->m_timed && global_table_io_class.m_timed;
+        pfs->m_lock_enabled= share->m_enabled &&
+          flag_global_instrumentation && global_table_lock_class.m_enabled;
+        pfs->m_lock_timed= share->m_timed && global_table_lock_class.m_timed;
+        pfs->m_has_io_stats= false;
+        pfs->m_has_lock_stats= false;
+        share->inc_refcount();
+        pfs->m_table_stat.fast_reset();
+        pfs->m_thread_owner= opening_thread;
+        pfs->m_lock.dirty_to_allocated();
+        DBUG_RETURN(pfs);
+      }
+    }
+  }
+
+  table_lost++;
+  DBUG_RETURN(NULL);
+}
+
+void PFS_table::sanitized_aggregate(void)
+{
+  /*
+    This thread could be a TRUNCATE on an aggregated summary table,
+    and not own the table handle.
+  */
+  PFS_table_share *safe_share= sanitize_table_share(m_share);
+  PFS_thread *safe_thread= sanitize_thread(m_thread_owner);
+  if ((safe_share != NULL && safe_thread != NULL) &&
+      (m_has_io_stats || m_has_lock_stats))
+  {
+    safe_aggregate(& m_table_stat, safe_share, safe_thread);
+    m_has_io_stats= false;
+    m_has_lock_stats= false;
+  }
+}
+
+void PFS_table::sanitized_aggregate_io(void)
+{
+  PFS_table_share *safe_share= sanitize_table_share(m_share);
+  PFS_thread *safe_thread= sanitize_thread(m_thread_owner);
+  if (safe_share != NULL && safe_thread != NULL && m_has_io_stats)
+  {
+    safe_aggregate_io(& m_table_stat, safe_share, safe_thread);
+    m_has_io_stats= false;
+  }
+}
+
+void PFS_table::sanitized_aggregate_lock(void)
+{
+  PFS_table_share *safe_share= sanitize_table_share(m_share);
+  PFS_thread *safe_thread= sanitize_thread(m_thread_owner);
+  if (safe_share != NULL && safe_thread != NULL && m_has_lock_stats)
+  {
+    safe_aggregate_lock(& m_table_stat, safe_share, safe_thread);
+    m_has_lock_stats= false;
+  }
+}
+
+void PFS_table::safe_aggregate(PFS_table_stat *table_stat,
+                               PFS_table_share *table_share,
+                               PFS_thread *thread)
+{
+  DBUG_ASSERT(table_stat != NULL);
+  DBUG_ASSERT(table_share != NULL);
+  DBUG_ASSERT(thread != NULL);
+
+  if (flag_thread_instrumentation && thread->m_enabled)
+  {
+    PFS_single_stat *event_name_array;
+    uint index;
+    event_name_array= thread->m_instr_class_waits_stats;
+
+    /*
+      Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      (for wait/io/table/sql/handler)
+    */
+    index= global_table_io_class.m_event_name_index;
+    table_stat->sum_io(& event_name_array[index]);
+
+    /*
+      Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      (for wait/lock/table/sql/handler)
+    */
+    index= global_table_lock_class.m_event_name_index;
+    table_stat->sum_lock(& event_name_array[index]);
+  }
+
+  /* Aggregate to TABLE_IO_SUMMARY, TABLE_LOCK_SUMMARY */
+  table_share->m_table_stat.aggregate(table_stat);
+  table_stat->fast_reset();
+}
+
+void PFS_table::safe_aggregate_io(PFS_table_stat *table_stat,
+                                  PFS_table_share *table_share,
+                                  PFS_thread *thread)
+{
+  DBUG_ASSERT(table_stat != NULL);
+  DBUG_ASSERT(table_share != NULL);
+  DBUG_ASSERT(thread != NULL);
+
+  if (flag_thread_instrumentation && thread->m_enabled)
+  {
+    PFS_single_stat *event_name_array;
+    uint index;
+    event_name_array= thread->m_instr_class_waits_stats;
+
+    /*
+      Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      (for wait/io/table/sql/handler)
+    */
+    index= global_table_io_class.m_event_name_index;
+    table_stat->sum_io(& event_name_array[index]);
+  }
+
+  /* Aggregate to TABLE_IO_SUMMARY */
+  table_share->m_table_stat.aggregate_io(table_stat);
+  table_stat->fast_reset_io();
+}
+
+void PFS_table::safe_aggregate_lock(PFS_table_stat *table_stat,
+                                    PFS_table_share *table_share,
+                                    PFS_thread *thread)
+{
+  DBUG_ASSERT(table_stat != NULL);
+  DBUG_ASSERT(table_share != NULL);
+  DBUG_ASSERT(thread != NULL);
+
+  if (flag_thread_instrumentation && thread->m_enabled)
+  {
+    PFS_single_stat *event_name_array;
+    uint index;
+    event_name_array= thread->m_instr_class_waits_stats;
+
+    /*
+      Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      (for wait/lock/table/sql/handler)
+    */
+    index= global_table_lock_class.m_event_name_index;
+    table_stat->sum_lock(& event_name_array[index]);
+  }
+
+  /* Aggregate to TABLE_LOCK_SUMMARY */
+  table_share->m_table_stat.aggregate_lock(table_stat);
+  table_stat->fast_reset_lock();
+}
+
+/**
+  Destroy instrumentation for a table instance.
+  @param pfs                          the table to destroy
+*/
+void destroy_table(PFS_table *pfs)
+{
+  DBUG_ENTER("destroy_table");
+
+  DBUG_ASSERT(pfs != NULL);
+  pfs->m_share->dec_refcount();
+  pfs->m_lock.allocated_to_free();
+  DBUG_VOID_RETURN;
+}
+
+/**
+  Create instrumentation for a socket instance.
+  @param klass                        the socket class
+  @param identity                     the socket descriptor
+  @return a socket instance, or NULL
+*/
+PFS_socket* create_socket(PFS_socket_class *klass, const void *identity)
+{
+  PFS_scan scan;
+  DBUG_ENTER("create_socket");
+
+  /**
+    Unlike other instrumented objects, there is no socket 'object' to use as a
+    unique identifier. Instead, a pointer to the PFS_socket object will be used
+    to identify this socket instance. The socket descriptor will be used to
+    seed the the random index assignment.
+    */
+  my_socket fd= likely(identity != NULL) ?
+                *(reinterpret_cast<const my_socket*>(identity)) : 0;
+  my_ptrdiff_t ptr= fd;
+  uint random= randomized_index((const void *)ptr, socket_max);
+
+  for (scan.init(random, socket_max);
        scan.has_pass();
        scan.next_pass())
   {
-    PFS_table *pfs= table_array + scan.first();
-    PFS_table *pfs_last= table_array + scan.last();
+    PFS_socket *pfs= socket_array + scan.first();
+    PFS_socket *pfs_last= socket_array + scan.last();
     for ( ; pfs < pfs_last; pfs++)
     {
       if (pfs->m_lock.is_free())
       {
         if (pfs->m_lock.free_to_dirty())
         {
-          pfs->m_identity= identity;
-          pfs->m_share= share;
-          pfs->m_wait_stat.m_control_flag=
-            &flag_events_waits_summary_by_instance;
-          pfs->m_wait_stat.m_parent= &share->m_wait_stat;
-          reset_single_stat_link(&pfs->m_wait_stat);
+          pfs->m_fd= fd;
+          pfs->m_identity= pfs;
+          pfs->m_class= klass;
+          pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+          pfs->m_timed= klass->m_timed;
+          pfs->m_idle= false;
+          pfs->m_socket_stat.reset();
           pfs->m_lock.dirty_to_allocated();
+          pfs->m_thread_owner= NULL;
+          if (klass->is_singleton())
+            klass->m_singleton= pfs;
           DBUG_RETURN(pfs);
         }
       }
     }
   }
 
-  table_lost++;
+  socket_lost++;
   DBUG_RETURN(NULL);
 }
 
 /**
-  Destroy instrumentation for a table instance.
-  @param pfs                          the table to destroy
+  Destroy instrumentation for a socket instance.
+  @param pfs                          the socket to destroy
 */
-void destroy_table(PFS_table *pfs)
+void destroy_socket(PFS_socket *pfs)
 {
-  DBUG_ENTER("destroy_table");
-
   DBUG_ASSERT(pfs != NULL);
+  PFS_socket_class *klass= pfs->m_class;
+  DBUG_ENTER("destroy_socket");
+
+  /* Aggregate to SOCKET_SUMMARY_BY_EVENT_NAME */
+  klass->m_socket_stat.m_io_stat.aggregate(&pfs->m_socket_stat.m_io_stat);
+
+  if (klass->is_singleton())
+    klass->m_singleton= NULL;
+
+  /* Aggregate to EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME */
+  PFS_thread *thread= pfs->m_thread_owner;
+  if (thread != NULL)
+  {
+    PFS_single_stat *event_name_array;
+    event_name_array= thread->m_instr_class_waits_stats;
+    uint index= pfs->m_class->m_event_name_index;
+
+    /* Combine stats for all operations */
+    PFS_single_stat stat;
+    pfs->m_socket_stat.m_io_stat.sum_waits(&stat);
+    event_name_array[index].aggregate(&stat);
+  }
+
+  pfs->m_socket_stat.reset();
+  pfs->m_thread_owner= NULL;
+  pfs->m_fd= 0;
+  pfs->m_addr_len= 0;
   pfs->m_lock.allocated_to_free();
   DBUG_VOID_RETURN;
 }
@@ -1129,7 +1597,7 @@ static void reset_mutex_waits_by_instance(void)
   DBUG_ENTER("reset_mutex_waits_by_instance");
 
   for ( ; pfs < pfs_last; pfs++)
-    reset_single_stat_link(&pfs->m_wait_stat);
+    pfs->m_wait_stat.reset();
   DBUG_VOID_RETURN;
 }
 
@@ -1140,7 +1608,7 @@ static void reset_rwlock_waits_by_instance(void)
   DBUG_ENTER("reset_rwlock_waits_by_instance");
 
   for ( ; pfs < pfs_last; pfs++)
-    reset_single_stat_link(&pfs->m_wait_stat);
+    pfs->m_wait_stat.reset();
   DBUG_VOID_RETURN;
 }
 
@@ -1151,7 +1619,7 @@ static void reset_cond_waits_by_instance(void)
   DBUG_ENTER("reset_cond_waits_by_instance");
 
   for ( ; pfs < pfs_last; pfs++)
-    reset_single_stat_link(&pfs->m_wait_stat);
+    pfs->m_wait_stat.reset();
   DBUG_VOID_RETURN;
 }
 
@@ -1162,20 +1630,27 @@ static void reset_file_waits_by_instance(void)
   DBUG_ENTER("reset_file_waits_by_instance");
 
   for ( ; pfs < pfs_last; pfs++)
-    reset_single_stat_link(&pfs->m_wait_stat);
+    pfs->m_file_stat.reset();
   DBUG_VOID_RETURN;
 }
 
+static void reset_socket_waits_by_instance(void)
+{
+  PFS_socket *pfs= socket_array;
+  PFS_socket *pfs_last= socket_array + socket_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+    pfs->m_socket_stat.reset();
+}
+
 /** Reset the wait statistics per object instance. */
 void reset_events_waits_by_instance(void)
 {
-  DBUG_ENTER("reset_events_waits_by_instance");
-
   reset_mutex_waits_by_instance();
   reset_rwlock_waits_by_instance();
   reset_cond_waits_by_instance();
   reset_file_waits_by_instance();
-  DBUG_VOID_RETURN;
+  reset_socket_waits_by_instance();
 }
 
 /** Reset the io statistics per file instance. */
@@ -1186,8 +1661,595 @@ void reset_file_instance_io(void)
   DBUG_ENTER("reset_file_instance_io");
 
   for ( ; pfs < pfs_last; pfs++)
-    reset_file_stat(&pfs->m_file_stat);
+    pfs->m_file_stat.m_io_stat.reset();
+  DBUG_VOID_RETURN;
+}
+
+/** Reset the io statistics per socket instance. */
+void reset_socket_instance_io(void)
+{
+  PFS_socket *pfs= socket_array;
+  PFS_socket *pfs_last= socket_array + socket_max;
+  DBUG_ENTER("reset_socket_instance_io");
+
+  for ( ; pfs < pfs_last; pfs++)
+    pfs->m_socket_stat.m_io_stat.reset();
   DBUG_VOID_RETURN;
 }
 
+void reset_global_wait_stat()
+{
+  PFS_single_stat *stat= global_instr_class_waits_array;
+  PFS_single_stat *stat_last= global_instr_class_waits_array + wait_class_max;
+
+  for ( ; stat < stat_last; stat++)
+    stat->reset();
+}
+
+void aggregate_all_event_names(PFS_single_stat *from_array,
+                               PFS_single_stat *to_array)
+{
+  PFS_single_stat *from;
+  PFS_single_stat *from_last;
+  PFS_single_stat *to;
+
+  from= from_array;
+  from_last= from_array + wait_class_max;
+  to= to_array;
+
+  for ( ; from < from_last ; from++, to++)
+  {
+    if (from->m_count > 0)
+    {
+      to->aggregate(from);
+      from->reset();
+    }
+  }
+}
+
+void aggregate_all_event_names(PFS_single_stat *from_array,
+                               PFS_single_stat *to_array_1,
+                               PFS_single_stat *to_array_2)
+{
+  PFS_single_stat *from;
+  PFS_single_stat *from_last;
+  PFS_single_stat *to_1;
+  PFS_single_stat *to_2;
+
+  from= from_array;
+  from_last= from_array + wait_class_max;
+  to_1= to_array_1;
+  to_2= to_array_2;
+
+  for ( ; from < from_last ; from++, to_1++, to_2++)
+  {
+    if (from->m_count > 0)
+    {
+      to_1->aggregate(from);
+      to_2->aggregate(from);
+      from->reset();
+    }
+  }
+}
+
+void aggregate_all_stages(PFS_stage_stat *from_array,
+                          PFS_stage_stat *to_array)
+{
+  PFS_stage_stat *from;
+  PFS_stage_stat *from_last;
+  PFS_stage_stat *to;
+
+  from= from_array;
+  from_last= from_array + stage_class_max;
+  to= to_array;
+
+  for ( ; from < from_last ; from++, to++)
+  {
+    if (from->m_timer1_stat.m_count > 0)
+    {
+      to->aggregate(from);
+      from->reset();
+    }
+  }
+}
+
+void aggregate_all_stages(PFS_stage_stat *from_array,
+                          PFS_stage_stat *to_array_1,
+                          PFS_stage_stat *to_array_2)
+{
+  PFS_stage_stat *from;
+  PFS_stage_stat *from_last;
+  PFS_stage_stat *to_1;
+  PFS_stage_stat *to_2;
+
+  from= from_array;
+  from_last= from_array + stage_class_max;
+  to_1= to_array_1;
+  to_2= to_array_2;
+
+  for ( ; from < from_last ; from++, to_1++, to_2++)
+  {
+    if (from->m_timer1_stat.m_count > 0)
+    {
+      to_1->aggregate(from);
+      to_2->aggregate(from);
+      from->reset();
+    }
+  }
+}
+
+void aggregate_all_statements(PFS_statement_stat *from_array,
+                              PFS_statement_stat *to_array)
+{
+  PFS_statement_stat *from;
+  PFS_statement_stat *from_last;
+  PFS_statement_stat *to;
+
+  from= from_array;
+  from_last= from_array + statement_class_max;
+  to= to_array;
+
+  for ( ; from < from_last ; from++, to++)
+  {
+    if (from->m_timer1_stat.m_count > 0)
+    {
+      to->aggregate(from);
+      from->reset();
+    }
+  }
+}
+
+void aggregate_all_statements(PFS_statement_stat *from_array,
+                              PFS_statement_stat *to_array_1,
+                              PFS_statement_stat *to_array_2)
+{
+  PFS_statement_stat *from;
+  PFS_statement_stat *from_last;
+  PFS_statement_stat *to_1;
+  PFS_statement_stat *to_2;
+
+  from= from_array;
+  from_last= from_array + statement_class_max;
+  to_1= to_array_1;
+  to_2= to_array_2;
+
+  for ( ; from < from_last ; from++, to_1++, to_2++)
+  {
+    if (from->m_timer1_stat.m_count > 0)
+    {
+      to_1->aggregate(from);
+      to_2->aggregate(from);
+      from->reset();
+    }
+  }
+}
+
+void aggregate_thread_stats(PFS_thread *thread)
+{
+  if (likely(thread->m_account != NULL))
+  {
+    thread->m_account->m_disconnected_count++;
+    return;
+  }
+
+  if (thread->m_user != NULL)
+    thread->m_user->m_disconnected_count++;
+
+  if (thread->m_host != NULL)
+    thread->m_host->m_disconnected_count++;
+
+  /* There is no global table for connections statistics. */
+  return;
+}
+
+void aggregate_thread(PFS_thread *thread)
+{
+  aggregate_thread_waits(thread);
+  aggregate_thread_stages(thread);
+  aggregate_thread_statements(thread);
+  aggregate_thread_stats(thread);
+}
+
+void aggregate_thread_waits(PFS_thread *thread)
+{
+  if (likely(thread->m_account != NULL))
+  {
+    DBUG_ASSERT(thread->m_user == NULL);
+    DBUG_ASSERT(thread->m_host == NULL);
+    DBUG_ASSERT(thread->m_account->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      to EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+    */
+    aggregate_all_event_names(thread->m_instr_class_waits_stats,
+                              thread->m_account->m_instr_class_waits_stats);
+
+    return;
+  }
+
+  if ((thread->m_user != NULL) && (thread->m_host != NULL))
+  {
+    DBUG_ASSERT(thread->m_user->get_refcount() > 0);
+    DBUG_ASSERT(thread->m_host->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME to:
+      -  EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_event_names(thread->m_instr_class_waits_stats,
+                              thread->m_user->m_instr_class_waits_stats,
+                              thread->m_host->m_instr_class_waits_stats);
+    return;
+  }
+
+  if (thread->m_user != NULL)
+  {
+    DBUG_ASSERT(thread->m_user->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      to EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME, directly.
+    */
+    aggregate_all_event_names(thread->m_instr_class_waits_stats,
+                              thread->m_user->m_instr_class_waits_stats);
+    return;
+  }
+
+  if (thread->m_host != NULL)
+  {
+    DBUG_ASSERT(thread->m_host->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      to EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME, directly.
+    */
+    aggregate_all_event_names(thread->m_instr_class_waits_stats,
+                              thread->m_host->m_instr_class_waits_stats);
+    return;
+  }
+
+  /* Orphan thread, clean the waits stats. */
+  thread->reset_waits_stats();
+}
+
+void aggregate_thread_stages(PFS_thread *thread)
+{
+  if (likely(thread->m_account != NULL))
+  {
+    DBUG_ASSERT(thread->m_user == NULL);
+    DBUG_ASSERT(thread->m_host == NULL);
+    DBUG_ASSERT(thread->m_account->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      to EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+    */
+    aggregate_all_stages(thread->m_instr_class_stages_stats,
+                         thread->m_account->m_instr_class_stages_stats);
+
+    return;
+  }
+
+  if ((thread->m_user != NULL) && (thread->m_host != NULL))
+  {
+    DBUG_ASSERT(thread->m_user->get_refcount() > 0);
+    DBUG_ASSERT(thread->m_host->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME to:
+      -  EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_stages(thread->m_instr_class_stages_stats,
+                         thread->m_user->m_instr_class_stages_stats,
+                         thread->m_host->m_instr_class_stages_stats);
+    return;
+  }
+
+  if (thread->m_user != NULL)
+  {
+    DBUG_ASSERT(thread->m_user->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME to:
+      -  EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_stages(thread->m_instr_class_stages_stats,
+                         thread->m_user->m_instr_class_stages_stats,
+                         global_instr_class_stages_array);
+    return;
+  }
+
+  if (thread->m_host != NULL)
+  {
+    DBUG_ASSERT(thread->m_host->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      to EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME, directly.
+    */
+    aggregate_all_stages(thread->m_instr_class_stages_stats,
+                         thread->m_host->m_instr_class_stages_stats);
+    return;
+  }
+
+  /*
+    Aggregate EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME
+    to EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME.
+  */
+  aggregate_all_stages(thread->m_instr_class_stages_stats,
+                       global_instr_class_stages_array);
+}
+
+void aggregate_thread_statements(PFS_thread *thread)
+{
+  if (likely(thread->m_account != NULL))
+  {
+    DBUG_ASSERT(thread->m_user == NULL);
+    DBUG_ASSERT(thread->m_host == NULL);
+    DBUG_ASSERT(thread->m_account->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      to EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+    */
+    aggregate_all_statements(thread->m_instr_class_statements_stats,
+                             thread->m_account->m_instr_class_statements_stats);
+
+    return;
+  }
+
+  if ((thread->m_user != NULL) && (thread->m_host != NULL))
+  {
+    DBUG_ASSERT(thread->m_user->get_refcount() > 0);
+    DBUG_ASSERT(thread->m_host->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_STATEMENT_SUMMARY_BY_THREAD_BY_EVENT_NAME to:
+      -  EVENTS_STATEMENT_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_STATEMENT_SUMMARY_BY_HOST_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_statements(thread->m_instr_class_statements_stats,
+                             thread->m_user->m_instr_class_statements_stats,
+                             thread->m_host->m_instr_class_statements_stats);
+    return;
+  }
+
+  if (thread->m_user != NULL)
+  {
+    DBUG_ASSERT(thread->m_user->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME to:
+      -  EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME
+      -  EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME
+      in parallel.
+    */
+    aggregate_all_statements(thread->m_instr_class_statements_stats,
+                             thread->m_user->m_instr_class_statements_stats,
+                             global_instr_class_statements_array);
+    return;
+  }
+
+  if (thread->m_host != NULL)
+  {
+    DBUG_ASSERT(thread->m_host->get_refcount() > 0);
+
+    /*
+      Aggregate EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+      to EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME, directly.
+    */
+    aggregate_all_statements(thread->m_instr_class_statements_stats,
+                             thread->m_host->m_instr_class_statements_stats);
+    return;
+  }
+
+  /*
+    Aggregate EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME
+    to EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME.
+  */
+  aggregate_all_statements(thread->m_instr_class_statements_stats,
+                           global_instr_class_statements_array);
+}
+
+void clear_thread_account(PFS_thread *thread)
+{
+  if (thread->m_account != NULL)
+  {
+    thread->m_account->release();
+    thread->m_account= NULL;
+  }
+
+  if (thread->m_user != NULL)
+  {
+    thread->m_user->release();
+    thread->m_user= NULL;
+  }
+
+  if (thread->m_host != NULL)
+  {
+    thread->m_host->release();
+    thread->m_host= NULL;
+  }
+}
+
+void set_thread_account(PFS_thread *thread)
+{
+  DBUG_ASSERT(thread->m_account == NULL);
+  DBUG_ASSERT(thread->m_user == NULL);
+  DBUG_ASSERT(thread->m_host == NULL);
+
+  thread->m_account= find_or_create_account(thread,
+                                                thread->m_username,
+                                                thread->m_username_length,
+                                                thread->m_hostname,
+                                                thread->m_hostname_length);
+
+  if ((thread->m_account == NULL) && (thread->m_username_length > 0))
+    thread->m_user= find_or_create_user(thread,
+                                        thread->m_username,
+                                        thread->m_username_length);
+
+  if ((thread->m_account == NULL) && (thread->m_hostname_length > 0))
+    thread->m_host= find_or_create_host(thread,
+                                        thread->m_hostname,
+                                        thread->m_hostname_length);
+}
+
+void update_mutex_derived_flags()
+{
+  PFS_mutex *pfs= mutex_array;
+  PFS_mutex *pfs_last= mutex_array + mutex_max;
+  PFS_mutex_class *klass;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    klass= sanitize_mutex_class(pfs->m_class);
+    if (likely(klass != NULL))
+    {
+      pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+      pfs->m_timed= klass->m_timed;
+    }
+    else
+    {
+      pfs->m_enabled= false;
+      pfs->m_timed= false;
+    }
+  }
+}
+
+void update_rwlock_derived_flags()
+{
+  PFS_rwlock *pfs= rwlock_array;
+  PFS_rwlock *pfs_last= rwlock_array + rwlock_max;
+  PFS_rwlock_class *klass;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    klass= sanitize_rwlock_class(pfs->m_class);
+    if (likely(klass != NULL))
+    {
+      pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+      pfs->m_timed= klass->m_timed;
+    }
+    else
+    {
+      pfs->m_enabled= false;
+      pfs->m_timed= false;
+    }
+  }
+}
+
+void update_cond_derived_flags()
+{
+  PFS_cond *pfs= cond_array;
+  PFS_cond *pfs_last= cond_array + cond_max;
+  PFS_cond_class *klass;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    klass= sanitize_cond_class(pfs->m_class);
+    if (likely(klass != NULL))
+    {
+      pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+      pfs->m_timed= klass->m_timed;
+    }
+    else
+    {
+      pfs->m_enabled= false;
+      pfs->m_timed= false;
+    }
+  }
+}
+
+void update_file_derived_flags()
+{
+  PFS_file *pfs= file_array;
+  PFS_file *pfs_last= file_array + file_max;
+  PFS_file_class *klass;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    klass= sanitize_file_class(pfs->m_class);
+    if (likely(klass != NULL))
+    {
+      pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+      pfs->m_timed= klass->m_timed;
+    }
+    else
+    {
+      pfs->m_enabled= false;
+      pfs->m_timed= false;
+    }
+  }
+}
+
+void update_table_derived_flags()
+{
+  PFS_table *pfs= table_array;
+  PFS_table *pfs_last= table_array + table_max;
+  PFS_table_share *share;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    share= sanitize_table_share(pfs->m_share);
+    if (likely(share != NULL))
+    {
+      pfs->m_io_enabled= share->m_enabled &&
+        flag_global_instrumentation && global_table_io_class.m_enabled;
+      pfs->m_io_timed= share->m_timed && global_table_io_class.m_timed;
+      pfs->m_lock_enabled= share->m_enabled &&
+        flag_global_instrumentation && global_table_lock_class.m_enabled;
+      pfs->m_lock_timed= share->m_timed && global_table_lock_class.m_timed;
+    }
+    else
+    {
+      pfs->m_io_enabled= false;
+      pfs->m_io_timed= false;
+      pfs->m_lock_enabled= false;
+      pfs->m_lock_timed= false;
+    }
+  }
+}
+
+void update_socket_derived_flags()
+{
+  PFS_socket *pfs= socket_array;
+  PFS_socket *pfs_last= socket_array + socket_max;
+  PFS_socket_class *klass;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    klass= sanitize_socket_class(pfs->m_class);
+    if (likely(klass != NULL))
+    {
+      pfs->m_enabled= klass->m_enabled && flag_global_instrumentation;
+      pfs->m_timed= klass->m_timed;
+    }
+    else
+    {
+      pfs->m_enabled= false;
+      pfs->m_timed= false;
+    }
+  }
+}
+
+void update_instruments_derived_flags()
+{
+  update_mutex_derived_flags();
+  update_rwlock_derived_flags();
+  update_cond_derived_flags();
+  update_file_derived_flags();
+  update_table_derived_flags();
+  update_socket_derived_flags();
+  /* nothing for stages and statements (no instances) */
+}
+
 /** @} */
diff --git a/storage/perfschema/pfs_instr.h b/storage/perfschema/pfs_instr.h
index 2f6b729628e..b579c1d7902 100644
--- a/storage/perfschema/pfs_instr.h
+++ b/storage/perfschema/pfs_instr.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -21,11 +21,28 @@
   Performance schema instruments (declarations).
 */
 
+struct PFS_mutex_class;
+struct PFS_rwlock_class;
+struct PFS_cond_class;
+struct PFS_file_class;
+struct PFS_table_share;
+struct PFS_thread_class;
+struct PFS_socket_class;
+
+#ifdef __WIN__
+#include <winsock2.h>
+#else
+#include <arpa/inet.h>
+#endif
 #include "pfs_lock.h"
+#include "pfs_stat.h"
 #include "pfs_instr_class.h"
 #include "pfs_events_waits.h"
+#include "pfs_events_stages.h"
+#include "pfs_events_statements.h"
 #include "pfs_server.h"
 #include "lf.h"
+#include "pfs_con_slice.h"
 
 /**
   @addtogroup Performance_schema_buffers
@@ -33,13 +50,21 @@
 */
 
 struct PFS_thread;
+struct PFS_host;
+struct PFS_user;
+struct PFS_account;
 
+/** Base structure for wait instruments. */
 struct PFS_instr
 {
   /** Internal lock. */
   pfs_lock m_lock;
-  /** Instrument wait statistics chain. */
-  PFS_single_stat_chain m_wait_stat;
+  /** Enabled flag. */
+  bool m_enabled;
+  /** Timed flag. */
+  bool m_timed;
+  /** Instrument wait statistics. */
+  PFS_single_stat m_wait_stat;
 };
 
 /** Instrumented mutex implementation. @see PSI_mutex. */
@@ -49,11 +74,13 @@ struct PFS_mutex : public PFS_instr
   const void *m_identity;
   /** Mutex class. */
   PFS_mutex_class *m_class;
+  /** Instrument wait statistics. */
+  PFS_single_stat m_wait_stat;
   /**
-    Mutex lock usage statistics chain.
+    Mutex lock usage statistics.
     This statistic is not exposed in user visible tables yet.
   */
-  PFS_single_stat_chain m_lock_stat;
+  PFS_single_stat m_lock_stat;
   /** Current owner. */
   PFS_thread *m_owner;
   /**
@@ -70,16 +97,18 @@ struct PFS_rwlock : public PFS_instr
   const void *m_identity;
   /** RWLock class. */
   PFS_rwlock_class *m_class;
+  /** Instrument wait statistics. */
+  PFS_single_stat m_wait_stat;
   /**
-    RWLock read lock usage statistics chain.
+    RWLock read lock usage statistics.
     This statistic is not exposed in user visible tables yet.
   */
-  PFS_single_stat_chain m_read_lock_stat;
+  PFS_single_stat m_read_lock_stat;
   /**
-    RWLock write lock usage statistics chain.
+    RWLock write lock usage statistics.
     This statistic is not exposed in user visible tables yet.
   */
-  PFS_single_stat_chain m_write_lock_stat;
+  PFS_single_stat m_write_lock_stat;
   /** Current writer thread. */
   PFS_thread *m_writer;
   /** Current count of readers. */
@@ -103,6 +132,8 @@ struct PFS_cond : public PFS_instr
   const void *m_identity;
   /** Condition class. */
   PFS_cond_class *m_class;
+  /** Instrument wait statistics. */
+  PFS_single_stat m_wait_stat;
   /** Condition instance usage statistics. */
   PFS_cond_stat m_cond_stat;
 };
@@ -110,30 +141,173 @@ struct PFS_cond : public PFS_instr
 /** Instrumented File and FILE implementation. @see PSI_file. */
 struct PFS_file : public PFS_instr
 {
+  uint32 get_version()
+  { return m_lock.get_version(); }
+
+  /** File identity */
+  const void *m_identity;
   /** File name. */
   char m_filename[FN_REFLEN];
   /** File name length in bytes. */
   uint m_filename_length;
   /** File class. */
   PFS_file_class *m_class;
+  /** Instrument wait statistics. */
+  PFS_single_stat m_wait_stat;
   /** File usage statistics. */
   PFS_file_stat m_file_stat;
 };
 
 /** Instrumented table implementation. @see PSI_table. */
-struct PFS_table : public PFS_instr
+struct PFS_table
 {
+  /**
+    True if table io instrumentation is enabled.
+    This flag is computed.
+  */
+  bool m_io_enabled;
+  /**
+    True if table lock instrumentation is enabled.
+    This flag is computed.
+  */
+  bool m_lock_enabled;
+  /**
+    True if table io instrumentation is timed.
+    This flag is computed.
+  */
+  bool m_io_timed;
+  /**
+    True if table lock instrumentation is timed.
+    This flag is computed.
+  */
+  bool m_lock_timed;
+
+  /** True if table io statistics have been collected. */
+  bool m_has_io_stats;
+
+  /** True if table lock statistics have been collected. */
+  bool m_has_lock_stats;
+
+public:
+  /**
+    Aggregate this table handle statistics to the parents.
+    Only use this method for handles owned by the calling code.
+    @sa sanitized_aggregate.
+  */
+  void aggregate(void)
+  {
+    if (likely((m_thread_owner != NULL) && (m_has_io_stats || m_has_lock_stats)))
+    {
+      safe_aggregate(& m_table_stat, m_share, m_thread_owner);
+      m_has_io_stats= false;
+      m_has_lock_stats= false;
+    }
+  }
+
+  /**
+    Aggregate this table handle statistics to the parents.
+    This method is safe to call on handles not owned by the calling code.
+    @sa aggregate
+    @sa sanitized_aggregate_io
+    @sa sanitized_aggregate_lock
+  */
+  void sanitized_aggregate(void);
+
+  /**
+    Aggregate this table handle io statistics to the parents.
+    This method is safe to call on handles not owned by the calling code.
+  */
+  void sanitized_aggregate_io(void);
+
+  /**
+    Aggregate this table handle lock statistics to the parents.
+    This method is safe to call on handles not owned by the calling code.
+  */
+  void sanitized_aggregate_lock(void);
+
+  /** Internal lock. */
+  pfs_lock m_lock;
+  /** Owner. */
+  PFS_thread *m_thread_owner;
   /** Table share. */
   PFS_table_share *m_share;
   /** Table identity, typically a handler. */
   const void *m_identity;
+  /** Table statistics. */
+  PFS_table_stat m_table_stat;
+
+private:
+  static void safe_aggregate(PFS_table_stat *stat,
+                             PFS_table_share *safe_share,
+                             PFS_thread *safe_thread);
+  static void safe_aggregate_io(PFS_table_stat *stat,
+                                PFS_table_share *safe_share,
+                                PFS_thread *safe_thread);
+  static void safe_aggregate_lock(PFS_table_stat *stat,
+                                  PFS_table_share *safe_share,
+                                  PFS_thread *safe_thread);
+};
+
+/** Instrumented socket implementation. @see PSI_socket. */
+struct PFS_socket : public PFS_instr
+{
+  uint32 get_version()
+  { return m_lock.get_version(); }
+
+  /** Socket identity, typically int */
+  const void *m_identity;
+  /** Owning thread, if applicable */
+  PFS_thread *m_thread_owner;
+  /** Socket file descriptor */
+  uint m_fd;
+  /** Raw socket address */
+  struct sockaddr_storage  m_sock_addr;
+  /** Length of address */
+  socklen_t m_addr_len;
+  /** Idle flag. */
+  bool m_idle;
+  /** Socket class. */
+  PFS_socket_class *m_class;
+  /** Socket usage statistics. */
+  PFS_socket_stat m_socket_stat;
 };
 
 /**
-  @def LOCKER_STACK_SIZE
+  @def WAIT_STACK_LOGICAL_SIZE
   Maximum number of nested waits.
+  Some waits, such as:
+  - "wait/io/table/sql/handler"
+  - "wait/lock/table/sql/handler"
+  are implemented by calling code in a storage engine,
+  that can cause nested waits (file io, mutex, ...)
+  Because of partitioned tables, a table io event (on the whole table)
+  can contain a nested table io event (on a partition).
+  Because of additional debug instrumentation,
+  waiting on what looks like a "mutex" (safe_mutex, innodb sync0sync, ...)
+  can cause nested waits to be recorded.
+  For example, a wait on innodb mutexes can lead to:
+  - wait/sync/mutex/innobase/some_mutex
+    - wait/sync/mutex/innobase/sync0sync
+      - wait/sync/mutex/innobase/os0sync
+  The max depth of the event stack must be sufficient
+  for these low level details to be visible.
 */
-#define LOCKER_STACK_SIZE 3
+#define WAIT_STACK_LOGICAL_SIZE 5
+/**
+  @def WAIT_STACK_BOTTOM
+  Maximum number dummy waits records.
+  One dummy record is reserved for the parent stage / statement,
+  at the bottom of the wait stack.
+*/
+#define WAIT_STACK_BOTTOM 1
+/**
+  @def WAIT_STACK_SIZE
+  Physical size of the waits stack
+*/
+#define WAIT_STACK_SIZE (WAIT_STACK_BOTTOM + WAIT_STACK_LOGICAL_SIZE)
+
+/** Max size of the statements stack. */
+extern uint statement_stack_max;
 
 /**
   @def PFS_MAX_ALLOC_RETRY
@@ -142,6 +316,7 @@ struct PFS_table : public PFS_instr
 */
 #define PFS_MAX_ALLOC_RETRY 1000
 
+/** The maximun number of passes in @sa PFS_scan. */
 #define PFS_MAX_SCAN_PASS 2
 
 /**
@@ -155,59 +330,113 @@ struct PFS_table : public PFS_instr
 struct PFS_scan
 {
 public:
+  /**
+    Initialize a new scan.
+    @param random a random index to start from
+    @param max_size the max size of the interval to scan
+  */
   void init(uint random, uint max_size);
 
+  /**
+    Predicate, has a next pass.
+    @return true if there is a next pass to perform.
+  */
   bool has_pass() const
   { return (m_pass < m_pass_max); }
 
+  /**
+    Iterator, proceed to the next pass.
+  */
   void next_pass()
   { m_pass++; }
   
+  /** First index for this pass. */
   uint first() const
   { return m_first[m_pass]; }
 
+  /** Last index for this pass. */
   uint last() const
   { return m_last[m_pass]; }
 
 private:
+  /** Current pass. */
   uint m_pass;
+  /** Maximum number of passes. */
   uint m_pass_max;
+  /** First element for each pass. */
   uint m_first[PFS_MAX_SCAN_PASS];
+  /** Last element for each pass. */
   uint m_last[PFS_MAX_SCAN_PASS];
 };
 
 
 /** Instrumented thread implementation. @see PSI_thread. */
-struct PFS_thread
+struct PFS_thread : PFS_connection_slice
 {
+  static PFS_thread* get_current_thread(void);
+
+  /** Thread instrumentation flag. */
+  bool m_enabled;
+  /** Current wait event in the event stack. */
+  PFS_events_waits *m_events_waits_current;
+  /** Event ID counter */
+  ulonglong m_event_id;
   /** Internal lock. */
   pfs_lock m_lock;
   /** Pins for filename_hash. */
   LF_PINS *m_filename_hash_pins;
   /** Pins for table_share_hash. */
   LF_PINS *m_table_share_hash_pins;
-  /** Event ID counter */
-  ulonglong m_event_id;
-  /** Thread instrumentation flag. */
-  bool m_enabled;
+  /** Pins for setup_actor_hash. */
+  LF_PINS *m_setup_actor_hash_pins;
+  /** Pins for setup_object_hash. */
+  LF_PINS *m_setup_object_hash_pins;
+  /** Pins for host_hash. */
+  LF_PINS *m_host_hash_pins;
+  /** Pins for user_hash. */
+  LF_PINS *m_user_hash_pins;
+  /** Pins for account_hash. */
+  LF_PINS *m_account_hash_pins;
+  /** Pins for digest_hash. */
+  LF_PINS *m_digest_hash_pins;
   /** Internal thread identifier, unique. */
   ulong m_thread_internal_id;
+  /** Parent internal thread identifier. */
+  ulong m_parent_thread_internal_id;
   /** External (SHOW PROCESSLIST) thread identifier, not unique. */
   ulong m_thread_id;
   /** Thread class. */
   PFS_thread_class *m_class;
-  /** Size of @c m_wait_locker_stack. */
-  uint m_wait_locker_count;
   /**
-    Stack of wait lockers.
-    This member holds the data for the table
-    PERFORMANCE_SCHEMA.EVENTS_WAITS_CURRENT.
-    For most locks, only 1 wait locker is used at a given time.
-    For composite locks, several records are needed:
-    - 1 for a 'logical' wait (for example on the GLOBAL READ LOCK state)
-    - 1 for a 'physical' wait (for example on COND_refresh)
+    Stack of events waits.
+    This member holds the data for the table PERFORMANCE_SCHEMA.EVENTS_WAITS_CURRENT.
+    Note that stack[0] is a dummy record that represents the parent stage/statement.
+    For example, assuming the following tree:
+    - STAGE ID 100
+      - WAIT ID 101, parent STAGE 100
+        - WAIT ID 102, parent wait 101
+    the data in the stack will be:
+    stack[0].m_event_id= 100, set by the stage instrumentation
+    stack[0].m_event_type= STAGE, set by the stage instrumentation
+    stack[0].m_nesting_event_id= unused
+    stack[0].m_nesting_event_type= unused
+    stack[1].m_event_id= 101
+    stack[1].m_event_type= WAIT
+    stack[1].m_nesting_event_id= stack[0].m_event_id= 100
+    stack[1].m_nesting_event_type= stack[0].m_event_type= STAGE
+    stack[2].m_event_id= 102
+    stack[2].m_event_type= WAIT
+    stack[2].m_nesting_event_id= stack[1].m_event_id= 101
+    stack[2].m_nesting_event_type= stack[1].m_event_type= WAIT
+
+    The whole point of the stack[0] record is to allow this optimization
+    in the code, in the instrumentation for wait events:
+      wait->m_nesting_event_id= (wait-1)->m_event_id;
+      wait->m_nesting_event_type= (wait-1)->m_event_type;
+    This code works for both the top level wait, and nested waits,
+    and works without if conditions, which helps performances.
   */
-  PFS_wait_locker m_wait_locker_stack[LOCKER_STACK_SIZE];
+  PFS_events_waits m_events_waits_stack[WAIT_STACK_SIZE];
   /** True if the circular buffer @c m_waits_history is full. */
   bool m_waits_history_full;
   /** Current index in the circular buffer @c m_waits_history. */
@@ -218,32 +447,75 @@ struct PFS_thread
     PERFORMANCE_SCHEMA.EVENTS_WAITS_HISTORY.
   */
   PFS_events_waits *m_waits_history;
+
+  /** True if the circular buffer @c m_stages_history is full. */
+  bool m_stages_history_full;
+  /** Current index in the circular buffer @c m_stages_history. */
+  uint m_stages_history_index;
   /**
-    Per thread waits aggregated statistics.
+    Stages history circular buffer.
     This member holds the data for the table
-    PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME.
+    PERFORMANCE_SCHEMA.EVENTS_STAGES_HISTORY.
   */
-  PFS_single_stat_chain *m_instr_class_wait_stats;
-};
+  PFS_events_stages *m_stages_history;
 
-PFS_thread *sanitize_thread(PFS_thread *unsafe);
-const char *sanitize_file_name(const char *unsafe);
-
-PFS_single_stat_chain*
-find_per_thread_mutex_class_wait_stat(PFS_thread *thread,
-                                      PFS_mutex_class *klass);
-
-PFS_single_stat_chain*
-find_per_thread_rwlock_class_wait_stat(PFS_thread *thread,
-                                       PFS_rwlock_class *klass);
+  /** True if the circular buffer @c m_statements_history is full. */
+  bool m_statements_history_full;
+  /** Current index in the circular buffer @c m_statements_history. */
+  uint m_statements_history_index;
+  /**
+    Statements history circular buffer.
+    This member holds the data for the table
+    PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_HISTORY.
+  */
+  PFS_events_statements *m_statements_history;
+
+  /** User name. */
+  char m_username[USERNAME_LENGTH];
+  /** Length of @c m_username. */
+  uint m_username_length;
+  /** Host name. */
+  char m_hostname[HOSTNAME_LENGTH];
+  /** Length of @c m_hostname. */
+  uint m_hostname_length;
+  /** Database name. */
+  char m_dbname[NAME_LEN];
+  /** Length of @c m_dbname. */
+  uint m_dbname_length;
+  /** Current command. */
+  int m_command;
+  /** Start time. */
+  time_t m_start_time;
+  /** Processlist state. */
+  const char *m_processlist_state_ptr;
+  /** Length of @c m_processlist_state_ptr. */
+  uint m_processlist_state_length;
+  /** Processlist info. */
+  const char *m_processlist_info_ptr;
+  /** Length of @c m_processlist_info_length. */
+  uint m_processlist_info_length;
+
+  PFS_events_stages m_stage_current;
+
+  /** Size of @c m_events_statements_stack. */
+  uint m_events_statements_count;
+  PFS_events_statements *m_statement_stack;
+
+  PFS_host *m_host;
+  PFS_user *m_user;
+  PFS_account *m_account;
+};
 
-PFS_single_stat_chain*
-find_per_thread_cond_class_wait_stat(PFS_thread *thread,
-                                     PFS_cond_class *klass);
+extern PFS_single_stat *global_instr_class_waits_array;
+extern PFS_stage_stat *global_instr_class_stages_array;
+extern PFS_statement_stat *global_instr_class_statements_array;
 
-PFS_single_stat_chain*
-find_per_thread_file_class_wait_stat(PFS_thread *thread,
-                                     PFS_file_class *klass);
+PFS_mutex *sanitize_mutex(PFS_mutex *unsafe);
+PFS_rwlock *sanitize_rwlock(PFS_rwlock *unsafe);
+PFS_cond *sanitize_cond(PFS_cond *unsafe);
+PFS_thread *sanitize_thread(PFS_thread *unsafe);
+PFS_file *sanitize_file(PFS_file *unsafe);
+PFS_socket *sanitize_socket(PFS_socket *unsafe);
 
 int init_instruments(const PFS_global_param *param);
 void cleanup_instruments();
@@ -266,9 +538,13 @@ PFS_file* find_or_create_file(PFS_thread *thread, PFS_file_class *klass,
 
 void release_file(PFS_file *pfs);
 void destroy_file(PFS_thread *thread, PFS_file *pfs);
-PFS_table* create_table(PFS_table_share *share, const void *identity);
+PFS_table* create_table(PFS_table_share *share, PFS_thread *opening_thread,
+                        const void *identity);
 void destroy_table(PFS_table *pfs);
 
+PFS_socket* create_socket(PFS_socket_class *socket_class, const void *identity);
+void destroy_socket(PFS_socket *pfs);
+
 /* For iterators and show status. */
 
 extern ulong mutex_max;
@@ -285,9 +561,13 @@ extern long file_handle_max;
 extern ulong file_handle_lost;
 extern ulong table_max;
 extern ulong table_lost;
+extern ulong socket_max;
+extern ulong socket_lost;
 extern ulong events_waits_history_per_thread;
-extern ulong instr_class_per_thread;
+extern ulong events_stages_history_per_thread;
+extern ulong events_statements_history_per_thread;
 extern ulong locker_lost;
+extern ulong statement_lost;
 
 /* Exposing the data directly, for iterators. */
 
@@ -298,10 +578,51 @@ extern PFS_thread *thread_array;
 extern PFS_file *file_array;
 extern PFS_file **file_handle_array;
 extern PFS_table *table_array;
+extern PFS_socket *socket_array;
 
 void reset_events_waits_by_instance();
-void reset_per_thread_wait_stat();
 void reset_file_instance_io();
+void reset_socket_instance_io();
+
+void aggregate_all_event_names(PFS_single_stat *from_array,
+                               PFS_single_stat *to_array);
+void aggregate_all_event_names(PFS_single_stat *from_array,
+                               PFS_single_stat *to_array_1,
+                               PFS_single_stat *to_array_2);
+
+void aggregate_all_stages(PFS_stage_stat *from_array,
+                          PFS_stage_stat *to_array);
+void aggregate_all_stages(PFS_stage_stat *from_array,
+                          PFS_stage_stat *to_array_1,
+                          PFS_stage_stat *to_array_2);
+
+void aggregate_all_statements(PFS_statement_stat *from_array,
+                              PFS_statement_stat *to_array);
+void aggregate_all_statements(PFS_statement_stat *from_array,
+                              PFS_statement_stat *to_array_1,
+                              PFS_statement_stat *to_array_2);
+
+void aggregate_thread(PFS_thread *thread);
+void aggregate_thread_waits(PFS_thread *thread);
+void aggregate_thread_stages(PFS_thread *thread);
+void aggregate_thread_statements(PFS_thread *thread);
+void clear_thread_account(PFS_thread *thread);
+void set_thread_account(PFS_thread *thread);
+
+/** Update derived flags for all mutex instances. */
+void update_mutex_derived_flags();
+/** Update derived flags for all rwlock instances. */
+void update_rwlock_derived_flags();
+/** Update derived flags for all condition instances. */
+void update_cond_derived_flags();
+/** Update derived flags for all file handles. */
+void update_file_derived_flags();
+/** Update derived flags for all table handles. */
+void update_table_derived_flags();
+/** Update derived flags for all socket instances. */
+void update_socket_derived_flags();
+/** Update derived flags for all instruments. */
+void update_instruments_derived_flags();
 
 /** @} */
 #endif
diff --git a/storage/perfschema/pfs_instr_class.cc b/storage/perfschema/pfs_instr_class.cc
index 8bad6e99b3a..0a4b47404a4 100644
--- a/storage/perfschema/pfs_instr_class.cc
+++ b/storage/perfschema/pfs_instr_class.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -20,10 +20,14 @@
 
 #include "my_global.h"
 #include "my_sys.h"
+#include "structs.h"
+#include "table.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
 #include "pfs_global.h"
+#include "pfs_timer.h"
 #include "pfs_events_waits.h"
+#include "pfs_setup_object.h"
 #include "pfs_atomic.h"
 #include "mysql/psi/mysql_thread.h"
 #include "lf.h"
@@ -44,6 +48,21 @@
 my_bool pfs_enabled= TRUE;
 
 /**
+  PFS_INSTRUMENT option settings array and associated state variable to
+  serialize access during shutdown.
+ */
+DYNAMIC_ARRAY pfs_instr_config_array;
+int pfs_instr_config_state= PFS_INSTR_CONFIG_NOT_INITIALIZED;
+
+static void configure_instr_class(PFS_instr_class *entry);
+
+static void init_instr_class(PFS_instr_class *klass,
+                             const char *name,
+                             uint name_length,
+                             int flags,
+                             PFS_class_type class_type);
+
+/**
   Current number of elements in mutex_class_array.
   This global variable is written to during:
   - the performance schema initialization
@@ -76,14 +95,26 @@ ulong thread_class_lost= 0;
 ulong file_class_max= 0;
 /** Number of file class lost. @sa file_class_array */
 ulong file_class_lost= 0;
+/** Size of the stage class array. @sa stage_class_array */
+ulong stage_class_max= 0;
+/** Number of stage class lost. @sa stage_class_array */
+ulong stage_class_lost= 0;
+/** Size of the statement class array. @sa statement_class_array */
+ulong statement_class_max= 0;
+/** Number of statement class lost. @sa statement_class_array */
+ulong statement_class_lost= 0;
 /** Size of the table share array. @sa table_share_array */
 ulong table_share_max= 0;
 /** Number of table share lost. @sa table_share_array */
 ulong table_share_lost= 0;
+/** Size of the socket class array. @sa socket_class_array */
+ulong socket_class_max= 0;
+/** Number of socket class lost. @sa socket_class_array */
+ulong socket_class_lost= 0;
 
-static PFS_mutex_class *mutex_class_array= NULL;
-static PFS_rwlock_class *rwlock_class_array= NULL;
-static PFS_cond_class *cond_class_array= NULL;
+PFS_mutex_class *mutex_class_array= NULL;
+PFS_rwlock_class *rwlock_class_array= NULL;
+PFS_cond_class *cond_class_array= NULL;
 
 /**
   Current number or elements in thread_class_array.
@@ -104,29 +135,99 @@ static PFS_thread_class *thread_class_array= NULL;
 */
 PFS_table_share *table_share_array= NULL;
 
-PFS_instr_class global_table_class=
-{
-  "wait/table", /* name */
-  10, /* name length */
-  0, /* flags */
-  true, /* enabled */
-  true, /* timed */
-  { &flag_events_waits_current, NULL, 0, 0, 0, 0} /* wait stat chain */
+PFS_instr_class global_table_io_class;
+PFS_instr_class global_table_lock_class;
+PFS_instr_class global_idle_class;
+
+/** Class-timer map */
+enum_timer_name *class_timers[] =
+{&wait_timer,      /* PFS_CLASS_NONE */
+ &wait_timer,      /* PFS_CLASS_MUTEX */
+ &wait_timer,      /* PFS_CLASS_RWLOCK */
+ &wait_timer,      /* PFS_CLASS_COND */
+ &wait_timer,      /* PFS_CLASS_FILE */
+ &wait_timer,      /* PFS_CLASS_TABLE */
+ &stage_timer,     /* PFS_CLASS_STAGE */
+ &statement_timer, /* PFS_CLASS_STATEMENT */
+ &wait_timer,      /* PFS_CLASS_SOCKET */
+ &wait_timer,      /* PFS_CLASS_TABLE_IO */
+ &wait_timer,      /* PFS_CLASS_TABLE_LOCK */
+ &idle_timer       /* PFS_CLASS_IDLE */
 };
 
-/** Hash table for instrumented tables.  */
+/**
+  Hash index for instrumented table shares.
+  This index is searched by table fully qualified name (@c PFS_table_share_key),
+  and points to instrumented table shares (@c PFS_table_share).
+  @sa table_share_array
+  @sa PFS_table_share_key
+  @sa PFS_table_share
+  @sa table_share_hash_get_key
+  @sa get_table_share_hash_pins
+*/
 static LF_HASH table_share_hash;
 /** True if table_share_hash is initialized. */
 static bool table_share_hash_inited= false;
-C_MODE_START
-/** Get hash table key for instrumented tables. */
-static uchar *table_share_hash_get_key(const uchar *, size_t *, my_bool);
-C_MODE_END
 
 static volatile uint32 file_class_dirty_count= 0;
 static volatile uint32 file_class_allocated_count= 0;
 
-static PFS_file_class *file_class_array= NULL;
+PFS_file_class *file_class_array= NULL;
+
+static volatile uint32 stage_class_dirty_count= 0;
+static volatile uint32 stage_class_allocated_count= 0;
+
+static PFS_stage_class *stage_class_array= NULL;
+
+static volatile uint32 statement_class_dirty_count= 0;
+static volatile uint32 statement_class_allocated_count= 0;
+
+static PFS_statement_class *statement_class_array= NULL;
+
+static volatile uint32 socket_class_dirty_count= 0;
+static volatile uint32 socket_class_allocated_count= 0;
+
+static PFS_socket_class *socket_class_array= NULL;
+
+uint mutex_class_start= 0;
+uint rwlock_class_start= 0;
+uint cond_class_start= 0;
+uint file_class_start= 0;
+uint table_class_start= 0;
+uint wait_class_max= 0;
+uint socket_class_start= 0;
+
+void init_event_name_sizing(const PFS_global_param *param)
+{
+  mutex_class_start= 0;
+  rwlock_class_start= mutex_class_start + param->m_mutex_class_sizing;
+  cond_class_start= rwlock_class_start + param->m_rwlock_class_sizing;
+  file_class_start= cond_class_start + param->m_cond_class_sizing;
+  socket_class_start= file_class_start + param->m_file_class_sizing;
+  table_class_start= socket_class_start + param->m_socket_class_sizing;
+  wait_class_max= table_class_start + 3; /* global table io, lock, idle */
+}
+
+void register_global_classes()
+{
+  /* Table IO class */
+  init_instr_class(&global_table_io_class, "wait/io/table/sql/handler", 25,
+                   0, PFS_CLASS_TABLE_IO);
+  global_table_io_class.m_event_name_index= table_class_start;
+  configure_instr_class(&global_table_io_class);
+
+  /* Table lock class */
+  init_instr_class(&global_table_lock_class, "wait/lock/table/sql/handler", 27,
+                   0, PFS_CLASS_TABLE_LOCK);
+  global_table_lock_class.m_event_name_index= table_class_start + 1;
+  configure_instr_class(&global_table_lock_class);
+  
+  /* Idle class */
+  init_instr_class(&global_idle_class, "idle", 4,
+                   0, PFS_CLASS_IDLE);
+  global_idle_class.m_event_name_index= table_class_start + 2;
+  configure_instr_class(&global_idle_class);
+}
 
 /**
   Initialize the instrument synch class buffers.
@@ -258,6 +359,8 @@ void cleanup_table_share(void)
   table_share_max= 0;
 }
 
+C_MODE_START
+/** get_key function for @c table_share_hash. */
 static uchar *table_share_hash_get_key(const uchar *entry, size_t *length,
                                        my_bool)
 {
@@ -272,6 +375,7 @@ static uchar *table_share_hash_get_key(const uchar *entry, size_t *length,
   result= &share->m_key.m_hash_key[0];
   return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
 }
+C_MODE_END
 
 /** Initialize the table share hash table. */
 int init_table_share_hash(void)
@@ -296,6 +400,72 @@ void cleanup_table_share_hash(void)
 }
 
 /**
+  Get the hash pins for @sa table_share_hash.
+  @param thread The running thread.
+  @returns The LF_HASH pins for the thread.
+*/
+LF_PINS* get_table_share_hash_pins(PFS_thread *thread)
+{
+  if (unlikely(thread->m_table_share_hash_pins == NULL))
+  {
+    if (! table_share_hash_inited)
+      return NULL;
+    thread->m_table_share_hash_pins= lf_hash_get_pins(&table_share_hash);
+  }
+  return thread->m_table_share_hash_pins;
+}
+
+/**
+  Set a table share hash key.
+  @param [out] key The key to populate.
+  @param temporary True for TEMPORARY TABLE.
+  @param schema_name The table schema name.
+  @param schema_name_length The table schema name length.
+  @param table_name The table name.
+  @param table_name_length The table name length.
+*/
+static void set_table_share_key(PFS_table_share_key *key,
+                                bool temporary,
+                                const char *schema_name, uint schema_name_length,
+                                const char *table_name, uint table_name_length)
+{
+  DBUG_ASSERT(schema_name_length <= NAME_LEN);
+  DBUG_ASSERT(table_name_length <= NAME_LEN);
+  char *saved_schema_name;
+  char *saved_table_name;
+
+  char *ptr= &key->m_hash_key[0];
+  ptr[0]= (temporary ? OBJECT_TYPE_TEMPORARY_TABLE : OBJECT_TYPE_TABLE);
+  ptr++;
+  saved_schema_name= ptr;
+  memcpy(ptr, schema_name, schema_name_length);
+  ptr+= schema_name_length;
+  ptr[0]= 0;
+  ptr++;
+  saved_table_name= ptr;
+  memcpy(ptr, table_name, table_name_length);
+  ptr+= table_name_length;
+  ptr[0]= 0;
+  ptr++;
+  key->m_key_length= ptr - &key->m_hash_key[0];
+
+  if (lower_case_table_names)
+  {
+    my_casedn_str(files_charset_info, saved_schema_name);
+    my_casedn_str(files_charset_info, saved_table_name);
+  }
+}
+
+void PFS_table_share::refresh_setup_object_flags(PFS_thread *thread)
+{
+  lookup_setup_object(thread,
+                      OBJECT_TYPE_TABLE,
+                      m_schema_name, m_schema_name_length,
+                      m_table_name, m_table_name_length,
+                      &m_enabled, &m_timed);
+}
+
+/**
   Initialize the file class buffer.
   @param file_class_sizing            max number of file class
   @return 0 on success
@@ -329,10 +499,113 @@ void cleanup_file_class(void)
   file_class_max= 0;
 }
 
+/**
+  Initialize the stage class buffer.
+  @param stage_class_sizing            max number of stage class
+  @return 0 on success
+*/
+int init_stage_class(uint stage_class_sizing)
+{
+  int result= 0;
+  stage_class_dirty_count= stage_class_allocated_count= 0;
+  stage_class_max= stage_class_sizing;
+  stage_class_lost= 0;
+
+  if (stage_class_max > 0)
+  {
+    stage_class_array= PFS_MALLOC_ARRAY(stage_class_max, PFS_stage_class,
+                                        MYF(MY_ZEROFILL));
+    if (unlikely(stage_class_array == NULL))
+      return 1;
+  }
+  else
+    stage_class_array= NULL;
+
+  return result;
+}
+
+/** Cleanup the stage class buffers. */
+void cleanup_stage_class(void)
+{
+  pfs_free(stage_class_array);
+  stage_class_array= NULL;
+  stage_class_dirty_count= stage_class_allocated_count= 0;
+  stage_class_max= 0;
+}
+
+/**
+  Initialize the statement class buffer.
+  @param statement_class_sizing            max number of statement class
+  @return 0 on success
+*/
+int init_statement_class(uint statement_class_sizing)
+{
+  int result= 0;
+  statement_class_dirty_count= statement_class_allocated_count= 0;
+  statement_class_max= statement_class_sizing;
+  statement_class_lost= 0;
+
+  if (statement_class_max > 0)
+  {
+    statement_class_array= PFS_MALLOC_ARRAY(statement_class_max, PFS_statement_class,
+                                            MYF(MY_ZEROFILL));
+    if (unlikely(statement_class_array == NULL))
+      return 1;
+  }
+  else
+    statement_class_array= NULL;
+
+  return result;
+}
+
+/** Cleanup the statement class buffers. */
+void cleanup_statement_class(void)
+{
+  pfs_free(statement_class_array);
+  statement_class_array= NULL;
+  statement_class_dirty_count= statement_class_allocated_count= 0;
+  statement_class_max= 0;
+}
+
+/**
+  Initialize the socket class buffer.
+  @param socket_class_sizing            max number of socket class
+  @return 0 on success
+*/
+int init_socket_class(uint socket_class_sizing)
+{
+  int result= 0;
+  socket_class_dirty_count= socket_class_allocated_count= 0;
+  socket_class_max= socket_class_sizing;
+  socket_class_lost= 0;
+
+  if (socket_class_max > 0)
+  {
+    socket_class_array= PFS_MALLOC_ARRAY(socket_class_max, PFS_socket_class,
+                                         MYF(MY_ZEROFILL));
+    if (unlikely(socket_class_array == NULL))
+      return 1;
+  }
+  else
+    socket_class_array= NULL;
+
+  return result;
+}
+
+/** Cleanup the socket class buffers. */
+void cleanup_socket_class(void)
+{
+  pfs_free(socket_class_array);
+  socket_class_array= NULL;
+  socket_class_dirty_count= socket_class_allocated_count= 0;
+  socket_class_max= 0;
+}
+
 static void init_instr_class(PFS_instr_class *klass,
                              const char *name,
                              uint name_length,
-                             int flags)
+                             int flags,
+                             PFS_class_type class_type)
 {
   DBUG_ASSERT(name_length <= PFS_MAX_INFO_NAME_LENGTH);
   memset(klass, 0, sizeof(PFS_instr_class));
@@ -341,6 +614,43 @@ static void init_instr_class(PFS_instr_class *klass,
   klass->m_flags= flags;
   klass->m_enabled= true;
   klass->m_timed= true;
+  klass->m_type= class_type;
+  klass->m_timer= class_timers[class_type];
+}
+
+/**
+  Set user-defined configuration values for an instrument.
+*/
+static void configure_instr_class(PFS_instr_class *entry)
+{
+  uint match_length= 0; /* length of matching pattern */
+
+  for (uint i= 0; i < pfs_instr_config_array.elements; i++)
+  {
+    PFS_instr_config* e;
+    get_dynamic(&pfs_instr_config_array, (uchar*)&e, i);
+
+    /**
+      Compare class name to all configuration entries. In case of multiple
+      matches, the longer specification wins. For example, the pattern
+      'ABC/DEF/GHI=ON' has precedence over 'ABC/DEF/%=OFF' regardless of
+      position within the configuration file or command line.
+
+      Consecutive wildcards affect the count.
+    */
+    if (!my_wildcmp(&my_charset_latin1,
+                    entry->m_name, entry->m_name+entry->m_name_length,
+                    e->m_name, e->m_name+e->m_name_length,
+                    '\\', '?','%'))
+    {
+        if (e->m_name_length >= match_length)
+        {
+           entry->m_enabled= e->m_enabled;
+           entry->m_timed= e->m_timed;
+           match_length= MY_MAX(e->m_name_length, match_length);
+        }
+    }
+  }
 }
 
 #define REGISTER_CLASS_BODY_PART(INDEX, ARRAY, MAX, NAME, NAME_LENGTH) \
@@ -404,16 +714,16 @@ PFS_sync_key register_mutex_class(const char *name, uint name_length,
         in INSTALL PLUGIN.
     */
     entry= &mutex_class_array[index];
-    init_instr_class(entry, name, name_length, flags);
-    entry->m_wait_stat.m_control_flag=
-      &flag_events_waits_summary_by_event_name;
-    entry->m_wait_stat.m_parent= NULL;
-    reset_single_stat_link(&entry->m_wait_stat);
-    entry->m_lock_stat.m_control_flag=
-      &flag_events_locks_summary_by_event_name;
-    entry->m_lock_stat.m_parent= NULL;
-    reset_single_stat_link(&entry->m_lock_stat);
-    entry->m_index= index;
+    init_instr_class(entry, name, name_length, flags, PFS_CLASS_MUTEX);
+    entry->m_lock_stat.reset();
+    entry->m_event_name_index= mutex_class_start + index;
+    entry->m_singleton= NULL;
+    entry->m_enabled= false; /* disabled by default */
+    entry->m_timed= false;
+
+    /* Set user-defined configuration options for this instrument */
+    configure_instr_class(entry);
+
     /*
       Now that this entry is populated, advertise it
 
@@ -470,20 +780,15 @@ PFS_sync_key register_rwlock_class(const char *name, uint name_length,
   if (index < rwlock_class_max)
   {
     entry= &rwlock_class_array[index];
-    init_instr_class(entry, name, name_length, flags);
-    entry->m_wait_stat.m_control_flag=
-      &flag_events_waits_summary_by_event_name;
-    entry->m_wait_stat.m_parent= NULL;
-    reset_single_stat_link(&entry->m_wait_stat);
-    entry->m_read_lock_stat.m_control_flag=
-      &flag_events_locks_summary_by_event_name;
-    entry->m_read_lock_stat.m_parent= NULL;
-    reset_single_stat_link(&entry->m_read_lock_stat);
-    entry->m_write_lock_stat.m_control_flag=
-      &flag_events_locks_summary_by_event_name;
-    entry->m_write_lock_stat.m_parent= NULL;
-    reset_single_stat_link(&entry->m_write_lock_stat);
-    entry->m_index= index;
+    init_instr_class(entry, name, name_length, flags, PFS_CLASS_RWLOCK);
+    entry->m_read_lock_stat.reset();
+    entry->m_write_lock_stat.reset();
+    entry->m_event_name_index= rwlock_class_start + index;
+    entry->m_singleton= NULL;
+    entry->m_enabled= false; /* disabled by default */
+    entry->m_timed= false;
+    /* Set user-defined configuration options for this instrument */
+    configure_instr_class(entry);
     PFS_atomic::add_u32(&rwlock_class_allocated_count, 1);
     return (index + 1);
   }
@@ -514,12 +819,13 @@ PFS_sync_key register_cond_class(const char *name, uint name_length,
   if (index < cond_class_max)
   {
     entry= &cond_class_array[index];
-    init_instr_class(entry, name, name_length, flags);
-    entry->m_wait_stat.m_control_flag=
-      &flag_events_waits_summary_by_event_name;
-    entry->m_wait_stat.m_parent= NULL;
-    reset_single_stat_link(&entry->m_wait_stat);
-    entry->m_index= index;
+    init_instr_class(entry, name, name_length, flags, PFS_CLASS_COND);
+    entry->m_event_name_index= cond_class_start + index;
+    entry->m_singleton= NULL;
+    entry->m_enabled= false; /* disabled by default */
+    entry->m_timed= false;
+    /* Set user-defined configuration options for this instrument */
+    configure_instr_class(entry);
     PFS_atomic::add_u32(&cond_class_allocated_count, 1);
     return (index + 1);
   }
@@ -655,12 +961,13 @@ PFS_file_key register_file_class(const char *name, uint name_length,
   if (index < file_class_max)
   {
     entry= &file_class_array[index];
-    init_instr_class(entry, name, name_length, flags);
-    entry->m_wait_stat.m_control_flag=
-      &flag_events_waits_summary_by_event_name;
-    entry->m_wait_stat.m_parent= NULL;
-    reset_single_stat_link(&entry->m_wait_stat);
-    entry->m_index= index;
+    init_instr_class(entry, name, name_length, flags, PFS_CLASS_FILE);
+    entry->m_event_name_index= file_class_start + index;
+    entry->m_singleton= NULL;
+    entry->m_enabled= true; /* enabled by default */
+    entry->m_timed= true;
+    /* Set user-defined configuration options for this instrument */
+    configure_instr_class(entry);
     PFS_atomic::add_u32(&file_class_allocated_count, 1);
     return (index + 1);
   }
@@ -670,6 +977,80 @@ PFS_file_key register_file_class(const char *name, uint name_length,
 }
 
 /**
+  Register a stage instrumentation metadata.
+  @param name                         the instrumented name
+  @param name_length                  length in bytes of name
+  @param flags                        the instrumentation flags
+  @return a stage instrumentation key
+*/
+PFS_stage_key register_stage_class(const char *name, uint name_length,
+                                   int flags)
+{
+  /* See comments in register_mutex_class */
+  uint32 index;
+  PFS_stage_class *entry;
+
+  REGISTER_CLASS_BODY_PART(index, stage_class_array, stage_class_max,
+                           name, name_length)
+
+  index= PFS_atomic::add_u32(&stage_class_dirty_count, 1);
+
+  if (index < stage_class_max)
+  {
+    entry= &stage_class_array[index];
+    init_instr_class(entry, name, name_length, flags, PFS_CLASS_STAGE);
+    entry->m_event_name_index= index;
+    entry->m_enabled= false; /* disabled by default */
+    entry->m_timed= false;
+    /* Set user-defined configuration options for this instrument */
+    configure_instr_class(entry);
+    PFS_atomic::add_u32(&stage_class_allocated_count, 1);
+
+    return (index + 1);
+  }
+
+  stage_class_lost++;
+  return 0;
+}
+
+/**
+  Register a statement instrumentation metadata.
+  @param name                         the instrumented name
+  @param name_length                  length in bytes of name
+  @param flags                        the instrumentation flags
+  @return a statement instrumentation key
+*/
+PFS_statement_key register_statement_class(const char *name, uint name_length,
+                                           int flags)
+{
+  /* See comments in register_mutex_class */
+  uint32 index;
+  PFS_statement_class *entry;
+
+  REGISTER_CLASS_BODY_PART(index, statement_class_array, statement_class_max,
+                           name, name_length)
+
+  index= PFS_atomic::add_u32(&statement_class_dirty_count, 1);
+
+  if (index < statement_class_max)
+  {
+    entry= &statement_class_array[index];
+    init_instr_class(entry, name, name_length, flags, PFS_CLASS_STATEMENT);
+    entry->m_event_name_index= index;
+    entry->m_enabled= true; /* enabled by default */
+    entry->m_timed= true;
+    /* Set user-defined configuration options for this instrument */
+    configure_instr_class(entry);
+    PFS_atomic::add_u32(&statement_class_allocated_count, 1);
+
+    return (index + 1);
+  }
+
+  statement_class_lost++;
+  return 0;
+}
+
+/**
   Find a file instrumentation class by key.
   @param key                          the instrument key
   @return the instrument class, or NULL
@@ -685,125 +1066,283 @@ PFS_file_class *sanitize_file_class(PFS_file_class *unsafe)
 }
 
 /**
-  Find or create a table instance by name.
+  Find a stage instrumentation class by key.
+  @param key                          the instrument key
+  @return the instrument class, or NULL
+*/
+PFS_stage_class *find_stage_class(PFS_stage_key key)
+{
+  FIND_CLASS_BODY(key, stage_class_allocated_count, stage_class_array);
+}
+
+PFS_stage_class *sanitize_stage_class(PFS_stage_class *unsafe)
+{
+  SANITIZE_ARRAY_BODY(PFS_stage_class, stage_class_array, stage_class_max, unsafe);
+}
+
+/**
+  Find a statement instrumentation class by key.
+  @param key                          the instrument key
+  @return the instrument class, or NULL
+*/
+PFS_statement_class *find_statement_class(PFS_stage_key key)
+{
+  FIND_CLASS_BODY(key, statement_class_allocated_count, statement_class_array);
+}
+
+PFS_statement_class *sanitize_statement_class(PFS_statement_class *unsafe)
+{
+  SANITIZE_ARRAY_BODY(PFS_statement_class, statement_class_array, statement_class_max, unsafe);
+}
+
+/**
+  Register a socket instrumentation metadata.
+  @param name                         the instrumented name
+  @param name_length                  length in bytes of name
+  @param flags                        the instrumentation flags
+  @return a socket instrumentation key
+*/
+PFS_socket_key register_socket_class(const char *name, uint name_length,
+                                     int flags)
+{
+  /* See comments in register_mutex_class */
+  uint32 index;
+  PFS_socket_class *entry;
+
+  REGISTER_CLASS_BODY_PART(index, socket_class_array, socket_class_max,
+                           name, name_length)
+
+  index= PFS_atomic::add_u32(&socket_class_dirty_count, 1);
+
+  if (index < socket_class_max)
+  {
+    entry= &socket_class_array[index];
+    init_instr_class(entry, name, name_length, flags, PFS_CLASS_SOCKET);
+    entry->m_event_name_index= socket_class_start + index;
+    entry->m_singleton= NULL;
+    entry->m_enabled= false; /* disabled by default */
+    entry->m_timed= false;
+    /* Set user-defined configuration options for this instrument */
+    configure_instr_class(entry);
+    PFS_atomic::add_u32(&socket_class_allocated_count, 1);
+    return (index + 1);
+  }
+
+  socket_class_lost++;
+  return 0;
+}
+
+/**
+  Find a socket instrumentation class by key.
+  @param key                          the instrument key
+  @return the instrument class, or NULL
+*/
+PFS_socket_class *find_socket_class(PFS_socket_key key)
+{
+  FIND_CLASS_BODY(key, socket_class_allocated_count, socket_class_array);
+}
+
+PFS_socket_class *sanitize_socket_class(PFS_socket_class *unsafe)
+{
+  SANITIZE_ARRAY_BODY(PFS_socket_class, socket_class_array, socket_class_max, unsafe);
+}
+
+PFS_instr_class *find_table_class(uint index)
+{
+  if (index == 1)
+    return & global_table_io_class;
+  if (index == 2)
+    return & global_table_lock_class;
+  return NULL;
+}
+
+PFS_instr_class *sanitize_table_class(PFS_instr_class *unsafe)
+{
+  if (likely((& global_table_io_class == unsafe) ||
+             (& global_table_lock_class == unsafe)))
+    return unsafe;
+  return NULL;
+}
+
+PFS_instr_class *find_idle_class(uint index)
+{
+  if (index == 1)
+    return & global_idle_class;
+  return NULL;
+}
+
+PFS_instr_class *sanitize_idle_class(PFS_instr_class *unsafe)
+{
+  if (likely(& global_idle_class == unsafe))
+    return unsafe;
+  return NULL;
+}
+
+static void set_keys(PFS_table_share *pfs, const TABLE_SHARE *share)
+{
+  int len;
+  KEY *key_info= share->key_info;
+  PFS_table_key *pfs_key= pfs->m_keys;
+  PFS_table_key *pfs_key_last= pfs->m_keys + share->keys;
+  pfs->m_key_count= share->keys;
+
+  for ( ; pfs_key < pfs_key_last; pfs_key++, key_info++)
+  {
+    len= strlen(key_info->name);
+    memcpy(pfs_key->m_name, key_info->name, len);
+    pfs_key->m_name_length= len;
+  }
+
+  pfs_key_last= pfs->m_keys + MAX_KEY;
+  for ( ; pfs_key < pfs_key_last; pfs_key++)
+    pfs_key->m_name_length= 0;
+}
+
+static int compare_keys(PFS_table_share *pfs, const TABLE_SHARE *share)
+{
+  uint len;
+  KEY *key_info= share->key_info;
+  PFS_table_key *pfs_key= pfs->m_keys;
+  PFS_table_key *pfs_key_last= pfs->m_keys + share->keys;
+
+  if (pfs->m_key_count != share->keys)
+    return 1;
+
+  for ( ; pfs_key < pfs_key_last; pfs_key++, key_info++)
+  {
+    len= strlen(key_info->name);
+    if (len != pfs_key->m_name_length)
+      return 1;
+
+    if (memcmp(pfs_key->m_name, key_info->name, len) != 0)
+      return 1;
+  }
+
+  return 0;
+}
+
+/**
+  Find or create a table share instrumentation.
   @param thread                       the executing instrumented thread
-  @param schema_name                  the table schema name
-  @param schema_name_length           the table schema name length
-  @param table_name                   the table name
-  @param table_name_length            the table name length
-  @return a table instance, or NULL
+  @param temporary                    true for TEMPORARY TABLE
+  @param share                        table share
+  @return a table share, or NULL
 */
 PFS_table_share* find_or_create_table_share(PFS_thread *thread,
-                                            const char *schema_name,
-                                            uint schema_name_length,
-                                            const char *table_name,
-                                            uint table_name_length)
+                                            bool temporary,
+                                            const TABLE_SHARE *share)
 {
   /* See comments in register_mutex_class */
-  int pass;
   PFS_table_share_key key;
 
-  if (! table_share_hash_inited)
+  LF_PINS *pins= get_table_share_hash_pins(thread);
+  if (unlikely(pins == NULL))
   {
-    /* Table instrumentation can be turned off. */
     table_share_lost++;
     return NULL;
   }
 
-  if (unlikely(thread->m_table_share_hash_pins == NULL))
-  {
-    thread->m_table_share_hash_pins= lf_hash_get_pins(&table_share_hash);
-    if (unlikely(thread->m_table_share_hash_pins == NULL))
-    {
-      table_share_lost++;
-      return NULL;
-    }
-  }
+  const char *schema_name= share->db.str;
+  uint schema_name_length= share->db.length;
+  const char *table_name= share->table_name.str;
+  uint table_name_length= share->table_name.length;
 
-  DBUG_ASSERT(schema_name_length <= NAME_LEN);
-  DBUG_ASSERT(table_name_length <= NAME_LEN);
-
-  char *ptr= &key.m_hash_key[0];
-  memcpy(ptr, schema_name, schema_name_length);
-  ptr+= schema_name_length;
-  ptr[0]= 0; ptr++;
-  memcpy(ptr, table_name, table_name_length);
-  ptr+= table_name_length;
-  ptr[0]= 0; ptr++;
-  key.m_key_length= ptr - &key.m_hash_key[0];
+  set_table_share_key(&key, temporary,
+                      schema_name, schema_name_length,
+                      table_name, table_name_length);
 
   PFS_table_share **entry;
   uint retry_count= 0;
   const uint retry_max= 3;
+  bool enabled= true;
+  bool timed= true;
+  static uint table_share_monotonic_index= 0;
+  uint index;
+  uint attempts= 0;
+  PFS_table_share *pfs;
+
 search:
   entry= reinterpret_cast<PFS_table_share**>
-    (lf_hash_search(&table_share_hash, thread->m_table_share_hash_pins,
-                    &key.m_hash_key[0], key.m_key_length));
+    (lf_hash_search(&table_share_hash, pins,
+                    key.m_hash_key, key.m_key_length));
   if (entry && (entry != MY_ERRPTR))
   {
-    PFS_table_share *pfs;
     pfs= *entry;
-    lf_hash_search_unpin(thread->m_table_share_hash_pins);
+    pfs->inc_refcount() ;
+    if (compare_keys(pfs, share) != 0)
+    {
+      set_keys(pfs, share);
+      /* FIXME: aggregate to table_share sink ? */
+      pfs->m_table_stat.fast_reset();
+    }
+    lf_hash_search_unpin(pins);
     return pfs;
   }
 
-  /* table_name is not constant, just using it for noise on create */
-  uint i= randomized_index(table_name, table_share_max);
+  lf_hash_search_unpin(pins);
 
-  /*
-    Pass 1: [random, table_share_max - 1]
-    Pass 2: [0, table_share_max - 1]
-  */
-  for (pass= 1; pass <= 2; i=0, pass++)
+  if (retry_count == 0)
+  {
+    lookup_setup_object(thread,
+                        OBJECT_TYPE_TABLE,
+                        schema_name, schema_name_length,
+                        table_name, table_name_length,
+                        &enabled, &timed);
+    /*
+      Even when enabled is false, a record is added in the dictionary:
+      - It makes enabling a table already in the table cache possible,
+      - It improves performances for the next time a TABLE_SHARE is reloaded
+        in the table cache.
+    */
+  }
+
+  while (++attempts <= table_share_max)
   {
-    PFS_table_share *pfs= table_share_array + i;
-    PFS_table_share *pfs_last= table_share_array + table_share_max;
-    for ( ; pfs < pfs_last; pfs++)
+    /* See create_mutex() */
+    PFS_atomic::add_u32(& table_share_monotonic_index, 1);
+    index= table_share_monotonic_index % table_share_max;
+    pfs= table_share_array + index;
+
+    if (pfs->m_lock.is_free())
     {
-      if (pfs->m_lock.is_free())
+      if (pfs->m_lock.free_to_dirty())
       {
-        if (pfs->m_lock.free_to_dirty())
+        pfs->m_key= key;
+        pfs->m_schema_name= &pfs->m_key.m_hash_key[1];
+        pfs->m_schema_name_length= schema_name_length;
+        pfs->m_table_name= &pfs->m_key.m_hash_key[schema_name_length + 2];
+        pfs->m_table_name_length= table_name_length;
+        pfs->m_enabled= enabled;
+        pfs->m_timed= timed;
+        pfs->init_refcount();
+        pfs->m_table_stat.fast_reset();
+        set_keys(pfs, share);
+
+        int res;
+        res= lf_hash_insert(&table_share_hash, pins, &pfs);
+        if (likely(res == 0))
         {
-          pfs->m_key= key;
-          pfs->m_schema_name= &pfs->m_key.m_hash_key[0];
-          pfs->m_schema_name_length= schema_name_length;
-          pfs->m_table_name= &pfs->m_key.m_hash_key[schema_name_length + 1];
-          pfs->m_table_name_length= table_name_length;
-          pfs->m_wait_stat.m_control_flag=
-            &flag_events_waits_summary_by_instance;
-          pfs->m_wait_stat.m_parent= NULL;
-          reset_single_stat_link(&pfs->m_wait_stat);
-          pfs->m_enabled= true;
-          pfs->m_timed= true;
-          pfs->m_aggregated= false;
-
-          int res;
-          res= lf_hash_insert(&table_share_hash,
-                              thread->m_table_share_hash_pins, &pfs);
-          if (likely(res == 0))
-          {
-            pfs->m_lock.dirty_to_allocated();
-            return pfs;
-          }
+          pfs->m_lock.dirty_to_allocated();
+          return pfs;
+        }
 
-          pfs->m_lock.dirty_to_free();
+        pfs->m_lock.dirty_to_free();
 
-          if (res > 0)
+        if (res > 0)
+        {
+          /* Duplicate insert by another thread */
+          if (++retry_count > retry_max)
           {
-            /* Duplicate insert by another thread */
-            if (++retry_count > retry_max)
-            {
-              /* Avoid infinite loops */
-              table_share_lost++;
-              return NULL;
-            }
-            goto search;
+            /* Avoid infinite loops */
+            table_share_lost++;
+            return NULL;
           }
-
-          /* OOM in lf_hash_insert */
-          table_share_lost++;
-          return NULL;
+          goto search;
         }
+
+        /* OOM in lf_hash_insert */
+        table_share_lost++;
+        return NULL;
       }
     }
   }
@@ -812,112 +1351,103 @@ search:
   return NULL;
 }
 
-PFS_table_share *sanitize_table_share(PFS_table_share *unsafe)
-{
-  SANITIZE_ARRAY_BODY(PFS_table_share, table_share_array, table_share_max, unsafe);
-}
-
-const char *sanitize_table_schema_name(const char *unsafe)
+void PFS_table_share::aggregate_io(void)
 {
-  intptr ptr= (intptr) unsafe;
-  intptr first= (intptr) &table_share_array[0];
-  intptr last= (intptr) &table_share_array[table_share_max];
-
-
-  /* Check if unsafe points inside table_share_array[] */
-  if (likely((first <= ptr) && (ptr < last)))
-  {
-    intptr offset= (ptr - first) % sizeof(PFS_table_share);
-    intptr from= my_offsetof(PFS_table_share, m_key.m_hash_key);
-    /* Check if unsafe points inside PFS_table_share::m_key::m_hash_key */
-    if (likely((from <= offset) && (offset < from + PFS_TABLESHARE_HASHKEY_SIZE)))
-    {
-      PFS_table_share *base= (PFS_table_share*) (ptr - offset);
-      /* Check if unsafe really is the schema name */
-      if (likely(base->m_schema_name == unsafe))
-        return unsafe;
-    }
-  }
-  return NULL;
+  uint index= global_table_io_class.m_event_name_index;
+  PFS_single_stat *table_io_total= & global_instr_class_waits_array[index];
+  m_table_stat.sum_io(table_io_total);
+  m_table_stat.fast_reset_io();
 }
 
-const char *sanitize_table_object_name(const char *unsafe)
+void PFS_table_share::aggregate_lock(void)
 {
-  intptr ptr= (intptr) unsafe;
-  intptr first= (intptr) &table_share_array[0];
-  intptr last= (intptr) &table_share_array[table_share_max];
-
-
-  /* Check if unsafe points inside table_share_array[] */
-  if (likely((first <= ptr) && (ptr < last)))
-  {
-    intptr offset= (ptr - first) % sizeof(PFS_table_share);
-    intptr from= my_offsetof(PFS_table_share, m_key.m_hash_key);
-    /* Check if unsafe points inside PFS_table_share::m_key::m_hash_key */
-    if (likely((from <= offset) && (offset < from + PFS_TABLESHARE_HASHKEY_SIZE)))
-    {
-      PFS_table_share *base= (PFS_table_share*) (ptr - offset);
-      /* Check if unsafe really is the table name */
-      if (likely(base->m_table_name == unsafe))
-        return unsafe;
-    }
-  }
-  return NULL;
+  uint index= global_table_lock_class.m_event_name_index;
+  PFS_single_stat *table_lock_total= & global_instr_class_waits_array[index];
+  m_table_stat.sum_lock(table_lock_total);
+  m_table_stat.fast_reset_lock();
 }
 
-static void reset_mutex_class_waits(void)
+void release_table_share(PFS_table_share *pfs)
 {
-  PFS_mutex_class *pfs= mutex_class_array;
-  PFS_mutex_class *pfs_last= mutex_class_array + mutex_class_max;
-
-  for ( ; pfs < pfs_last; pfs++)
-    reset_single_stat_link(&pfs->m_wait_stat);
+  DBUG_ASSERT(pfs->get_refcount() > 0);
+  pfs->dec_refcount();
 }
 
-static void reset_rwlock_class_waits(void)
+/**
+  Drop the instrumented table share associated with a table.
+  @param thread The running thread
+  @param temporary True for TEMPORARY TABLE
+  @param schema_name The table schema name
+  @param schema_name_length The table schema name length
+  @param table_name The table name
+  @param table_name_length The table name length
+*/
+void drop_table_share(PFS_thread *thread,
+                      bool temporary,
+                      const char *schema_name, uint schema_name_length,
+                      const char *table_name, uint table_name_length)
 {
-  PFS_rwlock_class *pfs= rwlock_class_array;
-  PFS_rwlock_class *pfs_last= rwlock_class_array + rwlock_class_max;
+  PFS_table_share_key key;
+  LF_PINS* pins= get_table_share_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return;
+  set_table_share_key(&key, temporary, schema_name, schema_name_length,
+                      table_name, table_name_length);
+  PFS_table_share **entry;
+  entry= reinterpret_cast<PFS_table_share**>
+    (lf_hash_search(&table_share_hash, pins,
+                    key.m_hash_key, key.m_key_length));
+  if (entry && (entry != MY_ERRPTR))
+  {
+    PFS_table_share *pfs= *entry;
+    lf_hash_delete(&table_share_hash, pins,
+                   pfs->m_key.m_hash_key, pfs->m_key.m_key_length);
+    pfs->m_lock.allocated_to_free();
+  }
 
-  for ( ; pfs < pfs_last; pfs++)
-    reset_single_stat_link(&pfs->m_wait_stat);
+  lf_hash_search_unpin(pins);
 }
 
-static void reset_cond_class_waits(void)
+/**
+  Sanitize an unsafe table_share pointer.
+  @param unsafe The possibly corrupt pointer.
+  @return A valid table_safe_pointer, or NULL.
+*/
+PFS_table_share *sanitize_table_share(PFS_table_share *unsafe)
 {
-  PFS_cond_class *pfs= cond_class_array;
-  PFS_cond_class *pfs_last= cond_class_array + cond_class_max;
-
-  for ( ; pfs < pfs_last; pfs++)
-    reset_single_stat_link(&pfs->m_wait_stat);
+  SANITIZE_ARRAY_BODY(PFS_table_share, table_share_array, table_share_max, unsafe);
 }
 
-static void reset_file_class_waits(void)
+/** Reset the io statistics per file class. */
+void reset_file_class_io(void)
 {
   PFS_file_class *pfs= file_class_array;
   PFS_file_class *pfs_last= file_class_array + file_class_max;
 
   for ( ; pfs < pfs_last; pfs++)
-    reset_single_stat_link(&pfs->m_wait_stat);
+    pfs->m_file_stat.m_io_stat.reset();
 }
 
-/** Reset the wait statistics for every instrument class. */
-void reset_instrument_class_waits(void)
+/** Reset the io statistics per socket class. */
+void reset_socket_class_io(void)
 {
-  reset_mutex_class_waits();
-  reset_rwlock_class_waits();
-  reset_cond_class_waits();
-  reset_file_class_waits();
+  PFS_socket_class *pfs= socket_class_array;
+  PFS_socket_class *pfs_last= socket_class_array + socket_class_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+    pfs->m_socket_stat.m_io_stat.reset();
 }
 
-/** Reset the io statistics per file class. */
-void reset_file_class_io(void)
+void update_table_share_derived_flags(PFS_thread *thread)
 {
-  PFS_file_class *pfs= file_class_array;
-  PFS_file_class *pfs_last= file_class_array + file_class_max;
+  PFS_table_share *pfs= table_share_array;
+  PFS_table_share *pfs_last= table_share_array + table_share_max;
 
   for ( ; pfs < pfs_last; pfs++)
-    reset_file_stat(&pfs->m_file_stat);
+  {
+    if (pfs->m_lock.is_populated())
+      pfs->refresh_setup_object_flags(thread);
+  }
 }
 
 /** @} */
diff --git a/storage/perfschema/pfs_instr_class.h b/storage/perfschema/pfs_instr_class.h
index b84691ccde5..bef25e76467 100644
--- a/storage/perfschema/pfs_instr_class.h
+++ b/storage/perfschema/pfs_instr_class.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -39,9 +39,13 @@
 #define PFS_MAX_FULL_PREFIX_NAME_LENGTH 32
 
 #include <my_global.h>
+#include <my_sys.h>
 #include <mysql/psi/psi.h>
 #include "pfs_lock.h"
 #include "pfs_stat.h"
+#include "pfs_column_types.h"
+
+struct PFS_global_param;
 
 /**
   @addtogroup Performance_schema_buffers
@@ -49,6 +53,7 @@
 */
 
 extern my_bool pfs_enabled;
+extern enum_timer_name *class_timers[];
 
 /** Key, naming a synch instrument (mutex, rwlock, cond). */
 typedef unsigned int PFS_sync_key;
@@ -56,55 +61,143 @@ typedef unsigned int PFS_sync_key;
 typedef unsigned int PFS_thread_key;
 /** Key, naming a file instrument. */
 typedef unsigned int PFS_file_key;
+/** Key, naming a stage instrument. */
+typedef unsigned int PFS_stage_key;
+/** Key, naming a statement instrument. */
+typedef unsigned int PFS_statement_key;
+/** Key, naming a socket instrument. */
+typedef unsigned int PFS_socket_key;
+
+enum PFS_class_type
+{
+  PFS_CLASS_NONE=        0,
+  PFS_CLASS_MUTEX=       1,
+  PFS_CLASS_RWLOCK=      2,
+  PFS_CLASS_COND=        3,
+  PFS_CLASS_FILE=        4,
+  PFS_CLASS_TABLE=       5,
+  PFS_CLASS_STAGE=       6,
+  PFS_CLASS_STATEMENT=   7,
+  PFS_CLASS_SOCKET=      8,
+  PFS_CLASS_TABLE_IO=    9,
+  PFS_CLASS_TABLE_LOCK= 10,
+  PFS_CLASS_IDLE=       11,
+  PFS_CLASS_LAST=       PFS_CLASS_IDLE,
+  PFS_CLASS_MAX=        PFS_CLASS_LAST + 1
+};
+
+/** User-defined instrument configuration. */
+struct PFS_instr_config
+{
+  /* Instrument name. */
+  char *m_name;
+  /* Name length. */
+  uint m_name_length;
+  /** Enabled flag. */
+  bool m_enabled;
+  /** Timed flag. */
+  bool m_timed;
+};
+
+extern DYNAMIC_ARRAY pfs_instr_config_array;
+extern int pfs_instr_config_state;
+
+static const int PFS_INSTR_CONFIG_NOT_INITIALIZED= 0;
+static const int PFS_INSTR_CONFIG_ALLOCATED= 1;
+static const int PFS_INSTR_CONFIG_DEALLOCATED= 2;
 
 struct PFS_thread;
 
+extern uint mutex_class_start;
+extern uint rwlock_class_start;
+extern uint cond_class_start;
+extern uint file_class_start;
+extern uint table_class_start;
+extern uint socket_class_start;
+extern uint wait_class_max;
+
 /** Information for all instrumentation. */
 struct PFS_instr_class
 {
-  /** Instrument name. */
-  char m_name[PFS_MAX_INFO_NAME_LENGTH];
-  /** Length in bytes of @c m_name. */
-  uint m_name_length;
-  /** Instrument flags. */
-  int m_flags;
+  /** Class type */
+  PFS_class_type m_type;
   /** True if this instrument is enabled. */
   bool m_enabled;
   /** True if this instrument is timed. */
   bool m_timed;
-  /** Wait statistics chain. */
-  PFS_single_stat_chain m_wait_stat;
+  /** Instrument flags. */
+  int m_flags;
+  /**
+    Instrument name index.
+    Self index in:
+    - EVENTS_WAITS_SUMMARY_*_BY_EVENT_NAME for waits
+    - EVENTS_STAGES_SUMMARY_*_BY_EVENT_NAME for stages
+    - EVENTS_STATEMENTS_SUMMARY_*_BY_EVENT_NAME for statements
+  */
+  uint m_event_name_index;
+  /** Instrument name. */
+  char m_name[PFS_MAX_INFO_NAME_LENGTH];
+  /** Length in bytes of @c m_name. */
+  uint m_name_length;
+  /** Timer associated with this class. */
+  enum_timer_name *m_timer;
+
+  bool is_singleton() const
+  {
+    return m_flags & PSI_FLAG_GLOBAL;
+  }
+  static void set_enabled(PFS_instr_class *pfs, bool enabled);
+  static void set_timed(PFS_instr_class *pfs, bool timed);
+
+  bool is_deferred() const
+  {
+    switch(m_type)
+    {
+      case PFS_CLASS_SOCKET:
+        return true;
+        break;
+      default:
+        return false;
+        break;
+    };
+  }
 };
 
+struct PFS_mutex;
+
 /** Instrumentation metadata for a MUTEX. */
 struct PFS_mutex_class : public PFS_instr_class
 {
   /**
-    Lock statistics chain.
+    Lock statistics.
     This statistic is not exposed in user visible tables yet.
   */
-  PFS_single_stat_chain m_lock_stat;
-  /** Self index in @c mutex_class_array. */
-  uint m_index;
+  PFS_single_stat m_lock_stat;
+  /** Singleton instance. */
+  PFS_mutex *m_singleton;
 };
 
+struct PFS_rwlock;
+
 /** Instrumentation metadata for a RWLOCK. */
 struct PFS_rwlock_class : public PFS_instr_class
 {
   /**
-    Read lock statistics chain.
+    Read lock statistics.
     This statistic is not exposed in user visible tables yet.
   */
-  PFS_single_stat_chain m_read_lock_stat;
+  PFS_single_stat m_read_lock_stat;
   /**
-    Write lock statistics chain.
+    Write lock statistics.
     This statistic is not exposed in user visible tables yet.
   */
-  PFS_single_stat_chain m_write_lock_stat;
-  /** Self index in @c rwlock_class_array. */
-  uint m_index;
+  PFS_single_stat m_write_lock_stat;
+  /** Singleton instance. */
+  PFS_rwlock *m_singleton;
 };
 
+struct PFS_cond;
+
 /** Instrumentation metadata for a COND. */
 struct PFS_cond_class : public PFS_instr_class
 {
@@ -113,19 +206,21 @@ struct PFS_cond_class : public PFS_instr_class
     This statistic is not exposed in user visible tables yet.
   */
   PFS_cond_stat m_cond_stat;
-  /** Self index in @c cond_class_array. */
-  uint m_index;
+  /** Singleton instance. */
+  PFS_cond *m_singleton;
 };
 
 /** Instrumentation metadata of a thread. */
 struct PFS_thread_class
 {
+  /** True if this thread instrument is enabled. */
+  bool m_enabled;
+  /** Singleton instance. */
+  PFS_thread *m_singleton;
   /** Thread instrument name. */
   char m_name[PFS_MAX_INFO_NAME_LENGTH];
   /** Length in bytes of @c m_name. */
   uint m_name_length;
-  /** True if this thread instrument is enabled. */
-  bool m_enabled;
 };
 
 #define PFS_TABLESHARE_HASHKEY_SIZE (NAME_LEN + 1 + NAME_LEN + 1)
@@ -136,7 +231,7 @@ struct PFS_table_share_key
   /**
     Hash search key.
     This has to be a string for LF_HASH,
-    the format is "<schema_name><0x00><object_name><0x00>"
+    the format is "<enum_object_type><schema_name><0x00><object_name><0x00>"
     @see create_table_def_key
   */
   char m_hash_key[PFS_TABLESHARE_HASHKEY_SIZE];
@@ -144,11 +239,70 @@ struct PFS_table_share_key
   uint m_key_length;
 };
 
+/** Table index or 'key' */
+struct PFS_table_key
+{
+  /** Index name */
+  char m_name[NAME_LEN];
+  /** Length in bytes of @c m_name. */
+  uint m_name_length;
+};
+
 /** Instrumentation metadata for a table share. */
 struct PFS_table_share
 {
+public:
+  uint32 get_version()
+  { return m_lock.get_version(); }
+
+  enum_object_type get_object_type()
+  {
+    return (enum_object_type) m_key.m_hash_key[0];
+  }
+
+  void aggregate_io(void);
+  void aggregate_lock(void);
+
+  inline void aggregate(void)
+  {
+    aggregate_io();
+    aggregate_lock();
+  }
+
+  inline void init_refcount(void)
+  {
+    PFS_atomic::store_32(& m_refcount, 1);
+  }
+
+  inline int get_refcount(void)
+  {
+    return PFS_atomic::load_32(& m_refcount);
+  }
+
+  inline void inc_refcount(void)
+  {
+    PFS_atomic::add_32(& m_refcount, 1);
+  }
+
+  inline void dec_refcount(void)
+  {
+    PFS_atomic::add_32(& m_refcount, -1);
+  }
+
+  void refresh_setup_object_flags(PFS_thread *thread);
+
   /** Internal lock. */
   pfs_lock m_lock;
+  /**
+    True if table instrumentation is enabled.
+    This flag is computed from the content of table setup_objects.
+  */
+  bool m_enabled;
+  /**
+    True if table instrumentation is timed.
+    This flag is computed from the content of table setup_objects.
+  */
+  bool m_timed;
   /** Search key. */
   PFS_table_share_key m_key;
   /** Schema name. */
@@ -159,32 +313,73 @@ struct PFS_table_share
   const char *m_table_name;
   /** Length in bytes of @c m_table_name. */
   uint m_table_name_length;
-  /** Wait statistics chain. */
-  PFS_single_stat_chain m_wait_stat;
-  /** True if this table instrument is enabled. */
-  bool m_enabled;
-  /** True if this table instrument is timed. */
-  bool m_timed;
-  /** True if this table instrument is aggregated. */
-  bool m_aggregated;
+  /** Number of indexes. */
+  uint m_key_count;
+  /** Table statistics. */
+  PFS_table_stat m_table_stat;
+  /** Index names. */
+  PFS_table_key m_keys[MAX_KEY];
+
+private:
+  /** Number of opened table handles. */
+  int m_refcount;
 };
 
 /**
-  Instrument controlling all tables.
-  This instrument is used as a default when there is no
-  entry present in SETUP_OBJECTS.
+  Instrument controlling all table io.
+  This instrument is used with table SETUP_OBJECTS.
 */
-extern PFS_instr_class global_table_class;
+extern PFS_instr_class global_table_io_class;
+
+/**
+  Instrument controlling all table lock.
+  This instrument is used with table SETUP_OBJECTS.
+*/
+extern PFS_instr_class global_table_lock_class;
+
+/**
+  Instrument controlling all idle waits.
+*/
+extern PFS_instr_class global_idle_class;
+
+struct PFS_file;
 
 /** Instrumentation metadata for a file. */
 struct PFS_file_class : public PFS_instr_class
 {
   /** File usage statistics. */
   PFS_file_stat m_file_stat;
-  /** Self index in @c file_class_array. */
-  uint m_index;
+  /** Singleton instance. */
+  PFS_file *m_singleton;
 };
 
+/** Instrumentation metadata for a stage. */
+struct PFS_stage_class : public PFS_instr_class
+{
+  /** Stage usage statistics. */
+  PFS_stage_stat m_stage_stat;
+};
+
+/** Instrumentation metadata for a statement. */
+struct PFS_statement_class : public PFS_instr_class
+{
+};
+
+struct  PFS_socket;
+
+/** Instrumentation metadata for a socket. */
+struct PFS_socket_class : public PFS_instr_class
+{
+  /** Socket usage statistics. */
+  PFS_socket_stat m_socket_stat;
+  /** Singleton instance. */
+  PFS_socket *m_singleton;
+};
+
+void init_event_name_sizing(const PFS_global_param *param);
+
+void register_global_classes();
+
 int init_sync_class(uint mutex_class_sizing,
                     uint rwlock_class_sizing,
                     uint cond_class_sizing);
@@ -198,6 +393,12 @@ int init_table_share_hash();
 void cleanup_table_share_hash();
 int init_file_class(uint file_class_sizing);
 void cleanup_file_class();
+int init_stage_class(uint stage_class_sizing);
+void cleanup_stage_class();
+int init_statement_class(uint statement_class_sizing);
+void cleanup_statement_class();
+int init_socket_class(uint socket_class_sizing);
+void cleanup_socket_class();
 
 PFS_sync_key register_mutex_class(const char *name, uint name_length,
                                   int flags);
@@ -214,6 +415,15 @@ PFS_thread_key register_thread_class(const char *name, uint name_length,
 PFS_file_key register_file_class(const char *name, uint name_length,
                                  int flags);
 
+PFS_stage_key register_stage_class(const char *name, uint name_length,
+                                   int flags);
+
+PFS_statement_key register_statement_class(const char *name, uint name_length,
+                                           int flags);
+
+PFS_socket_key register_socket_class(const char *name, uint name_length,
+                                     int flags);
+
 PFS_mutex_class *find_mutex_class(PSI_mutex_key key);
 PFS_mutex_class *sanitize_mutex_class(PFS_mutex_class *unsafe);
 PFS_rwlock_class *find_rwlock_class(PSI_rwlock_key key);
@@ -224,14 +434,25 @@ PFS_thread_class *find_thread_class(PSI_thread_key key);
 PFS_thread_class *sanitize_thread_class(PFS_thread_class *unsafe);
 PFS_file_class *find_file_class(PSI_file_key key);
 PFS_file_class *sanitize_file_class(PFS_file_class *unsafe);
-const char *sanitize_table_schema_name(const char *unsafe);
-const char *sanitize_table_object_name(const char *unsafe);
+PFS_stage_class *find_stage_class(PSI_stage_key key);
+PFS_stage_class *sanitize_stage_class(PFS_stage_class *unsafe);
+PFS_statement_class *find_statement_class(PSI_statement_key key);
+PFS_statement_class *sanitize_statement_class(PFS_statement_class *unsafe);
+PFS_instr_class *find_table_class(uint index);
+PFS_instr_class *sanitize_table_class(PFS_instr_class *unsafe);
+PFS_socket_class *find_socket_class(PSI_socket_key key);
+PFS_socket_class *sanitize_socket_class(PFS_socket_class *unsafe);
+PFS_instr_class *find_idle_class(uint index);
+PFS_instr_class *sanitize_idle_class(PFS_instr_class *unsafe);
 
 PFS_table_share *find_or_create_table_share(PFS_thread *thread,
-                                            const char *schema_name,
-                                            uint schema_name_length,
-                                            const char *table_name,
-                                            uint table_name_length);
+                                            bool temporary,
+                                            const TABLE_SHARE *share);
+void release_table_share(PFS_table_share *pfs);
+void drop_table_share(PFS_thread *thread,
+                      bool temporary,
+                      const char *schema_name, uint schema_name_length,
+                      const char *table_name, uint table_name_length);
 
 PFS_table_share *sanitize_table_share(PFS_table_share *unsafe);
 
@@ -245,12 +466,28 @@ extern ulong thread_class_max;
 extern ulong thread_class_lost;
 extern ulong file_class_max;
 extern ulong file_class_lost;
+extern ulong stage_class_max;
+extern ulong stage_class_lost;
+extern ulong statement_class_max;
+extern ulong statement_class_lost;
+extern ulong socket_class_max;
+extern ulong socket_class_lost;
 extern ulong table_share_max;
 extern ulong table_share_lost;
+
+/* Exposing the data directly, for iterators. */
+
+extern PFS_mutex_class *mutex_class_array;
+extern PFS_rwlock_class *rwlock_class_array;
+extern PFS_cond_class *cond_class_array;
+extern PFS_file_class *file_class_array;
 extern PFS_table_share *table_share_array;
 
-void reset_instrument_class_waits();
 void reset_file_class_io();
+void reset_socket_class_io();
+
+/** Update derived flags for all table shares. */
+void update_table_share_derived_flags(PFS_thread *thread);
 
 /** @} */
 #endif
diff --git a/storage/perfschema/pfs_lex_token.h b/storage/perfschema/pfs_lex_token.h
new file mode 100644
index 00000000000..6074570f56f
--- /dev/null
+++ b/storage/perfschema/pfs_lex_token.h
@@ -0,0 +1,906 @@
+/*
+Copyright (c) 2011, 2012, Oracle, Monty Program Ab and others.
+
+*/
+/*
+  This file is generated, do not edit.
+  See file storage/perfschema/gen_pfs_lex_token.cc.
+*/
+struct lex_token_string
+{
+  const char *m_token_string;
+  int m_token_length;
+};
+typedef struct lex_token_string lex_token_string;
+lex_token_string lex_token_array[]=
+{
+/* PART 1: character tokens. */
+/* 000 */  { "\x00", 1},
+/* 001 */  { "\x01", 1},
+/* 002 */  { "\x02", 1},
+/* 003 */  { "\x03", 1},
+/* 004 */  { "\x04", 1},
+/* 005 */  { "\x05", 1},
+/* 006 */  { "\x06", 1},
+/* 007 */  { "\x07", 1},
+/* 008 */  { "\x08", 1},
+/* 009 */  { "\x09", 1},
+/* 010 */  { "\x0a", 1},
+/* 011 */  { "\x0b", 1},
+/* 012 */  { "\x0c", 1},
+/* 013 */  { "\x0d", 1},
+/* 014 */  { "\x0e", 1},
+/* 015 */  { "\x0f", 1},
+/* 016 */  { "\x10", 1},
+/* 017 */  { "\x11", 1},
+/* 018 */  { "\x12", 1},
+/* 019 */  { "\x13", 1},
+/* 020 */  { "\x14", 1},
+/* 021 */  { "\x15", 1},
+/* 022 */  { "\x16", 1},
+/* 023 */  { "\x17", 1},
+/* 024 */  { "\x18", 1},
+/* 025 */  { "\x19", 1},
+/* 026 */  { "\x1a", 1},
+/* 027 */  { "\x1b", 1},
+/* 028 */  { "\x1c", 1},
+/* 029 */  { "\x1d", 1},
+/* 030 */  { "\x1e", 1},
+/* 031 */  { "\x1f", 1},
+/* 032 */  { "\x20", 1},
+/* 033 */  { "\x21", 1},
+/* 034 */  { "\x22", 1},
+/* 035 */  { "\x23", 1},
+/* 036 */  { "\x24", 1},
+/* 037 */  { "\x25", 1},
+/* 038 */  { "\x26", 1},
+/* 039 */  { "\x27", 1},
+/* 040 */  { "\x28", 1},
+/* 041 */  { "\x29", 1},
+/* 042 */  { "\x2a", 1},
+/* 043 */  { "\x2b", 1},
+/* 044 */  { "\x2c", 1},
+/* 045 */  { "\x2d", 1},
+/* 046 */  { "\x2e", 1},
+/* 047 */  { "\x2f", 1},
+/* 048 */  { "\x30", 1},
+/* 049 */  { "\x31", 1},
+/* 050 */  { "\x32", 1},
+/* 051 */  { "\x33", 1},
+/* 052 */  { "\x34", 1},
+/* 053 */  { "\x35", 1},
+/* 054 */  { "\x36", 1},
+/* 055 */  { "\x37", 1},
+/* 056 */  { "\x38", 1},
+/* 057 */  { "\x39", 1},
+/* 058 */  { "\x3a", 1},
+/* 059 */  { "\x3b", 1},
+/* 060 */  { "\x3c", 1},
+/* 061 */  { "\x3d", 1},
+/* 062 */  { "\x3e", 1},
+/* 063 */  { "\x3f", 1},
+/* 064 */  { "\x40", 1},
+/* 065 */  { "\x41", 1},
+/* 066 */  { "\x42", 1},
+/* 067 */  { "\x43", 1},
+/* 068 */  { "\x44", 1},
+/* 069 */  { "\x45", 1},
+/* 070 */  { "\x46", 1},
+/* 071 */  { "\x47", 1},
+/* 072 */  { "\x48", 1},
+/* 073 */  { "\x49", 1},
+/* 074 */  { "\x4a", 1},
+/* 075 */  { "\x4b", 1},
+/* 076 */  { "\x4c", 1},
+/* 077 */  { "\x4d", 1},
+/* 078 */  { "\x4e", 1},
+/* 079 */  { "\x4f", 1},
+/* 080 */  { "\x50", 1},
+/* 081 */  { "\x51", 1},
+/* 082 */  { "\x52", 1},
+/* 083 */  { "\x53", 1},
+/* 084 */  { "\x54", 1},
+/* 085 */  { "\x55", 1},
+/* 086 */  { "\x56", 1},
+/* 087 */  { "\x57", 1},
+/* 088 */  { "\x58", 1},
+/* 089 */  { "\x59", 1},
+/* 090 */  { "\x5a", 1},
+/* 091 */  { "\x5b", 1},
+/* 092 */  { "\x5c", 1},
+/* 093 */  { "\x5d", 1},
+/* 094 */  { "\x5e", 1},
+/* 095 */  { "\x5f", 1},
+/* 096 */  { "\x60", 1},
+/* 097 */  { "\x61", 1},
+/* 098 */  { "\x62", 1},
+/* 099 */  { "\x63", 1},
+/* 100 */  { "\x64", 1},
+/* 101 */  { "\x65", 1},
+/* 102 */  { "\x66", 1},
+/* 103 */  { "\x67", 1},
+/* 104 */  { "\x68", 1},
+/* 105 */  { "\x69", 1},
+/* 106 */  { "\x6a", 1},
+/* 107 */  { "\x6b", 1},
+/* 108 */  { "\x6c", 1},
+/* 109 */  { "\x6d", 1},
+/* 110 */  { "\x6e", 1},
+/* 111 */  { "\x6f", 1},
+/* 112 */  { "\x70", 1},
+/* 113 */  { "\x71", 1},
+/* 114 */  { "\x72", 1},
+/* 115 */  { "\x73", 1},
+/* 116 */  { "\x74", 1},
+/* 117 */  { "\x75", 1},
+/* 118 */  { "\x76", 1},
+/* 119 */  { "\x77", 1},
+/* 120 */  { "\x78", 1},
+/* 121 */  { "\x79", 1},
+/* 122 */  { "\x7a", 1},
+/* 123 */  { "\x7b", 1},
+/* 124 */  { "\x7c", 1},
+/* 125 */  { "\x7d", 1},
+/* 126 */  { "\x7e", 1},
+/* 127 */  { "\x7f", 1},
+/* 128 */  { "\x80", 1},
+/* 129 */  { "\x81", 1},
+/* 130 */  { "\x82", 1},
+/* 131 */  { "\x83", 1},
+/* 132 */  { "\x84", 1},
+/* 133 */  { "\x85", 1},
+/* 134 */  { "\x86", 1},
+/* 135 */  { "\x87", 1},
+/* 136 */  { "\x88", 1},
+/* 137 */  { "\x89", 1},
+/* 138 */  { "\x8a", 1},
+/* 139 */  { "\x8b", 1},
+/* 140 */  { "\x8c", 1},
+/* 141 */  { "\x8d", 1},
+/* 142 */  { "\x8e", 1},
+/* 143 */  { "\x8f", 1},
+/* 144 */  { "\x90", 1},
+/* 145 */  { "\x91", 1},
+/* 146 */  { "\x92", 1},
+/* 147 */  { "\x93", 1},
+/* 148 */  { "\x94", 1},
+/* 149 */  { "\x95", 1},
+/* 150 */  { "\x96", 1},
+/* 151 */  { "\x97", 1},
+/* 152 */  { "\x98", 1},
+/* 153 */  { "\x99", 1},
+/* 154 */  { "\x9a", 1},
+/* 155 */  { "\x9b", 1},
+/* 156 */  { "\x9c", 1},
+/* 157 */  { "\x9d", 1},
+/* 158 */  { "\x9e", 1},
+/* 159 */  { "\x9f", 1},
+/* 160 */  { "\xa0", 1},
+/* 161 */  { "\xa1", 1},
+/* 162 */  { "\xa2", 1},
+/* 163 */  { "\xa3", 1},
+/* 164 */  { "\xa4", 1},
+/* 165 */  { "\xa5", 1},
+/* 166 */  { "\xa6", 1},
+/* 167 */  { "\xa7", 1},
+/* 168 */  { "\xa8", 1},
+/* 169 */  { "\xa9", 1},
+/* 170 */  { "\xaa", 1},
+/* 171 */  { "\xab", 1},
+/* 172 */  { "\xac", 1},
+/* 173 */  { "\xad", 1},
+/* 174 */  { "\xae", 1},
+/* 175 */  { "\xaf", 1},
+/* 176 */  { "\xb0", 1},
+/* 177 */  { "\xb1", 1},
+/* 178 */  { "\xb2", 1},
+/* 179 */  { "\xb3", 1},
+/* 180 */  { "\xb4", 1},
+/* 181 */  { "\xb5", 1},
+/* 182 */  { "\xb6", 1},
+/* 183 */  { "\xb7", 1},
+/* 184 */  { "\xb8", 1},
+/* 185 */  { "\xb9", 1},
+/* 186 */  { "\xba", 1},
+/* 187 */  { "\xbb", 1},
+/* 188 */  { "\xbc", 1},
+/* 189 */  { "\xbd", 1},
+/* 190 */  { "\xbe", 1},
+/* 191 */  { "\xbf", 1},
+/* 192 */  { "\xc0", 1},
+/* 193 */  { "\xc1", 1},
+/* 194 */  { "\xc2", 1},
+/* 195 */  { "\xc3", 1},
+/* 196 */  { "\xc4", 1},
+/* 197 */  { "\xc5", 1},
+/* 198 */  { "\xc6", 1},
+/* 199 */  { "\xc7", 1},
+/* 200 */  { "\xc8", 1},
+/* 201 */  { "\xc9", 1},
+/* 202 */  { "\xca", 1},
+/* 203 */  { "\xcb", 1},
+/* 204 */  { "\xcc", 1},
+/* 205 */  { "\xcd", 1},
+/* 206 */  { "\xce", 1},
+/* 207 */  { "\xcf", 1},
+/* 208 */  { "\xd0", 1},
+/* 209 */  { "\xd1", 1},
+/* 210 */  { "\xd2", 1},
+/* 211 */  { "\xd3", 1},
+/* 212 */  { "\xd4", 1},
+/* 213 */  { "\xd5", 1},
+/* 214 */  { "\xd6", 1},
+/* 215 */  { "\xd7", 1},
+/* 216 */  { "\xd8", 1},
+/* 217 */  { "\xd9", 1},
+/* 218 */  { "\xda", 1},
+/* 219 */  { "\xdb", 1},
+/* 220 */  { "\xdc", 1},
+/* 221 */  { "\xdd", 1},
+/* 222 */  { "\xde", 1},
+/* 223 */  { "\xdf", 1},
+/* 224 */  { "\xe0", 1},
+/* 225 */  { "\xe1", 1},
+/* 226 */  { "\xe2", 1},
+/* 227 */  { "\xe3", 1},
+/* 228 */  { "\xe4", 1},
+/* 229 */  { "\xe5", 1},
+/* 230 */  { "\xe6", 1},
+/* 231 */  { "\xe7", 1},
+/* 232 */  { "\xe8", 1},
+/* 233 */  { "\xe9", 1},
+/* 234 */  { "\xea", 1},
+/* 235 */  { "\xeb", 1},
+/* 236 */  { "\xec", 1},
+/* 237 */  { "\xed", 1},
+/* 238 */  { "\xee", 1},
+/* 239 */  { "\xef", 1},
+/* 240 */  { "\xf0", 1},
+/* 241 */  { "\xf1", 1},
+/* 242 */  { "\xf2", 1},
+/* 243 */  { "\xf3", 1},
+/* 244 */  { "\xf4", 1},
+/* 245 */  { "\xf5", 1},
+/* 246 */  { "\xf6", 1},
+/* 247 */  { "\xf7", 1},
+/* 248 */  { "\xf8", 1},
+/* 249 */  { "\xf9", 1},
+/* 250 */  { "\xfa", 1},
+/* 251 */  { "\xfb", 1},
+/* 252 */  { "\xfc", 1},
+/* 253 */  { "\xfd", 1},
+/* 254 */  { "\xfe", 1},
+/* 255 */  { "\xff", 1},
+/* PART 2: named tokens. */
+/* 256 */  { "(unknown)", 9},
+/* 257 */  { "(unknown)", 9},
+/* 258 */  { "(unknown)", 9},
+/* 259 */  { "ACCESSIBLE", 10},
+/* 260 */  { "ACTION", 6},
+/* 261 */  { "ADD", 3},
+/* 262 */  { "ADDDATE", 7},
+/* 263 */  { "AFTER", 5},
+/* 264 */  { "AGAINST", 7},
+/* 265 */  { "AGGREGATE", 9},
+/* 266 */  { "ALGORITHM", 9},
+/* 267 */  { "ALL", 3},
+/* 268 */  { "ALTER", 5},
+/* 269 */  { "ALWAYS", 6},
+/* 270 */  { "ANALYZE", 7},
+/* 271 */  { "&&", 2},
+/* 272 */  { "AND", 3},
+/* 273 */  { "SOME", 4},
+/* 274 */  { "AS", 2},
+/* 275 */  { "ASC", 3},
+/* 276 */  { "ASCII", 5},
+/* 277 */  { "ASENSITIVE", 10},
+/* 278 */  { "AT", 2},
+/* 279 */  { "AUTHORS", 7},
+/* 280 */  { "AUTOEXTEND_SIZE", 15},
+/* 281 */  { "AUTO_INCREMENT", 14},
+/* 282 */  { "AVG_ROW_LENGTH", 14},
+/* 283 */  { "AVG", 3},
+/* 284 */  { "BACKUP", 6},
+/* 285 */  { "BEFORE", 6},
+/* 286 */  { "BEGIN", 5},
+/* 287 */  { "BETWEEN", 7},
+/* 288 */  { "INT8", 4},
+/* 289 */  { "BINARY", 6},
+/* 290 */  { "BINLOG", 6},
+/* 291 */  { "(bin)", 5},
+/* 292 */  { "BIT_AND", 7},
+/* 293 */  { "BIT_OR", 6},
+/* 294 */  { "BIT", 3},
+/* 295 */  { "BIT_XOR", 7},
+/* 296 */  { "BLOB", 4},
+/* 297 */  { "BLOCK", 5},
+/* 298 */  { "BOOLEAN", 7},
+/* 299 */  { "BOOL", 4},
+/* 300 */  { "BOTH", 4},
+/* 301 */  { "BTREE", 5},
+/* 302 */  { "BY", 2},
+/* 303 */  { "BYTE", 4},
+/* 304 */  { "CACHE", 5},
+/* 305 */  { "CALL", 4},
+/* 306 */  { "CASCADE", 7},
+/* 307 */  { "CASCADED", 8},
+/* 308 */  { "CASE", 4},
+/* 309 */  { "CAST", 4},
+/* 310 */  { "CATALOG_NAME", 12},
+/* 311 */  { "CHAIN", 5},
+/* 312 */  { "CHANGE", 6},
+/* 313 */  { "CHANGED", 7},
+/* 314 */  { "CHARSET", 7},
+/* 315 */  { "CHARACTER", 9},
+/* 316 */  { "CHECKPOINT", 10},
+/* 317 */  { "CHECKSUM", 8},
+/* 318 */  { "CHECK", 5},
+/* 319 */  { "CIPHER", 6},
+/* 320 */  { "CLASS_ORIGIN", 12},
+/* 321 */  { "CLIENT", 6},
+/* 322 */  { "CLIENT_STATISTICS", 17},
+/* 323 */  { "CLOSE", 5},
+/* 324 */  { "COALESCE", 8},
+/* 325 */  { "CODE", 4},
+/* 326 */  { "COLLATE", 7},
+/* 327 */  { "COLLATION", 9},
+/* 328 */  { "FIELDS", 6},
+/* 329 */  { "COLUMN_ADD", 10},
+/* 330 */  { "COLUMN_CREATE", 13},
+/* 331 */  { "COLUMN_DELETE", 13},
+/* 332 */  { "COLUMN_EXISTS", 13},
+/* 333 */  { "COLUMN_GET", 10},
+/* 334 */  { "COLUMN_LIST", 11},
+/* 335 */  { "COLUMN", 6},
+/* 336 */  { "COLUMN_NAME", 11},
+/* 337 */  { "COMMENT", 7},
+/* 338 */  { "COMMITTED", 9},
+/* 339 */  { "COMMIT", 6},
+/* 340 */  { "COMPACT", 7},
+/* 341 */  { "COMPLETION", 10},
+/* 342 */  { "COMPRESSED", 10},
+/* 343 */  { "CONCURRENT", 10},
+/* 344 */  { "CONDITION", 9},
+/* 345 */  { "CONNECTION", 10},
+/* 346 */  { "CONSISTENT", 10},
+/* 347 */  { "CONSTRAINT", 10},
+/* 348 */  { "CONSTRAINT_CATALOG", 18},
+/* 349 */  { "CONSTRAINT_NAME", 15},
+/* 350 */  { "CONSTRAINT_SCHEMA", 17},
+/* 351 */  { "CONTAINS", 8},
+/* 352 */  { "CONTEXT", 7},
+/* 353 */  { "CONTINUE", 8},
+/* 354 */  { "CONTRIBUTORS", 12},
+/* 355 */  { "CONVERT", 7},
+/* 356 */  { "COUNT", 5},
+/* 357 */  { "CPU", 3},
+/* 358 */  { "CREATE", 6},
+/* 359 */  { "CROSS", 5},
+/* 360 */  { "CUBE", 4},
+/* 361 */  { "CURDATE", 7},
+/* 362 */  { "CURRENT_USER", 12},
+/* 363 */  { "CURSOR", 6},
+/* 364 */  { "CURSOR_NAME", 11},
+/* 365 */  { "CURTIME", 7},
+/* 366 */  { "SCHEMA", 6},
+/* 367 */  { "SCHEMAS", 7},
+/* 368 */  { "DATAFILE", 8},
+/* 369 */  { "DATA", 4},
+/* 370 */  { "DATETIME", 8},
+/* 371 */  { "DATE_ADD", 8},
+/* 372 */  { "DATE_SUB", 8},
+/* 373 */  { "DATE", 4},
+/* 374 */  { "DAY_HOUR", 8},
+/* 375 */  { "DAY_MICROSECOND", 15},
+/* 376 */  { "DAY_MINUTE", 10},
+/* 377 */  { "DAY_SECOND", 10},
+/* 378 */  { "SQL_TSI_DAY", 11},
+/* 379 */  { "DEALLOCATE", 10},
+/* 380 */  { "(decimal)", 9},
+/* 381 */  { "DECIMAL", 7},
+/* 382 */  { "DECLARE", 7},
+/* 383 */  { "DEFAULT", 7},
+/* 384 */  { "DEFINER", 7},
+/* 385 */  { "DELAYED", 7},
+/* 386 */  { "DELAY_KEY_WRITE", 15},
+/* 387 */  { "DELETE", 6},
+/* 388 */  { "DESC", 4},
+/* 389 */  { "EXPLAIN", 7},
+/* 390 */  { "DES_KEY_FILE", 12},
+/* 391 */  { "DETERMINISTIC", 13},
+/* 392 */  { "DIRECTORY", 9},
+/* 393 */  { "DISABLE", 7},
+/* 394 */  { "DISCARD", 7},
+/* 395 */  { "DISK", 4},
+/* 396 */  { "DISTINCTROW", 11},
+/* 397 */  { "DIV", 3},
+/* 398 */  { "FLOAT8", 6},
+/* 399 */  { "DO", 2},
+/* 400 */  { "DROP", 4},
+/* 401 */  { "DUAL", 4},
+/* 402 */  { "DUMPFILE", 8},
+/* 403 */  { "DUPLICATE", 9},
+/* 404 */  { "DYNAMIC", 7},
+/* 405 */  { "EACH", 4},
+/* 406 */  { "ELSE", 4},
+/* 407 */  { "ELSEIF", 6},
+/* 408 */  { "ENABLE", 6},
+/* 409 */  { "ENCLOSED", 8},
+/* 410 */  { "END", 3},
+/* 411 */  { "ENDS", 4},
+/* 412 */  { "", 0},
+/* 413 */  { "ENGINES", 7},
+/* 414 */  { "ENGINE", 6},
+/* 415 */  { "ENUM", 4},
+/* 416 */  { "=", 1},
+/* 417 */  { "<=>", 3},
+/* 418 */  { "ERROR", 5},
+/* 419 */  { "ERRORS", 6},
+/* 420 */  { "ESCAPED", 7},
+/* 421 */  { "ESCAPE", 6},
+/* 422 */  { "EVENTS", 6},
+/* 423 */  { "EVENT", 5},
+/* 424 */  { "EVERY", 5},
+/* 425 */  { "EXAMINED", 8},
+/* 426 */  { "EXECUTE", 7},
+/* 427 */  { "EXISTS", 6},
+/* 428 */  { "EXIT", 4},
+/* 429 */  { "EXPANSION", 9},
+/* 430 */  { "EXTENDED", 8},
+/* 431 */  { "EXTENT_SIZE", 11},
+/* 432 */  { "EXTRACT", 7},
+/* 433 */  { "FALSE", 5},
+/* 434 */  { "FAST", 4},
+/* 435 */  { "FAULTS", 6},
+/* 436 */  { "FETCH", 5},
+/* 437 */  { "FILE", 4},
+/* 438 */  { "FIRST", 5},
+/* 439 */  { "FIXED", 5},
+/* 440 */  { "(float)", 7},
+/* 441 */  { "FLOAT4", 6},
+/* 442 */  { "FLUSH", 5},
+/* 443 */  { "FORCE", 5},
+/* 444 */  { "FOREIGN", 7},
+/* 445 */  { "FOR", 3},
+/* 446 */  { "FOUND", 5},
+/* 447 */  { "FROM", 4},
+/* 448 */  { "FULL", 4},
+/* 449 */  { "FULLTEXT", 8},
+/* 450 */  { "FUNCTION", 8},
+/* 451 */  { ">=", 2},
+/* 452 */  { "GENERAL", 7},
+/* 453 */  { "GENERATED", 9},
+/* 454 */  { "GEOMETRYCOLLECTION", 18},
+/* 455 */  { "GEOMETRY", 8},
+/* 456 */  { "GET_FORMAT", 10},
+/* 457 */  { "GLOBAL", 6},
+/* 458 */  { "GRANT", 5},
+/* 459 */  { "GRANTS", 6},
+/* 460 */  { "GROUP", 5},
+/* 461 */  { "GROUP_CONCAT", 12},
+/* 462 */  { ">", 1},
+/* 463 */  { "HANDLER", 7},
+/* 464 */  { "HARD", 4},
+/* 465 */  { "HASH", 4},
+/* 466 */  { "HAVING", 6},
+/* 467 */  { "HELP", 4},
+/* 468 */  { "(hex)", 5},
+/* 469 */  { "HIGH_PRIORITY", 13},
+/* 470 */  { "HOST", 4},
+/* 471 */  { "HOSTS", 5},
+/* 472 */  { "HOUR_MICROSECOND", 16},
+/* 473 */  { "HOUR_MINUTE", 11},
+/* 474 */  { "HOUR_SECOND", 11},
+/* 475 */  { "SQL_TSI_HOUR", 12},
+/* 476 */  { "(id)", 4},
+/* 477 */  { "IDENTIFIED", 10},
+/* 478 */  { "(id_quoted)", 11},
+/* 479 */  { "IF", 2},
+/* 480 */  { "IGNORE", 6},
+/* 481 */  { "IGNORE_SERVER_IDS", 17},
+/* 482 */  { "IMPORT", 6},
+/* 483 */  { "INDEXES", 7},
+/* 484 */  { "INDEX", 5},
+/* 485 */  { "INDEX_STATISTICS", 16},
+/* 486 */  { "INFILE", 6},
+/* 487 */  { "INITIAL_SIZE", 12},
+/* 488 */  { "INNER", 5},
+/* 489 */  { "INOUT", 5},
+/* 490 */  { "INSENSITIVE", 11},
+/* 491 */  { "INSERT", 6},
+/* 492 */  { "INSERT_METHOD", 13},
+/* 493 */  { "INSTALL", 7},
+/* 494 */  { "INTERVAL", 8},
+/* 495 */  { "INTO", 4},
+/* 496 */  { "INTEGER", 7},
+/* 497 */  { "INVOKER", 7},
+/* 498 */  { "IN", 2},
+/* 499 */  { "IO", 2},
+/* 500 */  { "IPC", 3},
+/* 501 */  { "IS", 2},
+/* 502 */  { "ISOLATION", 9},
+/* 503 */  { "ISSUER", 6},
+/* 504 */  { "ITERATE", 7},
+/* 505 */  { "JOIN", 4},
+/* 506 */  { "KEYS", 4},
+/* 507 */  { "KEY_BLOCK_SIZE", 14},
+/* 508 */  { "KEY", 3},
+/* 509 */  { "KILL", 4},
+/* 510 */  { "LANGUAGE", 8},
+/* 511 */  { "LAST", 4},
+/* 512 */  { "<=", 2},
+/* 513 */  { "LEADING", 7},
+/* 514 */  { "LEAVES", 6},
+/* 515 */  { "LEAVE", 5},
+/* 516 */  { "LEFT", 4},
+/* 517 */  { "LESS", 4},
+/* 518 */  { "LEVEL", 5},
+/* 519 */  { "(hostname)", 10},
+/* 520 */  { "LIKE", 4},
+/* 521 */  { "LIMIT", 5},
+/* 522 */  { "LINEAR", 6},
+/* 523 */  { "LINES", 5},
+/* 524 */  { "LINESTRING", 10},
+/* 525 */  { "LIST", 4},
+/* 526 */  { "LOAD", 4},
+/* 527 */  { "LOCAL", 5},
+/* 528 */  { "LOCATOR", 7},
+/* 529 */  { "LOCKS", 5},
+/* 530 */  { "LOCK", 4},
+/* 531 */  { "LOGFILE", 7},
+/* 532 */  { "LOGS", 4},
+/* 533 */  { "LONGBLOB", 8},
+/* 534 */  { "LONGTEXT", 8},
+/* 535 */  { "(long)", 6},
+/* 536 */  { "LONG", 4},
+/* 537 */  { "LOOP", 4},
+/* 538 */  { "LOW_PRIORITY", 12},
+/* 539 */  { "<", 1},
+/* 540 */  { "MASTER_CONNECT_RETRY", 20},
+/* 541 */  { "MASTER_HOST", 11},
+/* 542 */  { "MASTER_LOG_FILE", 15},
+/* 543 */  { "MASTER_LOG_POS", 14},
+/* 544 */  { "MASTER_PASSWORD", 15},
+/* 545 */  { "MASTER_PORT", 11},
+/* 546 */  { "MASTER_SERVER_ID", 16},
+/* 547 */  { "MASTER_SSL_CAPATH", 17},
+/* 548 */  { "MASTER_SSL_CA", 13},
+/* 549 */  { "MASTER_SSL_CERT", 15},
+/* 550 */  { "MASTER_SSL_CIPHER", 17},
+/* 551 */  { "MASTER_SSL_KEY", 14},
+/* 552 */  { "MASTER_SSL", 10},
+/* 553 */  { "MASTER_SSL_VERIFY_SERVER_CERT", 29},
+/* 554 */  { "MASTER", 6},
+/* 555 */  { "MASTER_USER", 11},
+/* 556 */  { "MASTER_HEARTBEAT_PERIOD", 23},
+/* 557 */  { "MATCH", 5},
+/* 558 */  { "MAX_CONNECTIONS_PER_HOUR", 24},
+/* 559 */  { "MAX_QUERIES_PER_HOUR", 20},
+/* 560 */  { "MAX_ROWS", 8},
+/* 561 */  { "MAX_SIZE", 8},
+/* 562 */  { "MAX", 3},
+/* 563 */  { "MAX_UPDATES_PER_HOUR", 20},
+/* 564 */  { "MAX_USER_CONNECTIONS", 20},
+/* 565 */  { "MAXVALUE", 8},
+/* 566 */  { "MEDIUMBLOB", 10},
+/* 567 */  { "MIDDLEINT", 9},
+/* 568 */  { "MEDIUMTEXT", 10},
+/* 569 */  { "MEDIUM", 6},
+/* 570 */  { "MEMORY", 6},
+/* 571 */  { "MERGE", 5},
+/* 572 */  { "MESSAGE_TEXT", 12},
+/* 573 */  { "MICROSECOND", 11},
+/* 574 */  { "MIGRATE", 7},
+/* 575 */  { "MINUTE_MICROSECOND", 18},
+/* 576 */  { "MINUTE_SECOND", 13},
+/* 577 */  { "SQL_TSI_MINUTE", 14},
+/* 578 */  { "MIN_ROWS", 8},
+/* 579 */  { "MIN", 3},
+/* 580 */  { "MODE", 4},
+/* 581 */  { "MODIFIES", 8},
+/* 582 */  { "MODIFY", 6},
+/* 583 */  { "MOD", 3},
+/* 584 */  { "SQL_TSI_MONTH", 13},
+/* 585 */  { "MULTILINESTRING", 15},
+/* 586 */  { "MULTIPOINT", 10},
+/* 587 */  { "MULTIPOLYGON", 12},
+/* 588 */  { "MUTEX", 5},
+/* 589 */  { "MYSQL_ERRNO", 11},
+/* 590 */  { "NAMES", 5},
+/* 591 */  { "NAME", 4},
+/* 592 */  { "NATIONAL", 8},
+/* 593 */  { "NATURAL", 7},
+/* 594 */  { "(nchar)", 7},
+/* 595 */  { "NCHAR", 5},
+/* 596 */  { "NDBCLUSTER", 10},
+/* 597 */  { "!=", 2},
+/* 598 */  { "~", 1},
+/* 599 */  { "NEW", 3},
+/* 600 */  { "NEXT", 4},
+/* 601 */  { "NODEGROUP", 9},
+/* 602 */  { "NONE", 4},
+/* 603 */  { "!", 1},
+/* 604 */  { "NOT", 3},
+/* 605 */  { "NOW", 3},
+/* 606 */  { "NO", 2},
+/* 607 */  { "NO_WAIT", 7},
+/* 608 */  { "NO_WRITE_TO_BINLOG", 18},
+/* 609 */  { "NULL", 4},
+/* 610 */  { "(num)", 5},
+/* 611 */  { "NUMERIC", 7},
+/* 612 */  { "NVARCHAR", 8},
+/* 613 */  { "OFFSET", 6},
+/* 614 */  { "OLD_PASSWORD", 12},
+/* 615 */  { "ON", 2},
+/* 616 */  { "ONE_SHOT", 8},
+/* 617 */  { "ONE", 3},
+/* 618 */  { "ONLINE", 6},
+/* 619 */  { "OPEN", 4},
+/* 620 */  { "OPTIMIZE", 8},
+/* 621 */  { "OPTIONS", 7},
+/* 622 */  { "OPTION", 6},
+/* 623 */  { "OPTIONALLY", 10},
+/* 624 */  { "|", 1},
+/* 625 */  { "ORDER", 5},
+/* 626 */  { "||", 2},
+/* 627 */  { "OR", 2},
+/* 628 */  { "OUTER", 5},
+/* 629 */  { "OUTFILE", 7},
+/* 630 */  { "OUT", 3},
+/* 631 */  { "OWNER", 5},
+/* 632 */  { "PACK_KEYS", 9},
+/* 633 */  { "PAGE", 4},
+/* 634 */  { "PAGE_CHECKSUM", 13},
+/* 635 */  { "?", 1},
+/* 636 */  { "PARSER", 6},
+/* 637 */  { "PARSE_VCOL_EXPR", 15},
+/* 638 */  { "PARTIAL", 7},
+/* 639 */  { "PARTITIONING", 12},
+/* 640 */  { "PARTITIONS", 10},
+/* 641 */  { "PARTITION", 9},
+/* 642 */  { "PASSWORD", 8},
+/* 643 */  { "PERSISTENT", 10},
+/* 644 */  { "PHASE", 5},
+/* 645 */  { "PLUGINS", 7},
+/* 646 */  { "PLUGIN", 6},
+/* 647 */  { "POINT", 5},
+/* 648 */  { "POLYGON", 7},
+/* 649 */  { "PORT", 4},
+/* 650 */  { "POSITION", 8},
+/* 651 */  { "PRECISION", 9},
+/* 652 */  { "PREPARE", 7},
+/* 653 */  { "PRESERVE", 8},
+/* 654 */  { "PREV", 4},
+/* 655 */  { "PRIMARY", 7},
+/* 656 */  { "PRIVILEGES", 10},
+/* 657 */  { "PROCEDURE", 9},
+/* 658 */  { "PROCESS", 7},
+/* 659 */  { "PROCESSLIST", 11},
+/* 660 */  { "PROFILE", 7},
+/* 661 */  { "PROFILES", 8},
+/* 662 */  { "PROXY", 5},
+/* 663 */  { "PURGE", 5},
+/* 664 */  { "SQL_TSI_QUARTER", 15},
+/* 665 */  { "QUERY", 5},
+/* 666 */  { "QUICK", 5},
+/* 667 */  { "RANGE", 5},
+/* 668 */  { "READS", 5},
+/* 669 */  { "READ_ONLY", 9},
+/* 670 */  { "READ", 4},
+/* 671 */  { "READ_WRITE", 10},
+/* 672 */  { "REAL", 4},
+/* 673 */  { "REBUILD", 7},
+/* 674 */  { "RECOVER", 7},
+/* 675 */  { "REDOFILE", 8},
+/* 676 */  { "REDO_BUFFER_SIZE", 16},
+/* 677 */  { "REDUNDANT", 9},
+/* 678 */  { "REFERENCES", 10},
+/* 679 */  { "RLIKE", 5},
+/* 680 */  { "RELAY", 5},
+/* 681 */  { "RELAYLOG", 8},
+/* 682 */  { "RELAY_LOG_FILE", 14},
+/* 683 */  { "RELAY_LOG_POS", 13},
+/* 684 */  { "RELAY_THREAD", 12},
+/* 685 */  { "RELEASE", 7},
+/* 686 */  { "RELOAD", 6},
+/* 687 */  { "REMOVE", 6},
+/* 688 */  { "RENAME", 6},
+/* 689 */  { "REORGANIZE", 10},
+/* 690 */  { "REPAIR", 6},
+/* 691 */  { "REPEATABLE", 10},
+/* 692 */  { "REPEAT", 6},
+/* 693 */  { "REPLACE", 7},
+/* 694 */  { "REPLICATION", 11},
+/* 695 */  { "REQUIRE", 7},
+/* 696 */  { "RESET", 5},
+/* 697 */  { "RESIGNAL", 8},
+/* 698 */  { "USER_RESOURCES", 14},
+/* 699 */  { "RESTORE", 7},
+/* 700 */  { "RESTRICT", 8},
+/* 701 */  { "RESUME", 6},
+/* 702 */  { "RETURNS", 7},
+/* 703 */  { "RETURN", 6},
+/* 704 */  { "REVOKE", 6},
+/* 705 */  { "RIGHT", 5},
+/* 706 */  { "ROLLBACK", 8},
+/* 707 */  { "ROLLUP", 6},
+/* 708 */  { "ROUTINE", 7},
+/* 709 */  { "ROWS", 4},
+/* 710 */  { "ROW_FORMAT", 10},
+/* 711 */  { "ROW", 3},
+/* 712 */  { "RTREE", 5},
+/* 713 */  { "SAVEPOINT", 9},
+/* 714 */  { "SCHEDULE", 8},
+/* 715 */  { "SCHEMA_NAME", 11},
+/* 716 */  { "SECOND_MICROSECOND", 18},
+/* 717 */  { "SQL_TSI_SECOND", 14},
+/* 718 */  { "SECURITY", 8},
+/* 719 */  { "SELECT", 6},
+/* 720 */  { "SENSITIVE", 9},
+/* 721 */  { "SEPARATOR", 9},
+/* 722 */  { "SERIALIZABLE", 12},
+/* 723 */  { "SERIAL", 6},
+/* 724 */  { "SESSION", 7},
+/* 725 */  { "SERVER", 6},
+/* 726 */  { "SERVER_OPTIONS", 14},
+/* 727 */  { "SET", 3},
+/* 728 */  { ":=", 2},
+/* 729 */  { "SHARE", 5},
+/* 730 */  { "<<", 2},
+/* 731 */  { ">>", 2},
+/* 732 */  { "SHOW", 4},
+/* 733 */  { "SHUTDOWN", 8},
+/* 734 */  { "SIGNAL", 6},
+/* 735 */  { "SIGNED", 6},
+/* 736 */  { "SIMPLE", 6},
+/* 737 */  { "SLAVE", 5},
+/* 738 */  { "SLOW", 4},
+/* 739 */  { "SMALLINT", 8},
+/* 740 */  { "SNAPSHOT", 8},
+/* 741 */  { "SOCKET", 6},
+/* 742 */  { "SOFT", 4},
+/* 743 */  { "SONAME", 6},
+/* 744 */  { "SOUNDS", 6},
+/* 745 */  { "SOURCE", 6},
+/* 746 */  { "SPATIAL", 7},
+/* 747 */  { "SPECIFIC", 8},
+/* 748 */  { "SQLEXCEPTION", 12},
+/* 749 */  { "SQLSTATE", 8},
+/* 750 */  { "SQLWARNING", 10},
+/* 751 */  { "SQL_BIG_RESULT", 14},
+/* 752 */  { "SQL_BUFFER_RESULT", 17},
+/* 753 */  { "SQL_CACHE", 9},
+/* 754 */  { "SQL_CALC_FOUND_ROWS", 19},
+/* 755 */  { "SQL_NO_CACHE", 12},
+/* 756 */  { "SQL_SMALL_RESULT", 16},
+/* 757 */  { "SQL", 3},
+/* 758 */  { "SQL_THREAD", 10},
+/* 759 */  { "SSL", 3},
+/* 760 */  { "STARTING", 8},
+/* 761 */  { "STARTS", 6},
+/* 762 */  { "START", 5},
+/* 763 */  { "STATUS", 6},
+/* 764 */  { "STDDEV_SAMP", 11},
+/* 765 */  { "STDDEV_POP", 10},
+/* 766 */  { "STOP", 4},
+/* 767 */  { "STORAGE", 7},
+/* 768 */  { "STRAIGHT_JOIN", 13},
+/* 769 */  { "STRING", 6},
+/* 770 */  { "SUBCLASS_ORIGIN", 15},
+/* 771 */  { "SUBDATE", 7},
+/* 772 */  { "SUBJECT", 7},
+/* 773 */  { "SUBPARTITIONS", 13},
+/* 774 */  { "SUBPARTITION", 12},
+/* 775 */  { "SUBSTRING", 9},
+/* 776 */  { "SUM", 3},
+/* 777 */  { "SUPER", 5},
+/* 778 */  { "SUSPEND", 7},
+/* 779 */  { "SWAPS", 5},
+/* 780 */  { "SWITCHES", 8},
+/* 781 */  { "SYSDATE", 7},
+/* 782 */  { "TABLES", 6},
+/* 783 */  { "TABLESPACE", 10},
+/* 784 */  { "TABLE_REF_PRIORITY", 18},
+/* 785 */  { "TABLE_STATISTICS", 16},
+/* 786 */  { "TABLE", 5},
+/* 787 */  { "TABLE_CHECKSUM", 14},
+/* 788 */  { "TABLE_NAME", 10},
+/* 789 */  { "TEMPORARY", 9},
+/* 790 */  { "TEMPTABLE", 9},
+/* 791 */  { "TERMINATED", 10},
+/* 792 */  { "(text)", 6},
+/* 793 */  { "TEXT", 4},
+/* 794 */  { "THAN", 4},
+/* 795 */  { "THEN", 4},
+/* 796 */  { "TIMESTAMP", 9},
+/* 797 */  { "TIMESTAMPADD", 12},
+/* 798 */  { "TIMESTAMPDIFF", 13},
+/* 799 */  { "TIME", 4},
+/* 800 */  { "TINYBLOB", 8},
+/* 801 */  { "TINYINT", 7},
+/* 802 */  { "TINYTEXT", 8},
+/* 803 */  { "TO", 2},
+/* 804 */  { "TRAILING", 8},
+/* 805 */  { "TRANSACTION", 11},
+/* 806 */  { "TRANSACTIONAL", 13},
+/* 807 */  { "TRIGGERS", 8},
+/* 808 */  { "TRIGGER", 7},
+/* 809 */  { "TRIM", 4},
+/* 810 */  { "TRUE", 4},
+/* 811 */  { "TRUNCATE", 8},
+/* 812 */  { "TYPES", 5},
+/* 813 */  { "TYPE", 4},
+/* 814 */  { "UDF_RETURNS", 11},
+/* 815 */  { "(ulonglong)", 11},
+/* 816 */  { "UNCOMMITTED", 11},
+/* 817 */  { "UNDEFINED", 9},
+/* 818 */  { "(_charset)", 10},
+/* 819 */  { "UNDOFILE", 8},
+/* 820 */  { "UNDO_BUFFER_SIZE", 16},
+/* 821 */  { "UNDO", 4},
+/* 822 */  { "UNICODE", 7},
+/* 823 */  { "UNINSTALL", 9},
+/* 824 */  { "UNION", 5},
+/* 825 */  { "UNIQUE", 6},
+/* 826 */  { "UNKNOWN", 7},
+/* 827 */  { "UNLOCK", 6},
+/* 828 */  { "UNSIGNED", 8},
+/* 829 */  { "UNTIL", 5},
+/* 830 */  { "UPDATE", 6},
+/* 831 */  { "UPGRADE", 7},
+/* 832 */  { "USAGE", 5},
+/* 833 */  { "SYSTEM_USER", 11},
+/* 834 */  { "USER_STATISTICS", 15},
+/* 835 */  { "USE_FRM", 7},
+/* 836 */  { "USE", 3},
+/* 837 */  { "USING", 5},
+/* 838 */  { "UTC_DATE", 8},
+/* 839 */  { "UTC_TIMESTAMP", 13},
+/* 840 */  { "UTC_TIME", 8},
+/* 841 */  { "VALUES", 6},
+/* 842 */  { "VALUE", 5},
+/* 843 */  { "VARBINARY", 9},
+/* 844 */  { "VARCHARACTER", 12},
+/* 845 */  { "VARIABLES", 9},
+/* 846 */  { "VAR_POP", 7},
+/* 847 */  { "VARYING", 7},
+/* 848 */  { "VAR_SAMP", 8},
+/* 849 */  { "VIA", 3},
+/* 850 */  { "VIEW", 4},
+/* 851 */  { "VIRTUAL", 7},
+/* 852 */  { "WAIT", 4},
+/* 853 */  { "WARNINGS", 8},
+/* 854 */  { "WEEK", 4},
+/* 855 */  { "WHEN", 4},
+/* 856 */  { "WHERE", 5},
+/* 857 */  { "WHILE", 5},
+/* 858 */  { "WITH", 4},
+/* 859 */  { "WITH CUBE", 9},
+/* 860 */  { "WITH ROLLUP", 11},
+/* 861 */  { "WORK", 4},
+/* 862 */  { "WRAPPER", 7},
+/* 863 */  { "WRITE", 5},
+/* 864 */  { "X509", 4},
+/* 865 */  { "XA", 2},
+/* 866 */  { "XML", 3},
+/* 867 */  { "XOR", 3},
+/* 868 */  { "YEAR_MONTH", 10},
+/* 869 */  { "YEAR", 4},
+/* 870 */  { "ZEROFILL", 8},
+/* 871 */  { "?", 1},
+/* 872 */  { "?, ...", 6},
+/* 873 */  { "(?)", 3},
+/* 874 */  { "(?) /* , ... */", 15},
+/* 875 */  { "(...)", 5},
+/* 876 */  { "(...) /* , ... */", 17},
+/* 877 */  { "UNUSED", 6},
+/* DUMMY */ { "", 0}
+};
+/* PFS specific tokens. */
+#define TOK_PFS_GENERIC_VALUE 871
+#define TOK_PFS_GENERIC_VALUE_LIST 872
+#define TOK_PFS_ROW_SINGLE_VALUE 873
+#define TOK_PFS_ROW_SINGLE_VALUE_LIST 874
+#define TOK_PFS_ROW_MULTIPLE_VALUE 875
+#define TOK_PFS_ROW_MULTIPLE_VALUE_LIST 876
+#define TOK_PFS_UNUSED 877
diff --git a/storage/perfschema/pfs_lock.h b/storage/perfschema/pfs_lock.h
index 82c34fc2702..65937e94ece 100644
--- a/storage/perfschema/pfs_lock.h
+++ b/storage/perfschema/pfs_lock.h
@@ -108,6 +108,17 @@ struct pfs_lock
   }
 
   /**
+    Execute an allocated to dirty transition.
+    This transition should be executed by the writer that owns the record,
+    before the record is modified.
+  */
+  void allocated_to_dirty(void)
+  {
+    DBUG_ASSERT(m_state == PFS_LOCK_ALLOCATED);
+    PFS_atomic::store_32(&m_state, PFS_LOCK_DIRTY);
+  }
+
+  /**
     Execute a dirty to allocated transition.
     This transition should be executed by the writer that owns the record,
     after the record is in a state ready to be read.
@@ -172,6 +183,11 @@ struct pfs_lock
             (copy->m_state == PFS_atomic::load_32(&m_state)) &&
             (copy->m_state == PFS_LOCK_ALLOCATED));
   }
+
+  uint32 get_version()
+  {
+    return PFS_atomic::load_u32(&m_version);
+  }
 };
 
 
diff --git a/storage/perfschema/pfs_server.cc b/storage/perfschema/pfs_server.cc
index 0f322a9cb76..1f7010e5b5f 100644
--- a/storage/perfschema/pfs_server.cc
+++ b/storage/perfschema/pfs_server.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -27,21 +27,35 @@
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
 #include "pfs_events_waits.h"
+#include "pfs_events_stages.h"
+#include "pfs_events_statements.h"
 #include "pfs_timer.h"
+#include "pfs_setup_actor.h"
+#include "pfs_setup_object.h"
+#include "pfs_host.h"
+#include "pfs_user.h"
+#include "pfs_account.h"
+#include "pfs_defaults.h"
+#include "pfs_digest.h"
 
 PFS_global_param pfs_param;
 
+PFS_table_stat PFS_table_stat::g_reset_template;
+
 C_MODE_START
 static void destroy_pfs_thread(void *key);
 C_MODE_END
 
 static void cleanup_performance_schema(void);
+void cleanup_instrument_config(void);
 
 struct PSI_bootstrap*
 initialize_performance_schema(const PFS_global_param *param)
 {
   pfs_initialized= false;
 
+  PFS_table_stat::g_reset_template.reset();
+
   if (! param->m_enabled)
   {
     /*
@@ -54,6 +68,9 @@ initialize_performance_schema(const PFS_global_param *param)
   init_timers();
   PFS_atomic::init();
 
+  init_event_name_sizing(param);
+  register_global_classes();
+
   if (pthread_key_create(&THR_PFS, destroy_pfs_thread))
     return NULL;
 
@@ -65,11 +82,30 @@ initialize_performance_schema(const PFS_global_param *param)
       init_thread_class(param->m_thread_class_sizing) ||
       init_table_share(param->m_table_share_sizing) ||
       init_file_class(param->m_file_class_sizing) ||
+      init_stage_class(param->m_stage_class_sizing) ||
+      init_statement_class(param->m_statement_class_sizing) ||
+      init_socket_class(param->m_socket_class_sizing) ||
       init_instruments(param) ||
       init_events_waits_history_long(
         param->m_events_waits_history_long_sizing) ||
+      init_events_stages_history_long(
+        param->m_events_stages_history_long_sizing) ||
+      init_events_statements_history_long(
+        param->m_events_statements_history_long_sizing) ||
       init_file_hash() ||
-      init_table_share_hash())
+      init_table_share_hash() ||
+      init_setup_actor(param) ||
+      init_setup_actor_hash() ||
+      init_setup_object(param) ||
+      init_setup_object_hash() ||
+      init_host(param) ||
+      init_host_hash() ||
+      init_user(param) ||
+      init_user_hash() ||
+      init_account(param) ||
+      init_account_hash() ||
+      init_digest(param) ||
+      init_digest_hash())
   {
     /*
       The performance schema initialization failed.
@@ -80,6 +116,22 @@ initialize_performance_schema(const PFS_global_param *param)
   }
 
   pfs_initialized= true;
+
+  /** Default values for SETUP_CONSUMERS */
+  flag_events_stages_current=          param->m_consumer_events_stages_current_enabled;
+  flag_events_stages_history=          param->m_consumer_events_stages_history_enabled;
+  flag_events_stages_history_long=     param->m_consumer_events_stages_history_long_enabled;
+  flag_events_statements_current=      param->m_consumer_events_statements_current_enabled;
+  flag_events_statements_history=      param->m_consumer_events_statements_history_enabled;
+  flag_events_statements_history_long= param->m_consumer_events_statements_history_long_enabled;
+  flag_events_waits_current=           param->m_consumer_events_waits_current_enabled;
+  flag_events_waits_history=           param->m_consumer_events_waits_history_enabled;
+  flag_events_waits_history_long=      param->m_consumer_events_waits_history_long_enabled;
+  flag_global_instrumentation=         param->m_consumer_global_instrumentation_enabled;
+  flag_thread_instrumentation=         param->m_consumer_thread_instrumentation_enabled;
+  flag_statements_digest=              param->m_consumer_statement_digest_enabled;
+
+  install_default_setup(&PFS_bootstrap);
   return &PFS_bootstrap;
 }
 
@@ -104,21 +156,41 @@ static void destroy_pfs_thread(void *key)
 
 static void cleanup_performance_schema(void)
 {
+  cleanup_instrument_config();
+/*  Disabled: Bug#5666
   cleanup_instruments();
   cleanup_sync_class();
   cleanup_thread_class();
   cleanup_table_share();
   cleanup_file_class();
+  cleanup_stage_class();
+  cleanup_statement_class();
+  cleanup_socket_class();
   cleanup_events_waits_history_long();
+  cleanup_events_stages_history_long();
+  cleanup_events_statements_history_long();
   cleanup_table_share_hash();
   cleanup_file_hash();
+  cleanup_setup_actor();
+  cleanup_setup_actor_hash();
+  cleanup_setup_object();
+  cleanup_setup_object_hash();
+  cleanup_host();
+  cleanup_host_hash();
+  cleanup_user();
+  cleanup_user_hash();
+  cleanup_account();
+  cleanup_account_hash();
+  cleanup_digest();
   PFS_atomic::cleanup();
+*/
 }
 
 void shutdown_performance_schema(void)
 {
   pfs_initialized= false;
   cleanup_performance_schema();
+#if 0
   /*
     Be careful to not delete un-initialized keys,
     this would affect key 0, which is THR_KEY_mysys,
@@ -129,5 +201,94 @@ void shutdown_performance_schema(void)
     pthread_key_delete(THR_PFS);
     THR_PFS_initialized= false;
   }
+#endif
+}
+
+/**
+  Initialize the dynamic array used to hold PFS_INSTRUMENT configuration
+  options.
+*/
+void init_pfs_instrument_array()
+{
+  my_init_dynamic_array(&pfs_instr_config_array, sizeof(PFS_instr_config*), 10, 10);
+  pfs_instr_config_state=  PFS_INSTR_CONFIG_ALLOCATED;
 }
 
+/**
+  Deallocate the PFS_INSTRUMENT array. Use an atomic compare-and-swap to ensure
+  that it is deallocated only once in the chaotic environment of server shutdown.
+*/
+void cleanup_instrument_config()
+{
+  int desired_state= PFS_INSTR_CONFIG_ALLOCATED;
+  
+  /* Ignore if another thread has already deallocated the array */
+  if (my_atomic_cas32(&pfs_instr_config_state, &desired_state, PFS_INSTR_CONFIG_DEALLOCATED))
+    delete_dynamic(&pfs_instr_config_array);
+}
+
+/**
+  Process one performance_schema_instrument configuration string. Isolate the
+  instrument name, evaluate the option value, and store them in a dynamic array.
+  Return 'false' for success, 'true' for error.
+
+  @param name    Instrument name
+  @param value   Configuration option: 'on', 'off', etc.
+  @return 0 for success, non zero for errors
+*/
+
+int add_pfs_instr_to_array(const char* name, const char* value)
+{
+  int name_length= strlen(name);
+  int value_length= strlen(value);
+
+  /* Allocate structure plus string buffers plus null terminators */
+  PFS_instr_config* e = (PFS_instr_config*)my_malloc(sizeof(PFS_instr_config)
+                       + name_length + 1 + value_length + 1, MYF(MY_WME));
+  if (!e) return 1;
+  
+  /* Copy the instrument name */
+  e->m_name= (char*)e + sizeof(PFS_instr_config);
+  memcpy(e->m_name, name, name_length);
+  e->m_name_length= name_length;
+  e->m_name[name_length]= '\0';
+  
+  /* Set flags accordingly */
+  if (!my_strcasecmp(&my_charset_latin1, value, "counted"))
+  {
+    e->m_enabled= true;
+    e->m_timed= false;
+  }
+  else
+  if (!my_strcasecmp(&my_charset_latin1, value, "true") ||
+      !my_strcasecmp(&my_charset_latin1, value, "on") ||
+      !my_strcasecmp(&my_charset_latin1, value, "1") ||
+      !my_strcasecmp(&my_charset_latin1, value, "yes"))
+  {
+    e->m_enabled= true;
+    e->m_timed= true;
+  }
+  else
+  if (!my_strcasecmp(&my_charset_latin1, value, "false") ||
+      !my_strcasecmp(&my_charset_latin1, value, "off") ||
+      !my_strcasecmp(&my_charset_latin1, value, "0") ||
+      !my_strcasecmp(&my_charset_latin1, value, "no"))
+  {
+    e->m_enabled= false;
+    e->m_timed= false;
+  }
+  else
+  {
+    my_free(e);
+    return 1;
+  }
+
+  /* Add to the array of default startup options */
+  if (insert_dynamic(&pfs_instr_config_array, &e))
+  {
+    my_free(e);
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/storage/perfschema/pfs_server.h b/storage/perfschema/pfs_server.h
index 8cb68cd52b1..f65febdeb6d 100644
--- a/storage/perfschema/pfs_server.h
+++ b/storage/perfschema/pfs_server.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -54,11 +54,17 @@
 #ifndef PFS_MAX_FILE_HANDLE
   #define PFS_MAX_FILE_HANDLE 32768
 #endif
+#ifndef PFS_MAX_SOCKETS
+  #define PFS_MAX_SOCKETS 1000
+#endif
+#ifndef PFS_MAX_SOCKET_CLASS
+  #define PFS_MAX_SOCKET_CLASS 10
+#endif
 #ifndef PFS_MAX_TABLE_SHARE
-  #define PFS_MAX_TABLE_SHARE 50000
+  #define PFS_MAX_TABLE_SHARE 1000
 #endif
 #ifndef PFS_MAX_TABLE
-  #define PFS_MAX_TABLE 100000
+  #define PFS_MAX_TABLE 10000
 #endif
 #ifndef PFS_WAITS_HISTORY_SIZE
   #define PFS_WAITS_HISTORY_SIZE 10
@@ -66,36 +72,217 @@
 #ifndef PFS_WAITS_HISTORY_LONG_SIZE
   #define PFS_WAITS_HISTORY_LONG_SIZE 10000
 #endif
+#ifndef PFS_MAX_SETUP_ACTOR
+  #define PFS_MAX_SETUP_ACTOR 100
+#endif
+#ifndef PFS_MAX_SETUP_OBJECT
+  #define PFS_MAX_SETUP_OBJECT 100
+#endif
+#ifndef PFS_MAX_HOST
+  #define PFS_MAX_HOST 100
+#endif
+#ifndef PFS_MAX_USER
+  #define PFS_MAX_USER 100
+#endif
+#ifndef PFS_MAX_ACCOUNT
+  #define PFS_MAX_ACCOUNT 100
+#endif
+#ifndef PFS_MAX_STAGE_CLASS
+  #define PFS_MAX_STAGE_CLASS 150
+#endif
+#ifndef PFS_STAGES_HISTORY_SIZE
+  #define PFS_STAGES_HISTORY_SIZE 10
+#endif
+#ifndef PFS_STAGES_HISTORY_LONG_SIZE
+  #define PFS_STAGES_HISTORY_LONG_SIZE 10000
+#endif
+#ifndef PFS_STATEMENTS_HISTORY_SIZE
+  #define PFS_STATEMENTS_HISTORY_SIZE 10
+#endif
+#ifndef PFS_STATEMENTS_HISTORY_LONG_SIZE
+  #define PFS_STATEMENTS_HISTORY_LONG_SIZE 10000
+#endif
+#ifndef PFS_STATEMENTS_STACK_SIZE
+  #define PFS_STATEMENTS_STACK_SIZE 10
+#endif
+#ifndef PFS_DIGEST_SIZE
+  #define PFS_DIGEST_SIZE 200
+#endif
 
+/** Performance schema global sizing parameters. */
 struct PFS_global_param
 {
-  bool m_enabled;
+  /** True if the performance schema is enabled. */
+  bool m_enabled; 
+  /** Default values for SETUP_CONSUMERS. */
+  bool m_consumer_events_stages_current_enabled;
+  bool m_consumer_events_stages_history_enabled;
+  bool m_consumer_events_stages_history_long_enabled;
+  bool m_consumer_events_statements_current_enabled;
+  bool m_consumer_events_statements_history_enabled;
+  bool m_consumer_events_statements_history_long_enabled;
+  bool m_consumer_events_waits_current_enabled;
+  bool m_consumer_events_waits_history_enabled;
+  bool m_consumer_events_waits_history_long_enabled;
+  bool m_consumer_global_instrumentation_enabled;
+  bool m_consumer_thread_instrumentation_enabled;
+  bool m_consumer_statement_digest_enabled;
+
+  /** Default instrument configuration option. */
+  char *m_pfs_instrument;
+
+  /**
+    Maximum number of instrumented mutex classes.
+    @sa mutex_class_lost.
+  */
   ulong m_mutex_class_sizing;
+  /**
+    Maximum number of instrumented rwlock classes.
+    @sa rwlock_class_lost.
+  */
   ulong m_rwlock_class_sizing;
+  /**
+    Maximum number of instrumented cond classes.
+    @sa cond_class_lost.
+  */
   ulong m_cond_class_sizing;
+  /**
+    Maximum number of instrumented thread classes.
+    @sa thread_class_lost.
+  */
   ulong m_thread_class_sizing;
+  /**
+    Maximum number of instrumented table share.
+    @sa table_share_lost.
+  */
   ulong m_table_share_sizing;
+  /**
+    Maximum number of instrumented file classes.
+    @sa file_class_lost.
+  */
   ulong m_file_class_sizing;
+  /**
+    Maximum number of instrumented mutex instances.
+    @sa mutex_lost.
+  */
   ulong m_mutex_sizing;
+  /**
+    Maximum number of instrumented rwlock instances.
+    @sa rwlock_lost.
+  */
   ulong m_rwlock_sizing;
+  /**
+    Maximum number of instrumented cond instances.
+    @sa cond_lost.
+  */
   ulong m_cond_sizing;
+  /**
+    Maximum number of instrumented thread instances.
+    @sa thread_lost.
+  */
   ulong m_thread_sizing;
+  /**
+    Maximum number of instrumented table handles.
+    @sa table_lost.
+  */
   ulong m_table_sizing;
+  /**
+    Maximum number of instrumented file instances.
+    @sa file_lost.
+  */
   ulong m_file_sizing;
+  /**
+    Maximum number of instrumented file handles.
+    @sa file_handle_lost.
+  */
   ulong m_file_handle_sizing;
+  /**
+    Maxium number of instrumented socket instances
+    @sa socket_lost  
+  */
+  ulong m_socket_sizing;
+  /**
+    Maximum number of instrumented socket classes.
+    @sa socket_class_lost.
+  */
+  ulong m_socket_class_sizing;
+  /** Maximum number of rows per thread in table EVENTS_WAITS_HISTORY. */
   ulong m_events_waits_history_sizing;
+  /** Maximum number of rows in table EVENTS_WAITS_HISTORY_LONG. */
   ulong m_events_waits_history_long_sizing;
+  /** Maximum number of rows in table SETUP_ACTORS. */
+  ulong m_setup_actor_sizing;
+  /** Maximum number of rows in table SETUP_OBJECTS. */
+  ulong m_setup_object_sizing;
+  /** Maximum number of rows in table HOSTS. */
+  ulong m_host_sizing;
+  /** Maximum number of rows in table USERS. */
+  ulong m_user_sizing;
+  /** Maximum number of rows in table ACCOUNTS. */
+  ulong m_account_sizing;
+  /**
+    Maximum number of instrumented stage classes.
+    @sa stage_class_lost.
+  */
+  ulong m_stage_class_sizing;
+  /** Maximum number of rows per thread in table EVENTS_STAGES_HISTORY. */
+  ulong m_events_stages_history_sizing;
+  /** Maximum number of rows in table EVENTS_STAGES_HISTORY_LONG. */
+  ulong m_events_stages_history_long_sizing;
+  /**
+    Maximum number of instrumented statement classes.
+    @sa statement_class_lost.
+  */
+  ulong m_statement_class_sizing;
+  /** Maximum number of rows per thread in table EVENTS_STATEMENT_HISTORY. */
+  ulong m_events_statements_history_sizing;
+  /** Maximum number of rows in table EVENTS_STATEMENTS_HISTORY_LONG. */
+  ulong m_events_statements_history_long_sizing;
+  /** Maximum number of digests to be captured */
+  ulong m_digest_sizing;
 };
 
+/**
+  Performance schema sizing values for the server.
+  This global variable is set when parsing server startup options.
+*/
 extern PFS_global_param pfs_param;
 
+/**
+  Initialize the performance schema.
+  @param param Size parameters to use.
+  @return A boostrap handle, or NULL.
+*/
 struct PSI_bootstrap*
 initialize_performance_schema(const PFS_global_param *param);
 
+/**
+  Initialize the performance schema ACL.
+  ACL is strictly enforced when the server is running in normal mode,
+  to enforce that only legal operations are allowed.
+  When running in boostrap mode, ACL restrictions are relaxed,
+  to allow the boostrap scripts to DROP / CREATE performance schema tables.
+  @sa ACL_internal_schema_registry
+  @param bootstrap True if the server is starting in bootstrap mode.
+*/
 void initialize_performance_schema_acl(bool bootstrap);
 
 void check_performance_schema();
 
+/**
+  Initialize the dynamic array holding individual instrument settings collected
+  from the server configuration options.
+*/
+void init_pfs_instrument_array();
+
+/**
+  Process one PFS_INSTRUMENT configuration string.
+*/
+int add_pfs_instr_to_array(const char* name, const char* value);
+
+/**
+  Shutdown the performance schema.
+*/
 void shutdown_performance_schema();
 
 #endif
diff --git a/storage/perfschema/pfs_setup_actor.cc b/storage/perfschema/pfs_setup_actor.cc
new file mode 100644
index 00000000000..a587d3643d2
--- /dev/null
+++ b/storage/perfschema/pfs_setup_actor.cc
@@ -0,0 +1,337 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/pfs_setup_actor.cc
+  Performance schema setup actor (implementation).
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "my_base.h"
+#include "pfs.h"
+#include "pfs_stat.h"
+#include "pfs_instr.h"
+#include "pfs_setup_actor.h"
+#include "pfs_global.h"
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+/** Size of the setup_actor instances array. @sa setup_actor_array */
+ulong setup_actor_max;
+
+/**
+  Setup_actor instances array.
+  @sa setup_actor_max
+*/
+
+PFS_setup_actor *setup_actor_array= NULL;
+
+/** Hash table for setup_actor records. */
+static LF_HASH setup_actor_hash;
+/** True if @c setup_actor_hash is initialized. */
+static bool setup_actor_hash_inited= false;
+
+/**
+  Initialize the setup actor buffers.
+  @param param                        sizing parameters
+  @return 0 on success
+*/
+int init_setup_actor(const PFS_global_param *param)
+{
+  setup_actor_max= param->m_setup_actor_sizing;
+
+  setup_actor_array= NULL;
+
+  if (setup_actor_max > 0)
+  {
+    setup_actor_array= PFS_MALLOC_ARRAY(setup_actor_max, PFS_setup_actor,
+                                         MYF(MY_ZEROFILL));
+    if (unlikely(setup_actor_array == NULL))
+      return 1;
+  }
+
+  return 0;
+}
+
+/** Cleanup all the setup actor buffers. */
+void cleanup_setup_actor(void)
+{
+  pfs_free(setup_actor_array);
+  setup_actor_array= NULL;
+  setup_actor_max= 0;
+}
+
+C_MODE_START
+static uchar *setup_actor_hash_get_key(const uchar *entry, size_t *length,
+                                       my_bool)
+{
+  const PFS_setup_actor * const *typed_entry;
+  const PFS_setup_actor *setup_actor;
+  const void *result;
+  typed_entry= reinterpret_cast<const PFS_setup_actor* const *> (entry);
+  DBUG_ASSERT(typed_entry != NULL);
+  setup_actor= *typed_entry;
+  DBUG_ASSERT(setup_actor != NULL);
+  *length= setup_actor->m_key.m_key_length;
+  result= setup_actor->m_key.m_hash_key;
+  return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
+}
+C_MODE_END
+
+/**
+  Initialize the setup actor hash.
+  @return 0 on success
+*/
+int init_setup_actor_hash(void)
+{
+  if (! setup_actor_hash_inited)
+  {
+    lf_hash_init(&setup_actor_hash, sizeof(PFS_setup_actor*), LF_HASH_UNIQUE,
+                 0, 0, setup_actor_hash_get_key, &my_charset_bin);
+    setup_actor_hash_inited= true;
+  }
+  return 0;
+}
+
+/** Cleanup the setup actor hash. */
+void cleanup_setup_actor_hash(void)
+{
+  if (setup_actor_hash_inited)
+  {
+    lf_hash_destroy(&setup_actor_hash);
+    setup_actor_hash_inited= false;
+  }
+}
+
+static LF_PINS* get_setup_actor_hash_pins(PFS_thread *thread)
+{
+  if (unlikely(thread->m_setup_actor_hash_pins == NULL))
+  {
+    if (! setup_actor_hash_inited)
+      return NULL;
+    thread->m_setup_actor_hash_pins= lf_hash_get_pins(&setup_actor_hash);
+  }
+  return thread->m_setup_actor_hash_pins;
+}
+
+static void set_setup_actor_key(PFS_setup_actor_key *key,
+                                const char *user, uint user_length,
+                                const char *host, uint host_length,
+                                const char *role, uint role_length)
+{
+  DBUG_ASSERT(user_length <= USERNAME_LENGTH);
+  DBUG_ASSERT(host_length <= HOSTNAME_LENGTH);
+
+  char *ptr= &key->m_hash_key[0];
+  memcpy(ptr, user, user_length);
+  ptr+= user_length;
+  ptr[0]= 0;
+  ptr++;
+  memcpy(ptr, host, host_length);
+  ptr+= host_length;
+  ptr[0]= 0;
+  ptr++;
+  memcpy(ptr, role, role_length);
+  ptr+= role_length;
+  ptr[0]= 0;
+  ptr++;
+  key->m_key_length= ptr - &key->m_hash_key[0];
+}
+
+int insert_setup_actor(const String *user, const String *host, const String *role)
+{
+  if (setup_actor_max == 0)
+    return HA_ERR_RECORD_FILE_FULL;
+
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  LF_PINS *pins= get_setup_actor_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  static uint setup_actor_monotonic_index= 0;
+  uint index;
+  uint attempts= 0;
+  PFS_setup_actor *pfs;
+
+  while (++attempts <= setup_actor_max)
+  {
+    /* See create_mutex() */
+    PFS_atomic::add_u32(& setup_actor_monotonic_index, 1);
+    index= setup_actor_monotonic_index % setup_actor_max;
+    pfs= setup_actor_array + index;
+
+    if (pfs->m_lock.is_free())
+    {
+      if (pfs->m_lock.free_to_dirty())
+      {
+        set_setup_actor_key(&pfs->m_key,
+                            user->ptr(), user->length(),
+                            host->ptr(), host->length(),
+                            role->ptr(), role->length());
+        pfs->m_username= &pfs->m_key.m_hash_key[0];
+        pfs->m_username_length= user->length();
+        pfs->m_hostname= pfs->m_username + pfs->m_username_length + 1;
+        pfs->m_hostname_length= host->length();
+        pfs->m_rolename= pfs->m_hostname + pfs->m_hostname_length + 1;
+        pfs->m_rolename_length= role->length();
+
+        int res;
+        res= lf_hash_insert(&setup_actor_hash, pins, &pfs);
+        if (likely(res == 0))
+        {
+          pfs->m_lock.dirty_to_allocated();
+          return 0;
+        }
+
+        pfs->m_lock.dirty_to_free();
+        if (res > 0)
+          return HA_ERR_FOUND_DUPP_KEY;
+        return HA_ERR_OUT_OF_MEM;
+      }
+    }
+  }
+
+  return HA_ERR_RECORD_FILE_FULL;
+}
+
+int delete_setup_actor(const String *user, const String *host, const String *role)
+{
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  LF_PINS* pins= get_setup_actor_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  PFS_setup_actor_key key;
+  set_setup_actor_key(&key,
+                      user->ptr(), user->length(),
+                      host->ptr(), host->length(),
+                      role->ptr(), role->length());
+
+  PFS_setup_actor **entry;
+  entry= reinterpret_cast<PFS_setup_actor**>
+    (lf_hash_search(&setup_actor_hash, pins, key.m_hash_key, key.m_key_length));
+
+  if (entry && (entry != MY_ERRPTR))
+  {
+    PFS_setup_actor *pfs= *entry;
+    lf_hash_delete(&setup_actor_hash, pins, key.m_hash_key, key.m_key_length);
+    pfs->m_lock.allocated_to_free();
+  }
+
+  lf_hash_search_unpin(pins);
+
+  return 0;
+}
+
+int reset_setup_actor()
+{
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  LF_PINS* pins= get_setup_actor_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  PFS_setup_actor *pfs= setup_actor_array;
+  PFS_setup_actor *pfs_last= setup_actor_array + setup_actor_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+    {
+      lf_hash_delete(&setup_actor_hash, pins,
+                     pfs->m_key.m_hash_key, pfs->m_key.m_key_length);
+      pfs->m_lock.allocated_to_free();
+    }
+  }
+
+  return 0;
+}
+
+long setup_actor_count()
+{
+  return setup_actor_hash.count;
+}
+
+/*
+  - '%' should be replaced by NULL in table SETUP_ACTOR
+  - add an ENABLED column to include/exclude patterns, more flexible
+  - the principle is similar to SETUP_OBJECTS
+*/
+void lookup_setup_actor(PFS_thread *thread,
+                        const char *user, uint user_length,
+                        const char *host, uint host_length,
+                        bool *enabled)
+{
+  PFS_setup_actor_key key;
+  PFS_setup_actor **entry;
+  int i;
+
+  LF_PINS* pins= get_setup_actor_hash_pins(thread);
+  if (unlikely(pins == NULL))
+  {
+    *enabled= false;
+    return;
+  }
+
+  for (i= 1; i<=4; i++)
+  {
+    /*
+      WL#988 Roles is not implemented, so we do not have a role name.
+      Looking up "%" in SETUP_ACTORS.ROLE.
+    */
+    switch(i)
+    {
+    case 1:
+      set_setup_actor_key(&key, user, user_length, host, host_length, "%", 1);
+      break;
+    case 2:
+      set_setup_actor_key(&key, user, user_length, "%", 1, "%", 1);
+      break;
+    case 3:
+      set_setup_actor_key(&key, "%", 1, host, host_length, "%", 1);
+      break;
+    case 4:
+      set_setup_actor_key(&key, "%", 1, "%", 1, "%", 1);
+      break;
+    }
+    entry= reinterpret_cast<PFS_setup_actor**>
+      (lf_hash_search(&setup_actor_hash, pins, key.m_hash_key, key.m_key_length));
+
+    if (entry && (entry != MY_ERRPTR))
+    {
+      lf_hash_search_unpin(pins);
+      *enabled= true;
+      return;
+    }
+
+    lf_hash_search_unpin(pins);
+  }
+  *enabled= false;
+  return;
+}
+
+/** @} */
diff --git a/storage/perfschema/pfs_setup_actor.h b/storage/perfschema/pfs_setup_actor.h
new file mode 100644
index 00000000000..8b0ee8a485c
--- /dev/null
+++ b/storage/perfschema/pfs_setup_actor.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef PFS_SETUP_ACTOR_H
+#define PFS_SETUP_ACTOR_H
+
+/**
+  @file storage/perfschema/pfs_setup_actor.h
+  Performance schema setup actors (declarations).
+*/
+
+#include "sql_string.h"
+#include "pfs_lock.h"
+#include "lf.h"
+
+struct PFS_global_param;
+
+/* WL#988 Roles Not implemented yet */
+#define ROLENAME_LENGTH 64
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+/** Hash key for @sa PFS_setup_actor. */
+struct PFS_setup_actor_key
+{
+  /**
+    Hash search key.
+    This has to be a string for LF_HASH,
+    the format is "<username><0x00><hostname><0x00><rolename><0x00>"
+  */
+  char m_hash_key[USERNAME_LENGTH + 1 + HOSTNAME_LENGTH + 1 + ROLENAME_LENGTH + 1];
+  /** Length of @c m_hash_key. */
+  uint m_key_length;
+};
+
+/** A setup_actor record. */
+struct PFS_setup_actor
+{
+  /** Internal lock. */
+  pfs_lock m_lock;
+  /** Hash key. */
+  PFS_setup_actor_key m_key;
+  /** User name. This points inside the hash key. */
+  const char *m_username;
+  /** Length of @c m_username. */
+  uint m_username_length;
+  /** Host name. This points inside the hash key. */
+  const char *m_hostname;
+  /** Length of @c m_hostname. */
+  uint m_hostname_length;
+  /** Role name. This points inside the hash key. */
+  const char *m_rolename;
+  /** Length of @c m_rolename. */
+  uint m_rolename_length;
+};
+
+int init_setup_actor(const PFS_global_param *param);
+void cleanup_setup_actor(void);
+int init_setup_actor_hash(void);
+void cleanup_setup_actor_hash(void);
+
+int insert_setup_actor(const String *user, const String *host, const String *role);
+int delete_setup_actor(const String *user, const String *host, const String *role);
+int reset_setup_actor(void);
+long setup_actor_count(void);
+
+void lookup_setup_actor(PFS_thread *thread,
+                        const char *user, uint user_length,
+                        const char *host, uint host_length,
+                        bool *enabled);
+
+/* For iterators and show status. */
+
+extern ulong setup_actor_max;
+
+/* Exposing the data directly, for iterators. */
+
+extern PFS_setup_actor *setup_actor_array;
+
+/** @} */
+#endif
+
diff --git a/storage/perfschema/pfs_setup_object.cc b/storage/perfschema/pfs_setup_object.cc
new file mode 100644
index 00000000000..a9e9bb7881b
--- /dev/null
+++ b/storage/perfschema/pfs_setup_object.cc
@@ -0,0 +1,345 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/pfs_setup_object.cc
+  Performance schema setup object (implementation).
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "my_base.h"
+#include "sql_string.h"
+#include "pfs.h"
+#include "pfs_stat.h"
+#include "pfs_instr.h"
+#include "pfs_setup_object.h"
+#include "pfs_global.h"
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+uint setup_objects_version= 0;
+
+ulong setup_object_max;
+
+PFS_setup_object *setup_object_array= NULL;
+
+static LF_HASH setup_object_hash;
+static bool setup_object_hash_inited= false;
+
+/**
+  Initialize the setup object buffers.
+  @param param                        sizing parameters
+  @return 0 on success
+*/
+int init_setup_object(const PFS_global_param *param)
+{
+  setup_object_max= param->m_setup_object_sizing;
+
+  setup_object_array= NULL;
+
+  if (setup_object_max > 0)
+  {
+    setup_object_array= PFS_MALLOC_ARRAY(setup_object_max, PFS_setup_object,
+                                         MYF(MY_ZEROFILL));
+    if (unlikely(setup_object_array == NULL))
+      return 1;
+  }
+
+  return 0;
+}
+
+/** Cleanup all the setup object buffers. */
+void cleanup_setup_object(void)
+{
+  pfs_free(setup_object_array);
+  setup_object_array= NULL;
+  setup_object_max= 0;
+}
+
+C_MODE_START
+static uchar *setup_object_hash_get_key(const uchar *entry, size_t *length,
+                                        my_bool)
+{
+  const PFS_setup_object * const *typed_entry;
+  const PFS_setup_object *setup_object;
+  const void *result;
+  typed_entry= reinterpret_cast<const PFS_setup_object* const *> (entry);
+  DBUG_ASSERT(typed_entry != NULL);
+  setup_object= *typed_entry;
+  DBUG_ASSERT(setup_object != NULL);
+  *length= setup_object->m_key.m_key_length;
+  result= setup_object->m_key.m_hash_key;
+  return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
+}
+C_MODE_END
+
+/**
+  Initialize the setup objects hash.
+  @return 0 on success
+*/
+int init_setup_object_hash(void)
+{
+  if (! setup_object_hash_inited)
+  {
+    lf_hash_init(&setup_object_hash, sizeof(PFS_setup_object*), LF_HASH_UNIQUE,
+                 0, 0, setup_object_hash_get_key, &my_charset_bin);
+    setup_object_hash_inited= true;
+  }
+  return 0;
+}
+
+/** Cleanup the setup objects hash. */
+void cleanup_setup_object_hash(void)
+{
+  if (setup_object_hash_inited)
+  {
+    lf_hash_destroy(&setup_object_hash);
+    setup_object_hash_inited= false;
+  }
+}
+
+static LF_PINS* get_setup_object_hash_pins(PFS_thread *thread)
+{
+  if (unlikely(thread->m_setup_object_hash_pins == NULL))
+  {
+    if (! setup_object_hash_inited)
+      return NULL;
+    thread->m_setup_object_hash_pins= lf_hash_get_pins(&setup_object_hash);
+  }
+  return thread->m_setup_object_hash_pins;
+}
+
+static void set_setup_object_key(PFS_setup_object_key *key,
+                                 enum_object_type object_type,
+                                 const char *schema, uint schema_length,
+                                 const char *object, uint object_length)
+{
+  DBUG_ASSERT(schema_length <= NAME_LEN);
+  DBUG_ASSERT(object_length <= NAME_LEN);
+
+  char *ptr= &key->m_hash_key[0];
+  ptr[0]= (char) object_type;
+  ptr++;
+  memcpy(ptr, schema, schema_length);
+  ptr+= schema_length;
+  ptr[0]= 0;
+  ptr++;
+  memcpy(ptr, object, object_length);
+  ptr+= object_length;
+  ptr[0]= 0;
+  ptr++;
+  key->m_key_length= ptr - &key->m_hash_key[0];
+}
+
+int insert_setup_object(enum_object_type object_type, const String *schema,
+                        const String *object, bool enabled, bool timed)
+{
+  if (setup_object_max == 0)
+    return HA_ERR_RECORD_FILE_FULL;
+
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  LF_PINS* pins= get_setup_object_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  static uint setup_object_monotonic_index= 0;
+  uint index;
+  uint attempts= 0;
+  PFS_setup_object *pfs;
+
+  while (++attempts <= setup_object_max)
+  {
+    /* See create_mutex() */
+    PFS_atomic::add_u32(& setup_object_monotonic_index, 1);
+    index= setup_object_monotonic_index % setup_object_max;
+    pfs= setup_object_array + index;
+
+    if (pfs->m_lock.is_free())
+    {
+      if (pfs->m_lock.free_to_dirty())
+      {
+        set_setup_object_key(&pfs->m_key, object_type,
+                             schema->ptr(), schema->length(),
+                             object->ptr(), object->length());
+        pfs->m_schema_name= &pfs->m_key.m_hash_key[1];
+        pfs->m_schema_name_length= schema->length();
+        pfs->m_object_name= pfs->m_schema_name + pfs->m_schema_name_length + 1;
+        pfs->m_object_name_length= object->length();
+        pfs->m_enabled= enabled;
+        pfs->m_timed= timed;
+
+        int res;
+        res= lf_hash_insert(&setup_object_hash, pins, &pfs);
+        if (likely(res == 0))
+        {
+          pfs->m_lock.dirty_to_allocated();
+          setup_objects_version++;
+          return 0;
+        }
+
+        pfs->m_lock.dirty_to_free();
+        if (res > 0)
+          return HA_ERR_FOUND_DUPP_KEY;
+        /* OOM in lf_hash_insert */
+        return HA_ERR_OUT_OF_MEM;
+      }
+    }
+  }
+
+  return HA_ERR_RECORD_FILE_FULL;
+}
+
+int delete_setup_object(enum_object_type object_type, const String *schema,
+                        const String *object)
+{
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  LF_PINS* pins= get_setup_object_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  PFS_setup_object_key key;
+  set_setup_object_key(&key, object_type,
+                       schema->ptr(), schema->length(),
+                       object->ptr(), object->length());
+
+  PFS_setup_object **entry;
+  entry= reinterpret_cast<PFS_setup_object**>
+    (lf_hash_search(&setup_object_hash, pins, key.m_hash_key, key.m_key_length));
+
+  if (entry && (entry != MY_ERRPTR))
+  {
+    PFS_setup_object *pfs= *entry;
+    lf_hash_delete(&setup_object_hash, pins, key.m_hash_key, key.m_key_length);
+    pfs->m_lock.allocated_to_free();
+  }
+
+  lf_hash_search_unpin(pins);
+
+  setup_objects_version++;
+  return 0;
+}
+
+int reset_setup_object()
+{
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  LF_PINS* pins= get_setup_object_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  PFS_setup_object *pfs= setup_object_array;
+  PFS_setup_object *pfs_last= setup_object_array + setup_object_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+    {
+      lf_hash_delete(&setup_object_hash, pins,
+                     pfs->m_key.m_hash_key, pfs->m_key.m_key_length);
+      pfs->m_lock.allocated_to_free();
+    }
+  }
+
+  setup_objects_version++;
+  return 0;
+}
+
+long setup_object_count()
+{
+  return setup_object_hash.count;
+}
+
+void lookup_setup_object(PFS_thread *thread,
+                         enum_object_type object_type,
+                         const char *schema_name, int schema_name_length,
+                         const char *object_name, int object_name_length,
+                         bool *enabled, bool *timed)
+{
+  PFS_setup_object_key key;
+  PFS_setup_object **entry;
+  PFS_setup_object *pfs;
+  int i;
+
+  /*
+    The table io instrumentation uses "TABLE" and "TEMPORARY TABLE".
+    SETUP_OBJECT uses "TABLE" for both concepts.
+    There is no way to provide a different setup for:
+    - TABLE foo.bar
+    - TEMPORARY TABLE foo.bar
+  */
+  DBUG_ASSERT(object_type != OBJECT_TYPE_TEMPORARY_TABLE);
+
+  LF_PINS* pins= get_setup_object_hash_pins(thread);
+  if (unlikely(pins == NULL))
+  {
+    *enabled= false;
+    *timed= false;
+    return;
+  }
+
+  for (i= 1; i<=3; i++)
+  {
+    switch(i)
+    {
+    case 1:
+      /* Lookup OBJECT_TYPE + OBJECT_SCHEMA + OBJECT_NAME in SETUP_OBJECTS */
+      set_setup_object_key(&key,
+                           object_type,
+                           schema_name, schema_name_length,
+                           object_name, object_name_length);
+      break;
+    case 2:
+      /* Lookup OBJECT_TYPE + OBJECT_SCHEMA + "%" in SETUP_OBJECTS */
+      set_setup_object_key(&key,
+                           object_type,
+                           schema_name, schema_name_length, "%", 1);
+      break;
+    case 3:
+      /* Lookup OBJECT_TYPE + "%" + "%" in SETUP_OBJECTS */
+      set_setup_object_key(&key, object_type, "%", 1, "%", 1);
+      break;
+    }
+    entry= reinterpret_cast<PFS_setup_object**>
+      (lf_hash_search(&setup_object_hash, pins, key.m_hash_key, key.m_key_length));
+
+    if (entry && (entry != MY_ERRPTR))
+    {
+      pfs= *entry;
+      *enabled= pfs->m_enabled;
+      *timed= pfs->m_timed;
+      lf_hash_search_unpin(pins);
+      return;
+    }
+
+    lf_hash_search_unpin(pins);
+  }
+  *enabled= false;
+  *timed= false;
+  return;
+}
+
+/** @} */
diff --git a/storage/perfschema/pfs_setup_object.h b/storage/perfschema/pfs_setup_object.h
new file mode 100644
index 00000000000..44d2b76c627
--- /dev/null
+++ b/storage/perfschema/pfs_setup_object.h
@@ -0,0 +1,101 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef PFS_SETUP_OBJECT_H
+#define PFS_SETUP_OBJECT_H
+
+/**
+  @file storage/perfschema/pfs_setup_object.h
+  Performance schema setup object (declarations).
+*/
+
+#include "pfs_lock.h"
+#include "lf.h"
+
+class String;
+struct PFS_global_param;
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+/** Hash key for @sa PFS_setup_object. */
+struct PFS_setup_object_key
+{
+  /**
+    Hash search key.
+    This has to be a string for LF_HASH,
+    the format is "<enum_object_type><schema_name><0x00><object_name><0x00>"
+  */
+  char m_hash_key[1 + NAME_LEN + 1 + NAME_LEN + 1];
+  uint m_key_length;
+};
+
+/** A setup_object record. */
+struct PFS_setup_object
+{
+  enum_object_type get_object_type()
+  {
+    return (enum_object_type) m_key.m_hash_key[0];
+  }
+
+  /** Internal lock. */
+  pfs_lock m_lock;
+  /** Hash key. */
+  PFS_setup_object_key m_key;
+  /** Schema name. Points inside m_key. */
+  const char *m_schema_name;
+  /** Length of @c m_schema_name. */
+  uint m_schema_name_length;
+  /** Object name. Points inside m_key. */
+  const char *m_object_name;
+  /** Length of @c m_object_name. */
+  uint m_object_name_length;
+  /** ENABLED flag. */
+  bool m_enabled;
+  /** TIMED flag. */
+  bool m_timed;
+};
+
+int init_setup_object(const PFS_global_param *param);
+void cleanup_setup_object(void);
+int init_setup_object_hash(void);
+void cleanup_setup_object_hash(void);
+
+int insert_setup_object(enum_object_type object_type, const String *schema,
+                        const String *object, bool enabled, bool timed);
+int delete_setup_object(enum_object_type object_type, const String *schema,
+                        const String *object);
+int reset_setup_object(void);
+long setup_object_count(void);
+
+void lookup_setup_object(PFS_thread *thread,
+                         enum_object_type object_type,
+                         const char *schema_name, int schema_name_length,
+                         const char *object_name, int object_name_length,
+                         bool *enabled, bool *timed);
+
+/* For iterators and show status. */
+
+extern ulong setup_object_max;
+
+/* Exposing the data directly, for iterators. */
+
+extern PFS_setup_object *setup_object_array;
+
+/** @} */
+#endif
+
diff --git a/storage/perfschema/pfs_stat.h b/storage/perfschema/pfs_stat.h
index 5955a515d1a..32c462b8ba2 100644
--- a/storage/perfschema/pfs_stat.h
+++ b/storage/perfschema/pfs_stat.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -16,6 +16,10 @@
 #ifndef PFS_STAT_H
 #define PFS_STAT_H
 
+#include "sql_const.h"
+/* memcpy */
+#include "string.h"
+
 /**
   @file storage/perfschema/pfs_stat.h
   Statistics (declarations).
@@ -26,16 +30,9 @@
   @{
 */
 
-/** Usage statistics chain, for a single value and its aggregates. */
-struct PFS_single_stat_chain
+/** Single statistic. */
+struct PFS_single_stat
 {
-  /**
-    Control flag.
-    Statistics are aggregated only if the control flag is true.
-  */
-  bool *m_control_flag;
-  /** Next link in the statistics chain. */
-  struct PFS_single_stat_chain *m_parent;
   /** Count of values. */
   ulonglong m_count;
   /** Sum of values. */
@@ -44,60 +41,104 @@ struct PFS_single_stat_chain
   ulonglong m_min;
   /** Maximum value. */
   ulonglong m_max;
+
+  PFS_single_stat()
+  {
+    m_count= 0;
+    m_sum= 0;
+    m_min= ULONGLONG_MAX;
+    m_max= 0;
+  }
+
+  inline void reset(void)
+  {
+    m_count= 0;
+    m_sum= 0;
+    m_min= ULONGLONG_MAX;
+    m_max= 0;
+  }
+
+  inline void aggregate(const PFS_single_stat *stat)
+  {
+    m_count+= stat->m_count;
+    m_sum+= stat->m_sum;
+    if (unlikely(m_min > stat->m_min))
+      m_min= stat->m_min;
+    if (unlikely(m_max < stat->m_max))
+      m_max= stat->m_max;
+  }
+
+  inline void aggregate_counted()
+  {
+    m_count++;
+  }
+
+  inline void aggregate_counted(ulonglong count)
+  {
+    m_count+= count;
+  }
+
+  inline void aggregate_value(ulonglong value)
+  {
+    m_count++;
+    m_sum+= value;
+    if (unlikely(m_min > value))
+      m_min= value;
+    if (unlikely(m_max < value))
+      m_max= value;
+  }
 };
 
-/**
-  Reset a single statistic link.
-  Only the current link is reset, parents are not affected.
-  @param stat                         the statistics link to reset
-*/
-inline void reset_single_stat_link(PFS_single_stat_chain *stat)
+/** Combined statistic. */
+struct PFS_byte_stat : public PFS_single_stat
 {
-  stat->m_count= 0;
-  stat->m_sum= 0;
-  stat->m_min= ULONGLONG_MAX;
-  stat->m_max= 0;
-}
+  /** Byte count statistics */
+  ulonglong m_bytes;
 
-/**
-  Aggregate a value to a statistic chain.
-  @param stat                         the aggregated statistic chain
-  @param value                        the value to aggregate
-*/
-inline void aggregate_single_stat_chain(PFS_single_stat_chain *stat,
-                                        ulonglong value)
-{
-  do
-  {
-    if (*stat->m_control_flag)
-    {
-      stat->m_count++;
-      stat->m_sum+= value;
-      if (stat->m_min > value)
-        stat->m_min= value;
-      if (stat->m_max < value)
-        stat->m_max= value;
-    }
-    stat= stat->m_parent;
-  }
-  while (stat);
-}
+  /* Aggregate wait stats, event count and byte count */
+  inline void aggregate(const PFS_byte_stat *stat)
+  {
+    PFS_single_stat::aggregate(stat);
+    m_bytes+= stat->m_bytes;
+  }
 
-/**
-  Increment the value counts in a statistic chain.
-  Used for instruments that are 'ENABLED' but not 'TIMED'.
-  @param stat                         the aggregated statistic chain
-*/
-inline void increment_single_stat_chain(PFS_single_stat_chain *stat)
-{
-  do
+  /* Aggregate individual wait time, event count and byte count */
+  inline void aggregate(ulonglong wait, ulonglong bytes)
+  {
+    aggregate_value(wait);
+    m_bytes+= bytes;
+  }
+
+  /* Aggregate wait stats and event count */
+  inline void aggregate_waits(const PFS_byte_stat *stat)
+  {
+    PFS_single_stat::aggregate(stat);
+  }
+
+  /* Aggregate event count and byte count */
+  inline void aggregate_counted()
+  {
+    PFS_single_stat::aggregate_counted();
+  }
+
+  /* Aggregate event count and byte count */
+  inline void aggregate_counted(ulonglong bytes)
+  {
+    PFS_single_stat::aggregate_counted();
+    m_bytes+= bytes;
+  }
+    
+  PFS_byte_stat()
   {
-    if (*stat->m_control_flag)
-      stat->m_count++;
-    stat= stat->m_parent;
+    reset();
   }
-  while (stat);
-}
+
+  inline void reset(void)
+  {
+    PFS_single_stat::reset();
+    m_bytes= 0;
+  }
+};
 
 /** Statistics for COND usage. */
 struct PFS_cond_stat
@@ -108,33 +149,440 @@ struct PFS_cond_stat
   ulonglong m_broadcast_count;
 };
 
+/** Statistics for FILE IO. Used for both waits and byte counts. */
+struct PFS_file_io_stat
+{
+  /** READ statistics */
+  PFS_byte_stat m_read;
+  /** WRITE statistics */
+  PFS_byte_stat m_write;
+  /** Miscelleanous statistics */
+  PFS_byte_stat m_misc;
+
+  inline void reset(void)
+  {
+    m_read.reset();
+    m_write.reset();
+    m_misc.reset();
+  }
+
+  inline void aggregate(const PFS_file_io_stat *stat)
+  {
+    m_read.aggregate(&stat->m_read);
+    m_write.aggregate(&stat->m_write);
+    m_misc.aggregate(&stat->m_misc);
+  }
+
+  /* Sum waits and byte counts */
+  inline void sum(PFS_byte_stat *stat)
+  {
+    stat->aggregate(&m_read);
+    stat->aggregate(&m_write);
+    stat->aggregate(&m_misc);
+  }
+
+  /* Sum waits only */
+  inline void sum_waits(PFS_single_stat *stat)
+  {
+    stat->aggregate(&m_read);
+    stat->aggregate(&m_write);
+    stat->aggregate(&m_misc);
+  }
+};
+
 /** Statistics for FILE usage. */
 struct PFS_file_stat
 {
   /** Number of current open handles. */
   ulong m_open_count;
-  /** Count of READ operations. */
-  ulonglong m_count_read;
-  /** Count of WRITE operations. */
-  ulonglong m_count_write;
-  /** Number of bytes read. */
-  ulonglong m_read_bytes;
-  /** Number of bytes written. */
-  ulonglong m_write_bytes;
+  /** File IO statistics. */
+  PFS_file_io_stat m_io_stat;
+
+  /** Reset file statistics. */
+  inline void reset(void)
+  {
+    m_io_stat.reset();
+  }
 };
 
-/**
-  Reset file statistic.
-  @param stat                         the statistics to reset
-*/
-inline void reset_file_stat(PFS_file_stat *stat)
+/** Statistics for stage usage. */
+struct PFS_stage_stat
+{
+  PFS_single_stat m_timer1_stat;
+
+  inline void reset(void)
+  { m_timer1_stat.reset(); }
+
+  inline void aggregate_counted()
+  { m_timer1_stat.aggregate_counted(); }
+
+  inline void aggregate_value(ulonglong value)
+  { m_timer1_stat.aggregate_value(value); }
+
+  inline void aggregate(PFS_stage_stat *stat)
+  { m_timer1_stat.aggregate(& stat->m_timer1_stat); }
+};
+
+/** Statistics for statement usage. */
+struct PFS_statement_stat
+{
+  PFS_single_stat m_timer1_stat;
+  ulonglong m_error_count;
+  ulonglong m_warning_count;
+  ulonglong m_rows_affected;
+  ulonglong m_lock_time;
+  ulonglong m_rows_sent;
+  ulonglong m_rows_examined;
+  ulonglong m_created_tmp_disk_tables;
+  ulonglong m_created_tmp_tables;
+  ulonglong m_select_full_join;
+  ulonglong m_select_full_range_join;
+  ulonglong m_select_range;
+  ulonglong m_select_range_check;
+  ulonglong m_select_scan;
+  ulonglong m_sort_merge_passes;
+  ulonglong m_sort_range;
+  ulonglong m_sort_rows;
+  ulonglong m_sort_scan;
+  ulonglong m_no_index_used;
+  ulonglong m_no_good_index_used;
+
+  PFS_statement_stat()
+  {
+    m_error_count= 0;
+    m_warning_count= 0;
+    m_rows_affected= 0;
+    m_lock_time= 0;
+    m_rows_sent= 0;
+    m_rows_examined= 0;
+    m_created_tmp_disk_tables= 0;
+    m_created_tmp_tables= 0;
+    m_select_full_join= 0;
+    m_select_full_range_join= 0;
+    m_select_range= 0;
+    m_select_range_check= 0;
+    m_select_scan= 0;
+    m_sort_merge_passes= 0;
+    m_sort_range= 0;
+    m_sort_rows= 0;
+    m_sort_scan= 0;
+    m_no_index_used= 0;
+    m_no_good_index_used= 0;
+  }
+
+  inline void reset(void)
+  {
+    m_timer1_stat.reset();
+    m_error_count= 0;
+    m_warning_count= 0;
+    m_rows_affected= 0;
+    m_lock_time= 0;
+    m_rows_sent= 0;
+    m_rows_examined= 0;
+    m_created_tmp_disk_tables= 0;
+    m_created_tmp_tables= 0;
+    m_select_full_join= 0;
+    m_select_full_range_join= 0;
+    m_select_range= 0;
+    m_select_range_check= 0;
+    m_select_scan= 0;
+    m_sort_merge_passes= 0;
+    m_sort_range= 0;
+    m_sort_rows= 0;
+    m_sort_scan= 0;
+    m_no_index_used= 0;
+    m_no_good_index_used= 0;
+  }
+
+  inline void aggregate_counted()
+  { m_timer1_stat.aggregate_counted(); }
+
+  inline void aggregate_value(ulonglong value)
+  { m_timer1_stat.aggregate_value(value); }
+
+  inline void aggregate(PFS_statement_stat *stat)
+  {
+    m_timer1_stat.aggregate(& stat->m_timer1_stat);
+
+    m_error_count+= stat->m_error_count;
+    m_warning_count+= stat->m_warning_count;
+    m_rows_affected+= stat->m_rows_affected;
+    m_lock_time+= stat->m_lock_time;
+    m_rows_sent+= stat->m_rows_sent;
+    m_rows_examined+= stat->m_rows_examined;
+    m_created_tmp_disk_tables+= stat->m_created_tmp_disk_tables;
+    m_created_tmp_tables+= stat->m_created_tmp_tables;
+    m_select_full_join+= stat->m_select_full_join;
+    m_select_full_range_join+= stat->m_select_full_range_join;
+    m_select_range+= stat->m_select_range;
+    m_select_range_check+= stat->m_select_range_check;
+    m_select_scan+= stat->m_select_scan;
+    m_sort_merge_passes+= stat->m_sort_merge_passes;
+    m_sort_range+= stat->m_sort_range;
+    m_sort_rows+= stat->m_sort_rows;
+    m_sort_scan+= stat->m_sort_scan;
+    m_no_index_used+= stat->m_no_index_used;
+    m_no_good_index_used+= stat->m_no_good_index_used;
+  }
+};
+
+/** Single table io statistic. */
+struct PFS_table_io_stat
 {
-  stat->m_open_count= 0;
-  stat->m_count_read= 0;
-  stat->m_count_write= 0;
-  stat->m_read_bytes= 0;
-  stat->m_write_bytes= 0;
-}
+  /** FETCH statistics */
+  PFS_single_stat m_fetch;
+  /** INSERT statistics */
+  PFS_single_stat m_insert;
+  /** UPDATE statistics */
+  PFS_single_stat m_update;
+  /** DELETE statistics */
+  PFS_single_stat m_delete;
+
+  inline void reset(void)
+  {
+    m_fetch.reset();
+    m_insert.reset();
+    m_update.reset();
+    m_delete.reset();
+  }
+
+  inline void aggregate(const PFS_table_io_stat *stat)
+  {
+    m_fetch.aggregate(&stat->m_fetch);
+    m_insert.aggregate(&stat->m_insert);
+    m_update.aggregate(&stat->m_update);
+    m_delete.aggregate(&stat->m_delete);
+  }
+
+  inline void sum(PFS_single_stat *result)
+  {
+    result->aggregate(& m_fetch);
+    result->aggregate(& m_insert);
+    result->aggregate(& m_update);
+    result->aggregate(& m_delete);
+  }
+};
+
+enum PFS_TL_LOCK_TYPE
+{
+  /* Locks from enum thr_lock */
+  PFS_TL_READ= 0,
+  PFS_TL_READ_WITH_SHARED_LOCKS= 1,
+  PFS_TL_READ_HIGH_PRIORITY= 2,
+  PFS_TL_READ_NO_INSERT= 3,
+  PFS_TL_WRITE_ALLOW_WRITE= 4,
+  PFS_TL_WRITE_CONCURRENT_INSERT= 5,
+  PFS_TL_WRITE_DELAYED= 6,
+  PFS_TL_WRITE_LOW_PRIORITY= 7,
+  PFS_TL_WRITE= 8,
+
+  /* Locks for handler::ha_external_lock() */
+  PFS_TL_READ_EXTERNAL= 9,
+  PFS_TL_WRITE_EXTERNAL= 10
+};
+
+#define COUNT_PFS_TL_LOCK_TYPE 11
+
+/** Statistics for table locks. */
+struct PFS_table_lock_stat
+{
+  PFS_single_stat m_stat[COUNT_PFS_TL_LOCK_TYPE];
+
+  inline void reset(void)
+  {
+    PFS_single_stat *pfs= & m_stat[0];
+    PFS_single_stat *pfs_last= & m_stat[COUNT_PFS_TL_LOCK_TYPE];
+    for ( ; pfs < pfs_last ; pfs++)
+      pfs->reset();
+  }
+
+  inline void aggregate(const PFS_table_lock_stat *stat)
+  {
+    PFS_single_stat *pfs= & m_stat[0];
+    PFS_single_stat *pfs_last= & m_stat[COUNT_PFS_TL_LOCK_TYPE];
+    const PFS_single_stat *pfs_from= & stat->m_stat[0];
+    for ( ; pfs < pfs_last ; pfs++, pfs_from++)
+      pfs->aggregate(pfs_from);
+  }
+
+  inline void sum(PFS_single_stat *result)
+  {
+    PFS_single_stat *pfs= & m_stat[0];
+    PFS_single_stat *pfs_last= & m_stat[COUNT_PFS_TL_LOCK_TYPE];
+    for ( ; pfs < pfs_last ; pfs++)
+      result->aggregate(pfs);
+  }
+};
+
+/** Statistics for TABLE usage. */
+struct PFS_table_stat
+{
+  /**
+    Statistics, per index.
+    Each index stat is in [0, MAX_KEY-1],
+    stats when using no index are in [MAX_KEY].
+  */
+  PFS_table_io_stat m_index_stat[MAX_KEY + 1];
+
+  /**
+    Statistics, per lock type.
+  */
+  PFS_table_lock_stat m_lock_stat;
+
+  /** Reset table io statistic. */
+  inline void reset_io(void)
+  {
+    PFS_table_io_stat *stat= & m_index_stat[0];
+    PFS_table_io_stat *stat_last= & m_index_stat[MAX_KEY + 1];
+    for ( ; stat < stat_last ; stat++)
+      stat->reset();
+  }
+
+  /** Reset table lock statistic. */
+  inline void reset_lock(void)
+  {
+    m_lock_stat.reset();
+  }
+
+  /** Reset table statistic. */
+  inline void reset(void)
+  {
+    reset_io();
+    reset_lock();
+  }
+
+  inline void fast_reset_io(void)
+  {
+    memcpy(& m_index_stat, & g_reset_template.m_index_stat, sizeof(m_index_stat));
+  }
+
+  inline void fast_reset_lock(void)
+  {
+    memcpy(& m_lock_stat, & g_reset_template.m_lock_stat, sizeof(m_lock_stat));
+  }
+
+  inline void fast_reset(void)
+  {
+    memcpy(this, & g_reset_template, sizeof(*this));
+  }
+
+  inline void aggregate_io(const PFS_table_stat *stat)
+  {
+    PFS_table_io_stat *to_stat= & m_index_stat[0];
+    PFS_table_io_stat *to_stat_last= & m_index_stat[MAX_KEY + 1];
+    const PFS_table_io_stat *from_stat= & stat->m_index_stat[0];
+    for ( ; to_stat < to_stat_last ; from_stat++, to_stat++)
+      to_stat->aggregate(from_stat);
+  }
+
+  inline void aggregate_lock(const PFS_table_stat *stat)
+  {
+    m_lock_stat.aggregate(& stat->m_lock_stat);
+  }
+
+  inline void aggregate(const PFS_table_stat *stat)
+  {
+    aggregate_io(stat);
+    aggregate_lock(stat);
+  }
+
+  inline void sum_io(PFS_single_stat *result)
+  {
+    PFS_table_io_stat *stat= & m_index_stat[0];
+    PFS_table_io_stat *stat_last= & m_index_stat[MAX_KEY + 1];
+    for ( ; stat < stat_last ; stat++)
+      stat->sum(result);
+  }
+
+  inline void sum_lock(PFS_single_stat *result)
+  {
+    m_lock_stat.sum(result);
+  }
+
+  inline void sum(PFS_single_stat *result)
+  {
+    sum_io(result);
+    sum_lock(result);
+  }
+
+  static struct PFS_table_stat g_reset_template;
+};
+
+/** Statistics for SOCKET IO. Used for both waits and byte counts. */
+struct PFS_socket_io_stat
+{
+  /** READ statistics */
+  PFS_byte_stat m_read;
+  /** WRITE statistics */
+  PFS_byte_stat m_write;
+  /** Miscelleanous statistics */
+  PFS_byte_stat m_misc;
+
+  inline void reset(void)
+  {
+    m_read.reset();
+    m_write.reset();
+    m_misc.reset();
+  }
+
+  inline void aggregate(const PFS_socket_io_stat *stat)
+  {
+    m_read.aggregate(&stat->m_read);
+    m_write.aggregate(&stat->m_write);
+    m_misc.aggregate(&stat->m_misc);
+  }
+
+  /* Sum waits and byte counts */
+  inline void sum(PFS_byte_stat *stat)
+  {
+    stat->aggregate(&m_read);
+    stat->aggregate(&m_write);
+    stat->aggregate(&m_misc);
+  }
+
+  /* Sum waits only */
+  inline void sum_waits(PFS_single_stat *stat)
+  {
+    stat->aggregate(&m_read);
+    stat->aggregate(&m_write);
+    stat->aggregate(&m_misc);
+  }
+};
+
+/** Statistics for SOCKET usage. */
+struct PFS_socket_stat
+{
+  /** Socket timing and byte count statistics per operation */
+  PFS_socket_io_stat m_io_stat;
+
+  /** Reset socket statistics. */
+  inline void reset(void)
+  {
+    m_io_stat.reset();
+  }
+};
+
+struct PFS_connection_stat
+{
+  PFS_connection_stat()
+  : m_current_connections(0),
+    m_total_connections(0)
+  {}
+
+  ulonglong m_current_connections;
+  ulonglong m_total_connections;
+
+  inline void aggregate_active(ulonglong active)
+  {
+    m_current_connections+= active;
+    m_total_connections+= active;
+  }
+
+  inline void aggregate_disconnected(ulonglong disconnected)
+  {
+    m_total_connections+= disconnected;
+  }
+};
 
 /** @} */
 #endif
diff --git a/storage/perfschema/pfs_timer.cc b/storage/perfschema/pfs_timer.cc
index 302548c97c2..3d8d2e07ce5 100644
--- a/storage/perfschema/pfs_timer.cc
+++ b/storage/perfschema/pfs_timer.cc
@@ -1,5 +1,4 @@
-/* Copyright (c) 2008 MySQL AB, 2010 Sun Microsystems, Inc.
-   Use is subject to license terms.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -23,7 +22,10 @@
 #include "pfs_timer.h"
 #include "my_rdtsc.h"
 
+enum_timer_name idle_timer= TIMER_NAME_MICROSEC;
 enum_timer_name wait_timer= TIMER_NAME_CYCLE;
+enum_timer_name stage_timer= TIMER_NAME_NANOSEC;
+enum_timer_name statement_timer= TIMER_NAME_NANOSEC;
 MY_TIMER_INFO pfs_timer_info;
 
 static ulonglong cycle_v0;
@@ -38,6 +40,17 @@ static ulong microsec_to_pico; /* In theory, 1 000 000 */
 static ulong millisec_to_pico; /* In theory, 1 000 000 000, fits in uint32 */
 static ulonglong tick_to_pico; /* 1e10 at 100 Hz, 1.666e10 at 60 Hz */
 
+/* Indexed by enum enum_timer_name */
+static struct time_normalizer to_pico_data[FIRST_TIMER_NAME + COUNT_TIMER_NAME]=
+{
+  { 0, 0}, /* unused */
+  { 0, 0}, /* cycle */
+  { 0, 0}, /* nanosec */
+  { 0, 0}, /* microsec */
+  { 0, 0}, /* millisec */
+  { 0, 0}  /* tick */
+};
+
 static inline ulong round_to_ulong(double value)
 {
   return (ulong) (value + 0.5);
@@ -89,13 +102,75 @@ void init_timers(void)
                                      (double)pfs_timer_info.ticks.frequency);
   else
     tick_to_pico= 0;
+
+  to_pico_data[TIMER_NAME_CYCLE].m_v0= cycle_v0;
+  to_pico_data[TIMER_NAME_CYCLE].m_factor= cycle_to_pico;
+
+  to_pico_data[TIMER_NAME_NANOSEC].m_v0= nanosec_v0;
+  to_pico_data[TIMER_NAME_NANOSEC].m_factor= nanosec_to_pico;
+
+  to_pico_data[TIMER_NAME_MICROSEC].m_v0= microsec_v0;
+  to_pico_data[TIMER_NAME_MICROSEC].m_factor= microsec_to_pico;
+
+  to_pico_data[TIMER_NAME_MILLISEC].m_v0= millisec_v0;
+  to_pico_data[TIMER_NAME_MILLISEC].m_factor= millisec_to_pico;
+
+  to_pico_data[TIMER_NAME_TICK].m_v0= tick_v0;
+  to_pico_data[TIMER_NAME_TICK].m_factor= tick_to_pico;
+}
+
+ulonglong get_timer_raw_value(enum_timer_name timer_name)
+{
+  switch (timer_name)
+  {
+  case TIMER_NAME_CYCLE:
+    return my_timer_cycles();
+  case TIMER_NAME_NANOSEC:
+    return my_timer_nanoseconds();
+  case TIMER_NAME_MICROSEC:
+    return my_timer_microseconds();
+  case TIMER_NAME_MILLISEC:
+    return my_timer_milliseconds();
+  case TIMER_NAME_TICK:
+    return my_timer_ticks();
+  default:
+    DBUG_ASSERT(false);
+  }
+  return 0;
+}
+
+ulonglong get_timer_raw_value_and_function(enum_timer_name timer_name, timer_fct_t *fct)
+{
+  switch (timer_name)
+  {
+  case TIMER_NAME_CYCLE:
+    *fct= my_timer_cycles;
+    return my_timer_cycles();
+  case TIMER_NAME_NANOSEC:
+    *fct= my_timer_nanoseconds;
+    return my_timer_nanoseconds();
+  case TIMER_NAME_MICROSEC:
+    *fct= my_timer_microseconds;
+    return my_timer_microseconds();
+  case TIMER_NAME_MILLISEC:
+    *fct= my_timer_milliseconds;
+    return my_timer_milliseconds();
+  case TIMER_NAME_TICK:
+    *fct= my_timer_ticks;
+    return my_timer_ticks();
+  default:
+    *fct= NULL;
+    DBUG_ASSERT(false);
+  }
+  return 0;
 }
 
-ulonglong get_timer_value(enum_timer_name timer_name)
+ulonglong get_timer_pico_value(enum_timer_name timer_name)
 {
   ulonglong result;
 
-  switch (timer_name) {
+  switch (timer_name)
+  {
   case TIMER_NAME_CYCLE:
     result= (my_timer_cycles() - cycle_v0) * cycle_to_pico;
     break;
@@ -118,3 +193,38 @@ ulonglong get_timer_value(enum_timer_name timer_name)
   return result;
 }
 
+time_normalizer* time_normalizer::get(enum_timer_name timer_name)
+{
+  uint index= static_cast<uint> (timer_name);
+
+  DBUG_ASSERT(index >= FIRST_TIMER_NAME);
+  DBUG_ASSERT(index <= LAST_TIMER_NAME);
+
+  return & to_pico_data[index];
+}
+
+void time_normalizer::to_pico(ulonglong start, ulonglong end,
+                              ulonglong *pico_start, ulonglong *pico_end, ulonglong *pico_wait)
+{
+  if (start == 0)
+  {
+    *pico_start= 0;
+    *pico_end= 0;
+    *pico_wait= 0;
+  }
+  else
+  {
+    *pico_start= (start - m_v0) * m_factor;
+    if (end == 0)
+    {
+      *pico_end= 0;
+      *pico_wait= 0;
+    }
+    else
+    {
+      *pico_end= (end - m_v0) * m_factor;
+      *pico_wait= (end - start) * m_factor;
+    }
+  }
+}
+
diff --git a/storage/perfschema/pfs_timer.h b/storage/perfschema/pfs_timer.h
index 6736a977ab9..1cae20e89dd 100644
--- a/storage/perfschema/pfs_timer.h
+++ b/storage/perfschema/pfs_timer.h
@@ -1,5 +1,4 @@
-/* Copyright (c) 2008 MySQL AB, 2010 Sun Microsystems, Inc.
-   Use is subject to license terms.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -24,12 +23,118 @@
 #include <my_rdtsc.h>
 #include "pfs_column_types.h"
 
+/** Conversion factor, from micro seconds to pico seconds. */
+#define MICROSEC_TO_PICOSEC 1000000
+
+/**
+  A time normalizer.
+  A time normalizer consist of a transformation that
+  converts raw timer values (expressed in the timer unit)
+  to normalized values, expressed in picoseconds.
+*/
+struct time_normalizer
+{
+  /**
+    Get a time normalizer for a given timer.
+    @param timer_name the timer name
+    @return the normalizer for the timer
+  */
+  static time_normalizer* get(enum_timer_name timer_name);
+
+  /** Timer value at server statup. */
+  ulonglong m_v0;
+  /** Conversion factor from timer values to pico seconds. */
+  ulonglong m_factor;
+
+  /**
+    Convert a wait from timer units to pico seconds.
+    @param wait a wait, expressed in timer units
+    @return the wait, expressed in pico seconds
+  */
+  inline ulonglong wait_to_pico(ulonglong wait)
+  {
+    return wait * m_factor;
+  }
+
+  /**
+    Convert a time from timer units to pico seconds.
+    @param t a time, expressed in timer units
+    @return the time, expressed in pico seconds
+  */
+  inline ulonglong time_to_pico(ulonglong t)
+  {
+    return (t == 0 ? 0 : (t - m_v0) * m_factor);
+  }
+
+  /**
+    Convert start / end times from timer units to pico seconds.
+    @param start start time, expressed in timer units
+    @param end end time, expressed in timer units
+    @param[out] pico_start start time, expressed in pico seconds
+    @param[out] pico_end end time, expressed in pico seconds
+    @param[out] pico_wait wait time, expressed in pico seconds
+  */
+  void to_pico(ulonglong start, ulonglong end,
+               ulonglong *pico_start, ulonglong *pico_end, ulonglong *pico_wait);
+};
+
+/**
+  Idle timer.
+  The timer used to measure all idle events.
+*/
+extern enum_timer_name idle_timer;
+/**
+  Wait timer.
+  The timer used to measure all wait events.
+*/
 extern enum_timer_name wait_timer;
+/**
+  Stage timer.
+  The timer used to measure all stage events.
+*/
+extern enum_timer_name stage_timer;
+/**
+  Statement timer.
+  The timer used to measure all statement events.
+*/
+extern enum_timer_name statement_timer;
+/**
+  Timer information data.
+  Characteristics about each suported timer.
+*/
 extern MY_TIMER_INFO pfs_timer_info;
 
+/** Initialize the timer component. */
 void init_timers();
 
-ulonglong get_timer_value(enum_timer_name timer_name);
+extern "C"
+{
+  /** A timer function. */
+  typedef ulonglong (*timer_fct_t)(void);
+}
+
+/**
+  Get a timer value, in pico seconds.
+  @param timer_name the timer to use
+  @return timer value, in pico seconds
+*/
+ulonglong get_timer_pico_value(enum_timer_name timer_name);
+/**
+  Get a timer value, in timer units.
+  @param timer_name the timer to use
+  @return timer value, in timer units
+*/
+ulonglong get_timer_raw_value(enum_timer_name timer_name);
+/**
+  Get a timer value and function, in timer units.
+  This function is useful when code needs to call the same timer several times.
+  The returned timer function can be invoked directly, which avoids having to
+  resolve the timer by name for each call.
+  @param timer_name the timer to use
+  @param[out] fct the timer function
+  @return timer value, in timer units
+*/
+ulonglong get_timer_raw_value_and_function(enum_timer_name timer_name, timer_fct_t *fct);
 
 #endif
 
diff --git a/storage/perfschema/pfs_user.cc b/storage/perfschema/pfs_user.cc
new file mode 100644
index 00000000000..d7794a131a1
--- /dev/null
+++ b/storage/perfschema/pfs_user.cc
@@ -0,0 +1,381 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/pfs_user.cc
+  Performance schema user (implementation).
+*/
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs.h"
+#include "pfs_stat.h"
+#include "pfs_instr.h"
+#include "pfs_setup_actor.h"
+#include "pfs_user.h"
+#include "pfs_global.h"
+#include "pfs_instr_class.h"
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+ulong user_max;
+ulong user_lost;
+
+PFS_user *user_array= NULL;
+
+static PFS_single_stat *user_instr_class_waits_array= NULL;
+static PFS_stage_stat *user_instr_class_stages_array= NULL;
+static PFS_statement_stat *user_instr_class_statements_array= NULL;
+
+static LF_HASH user_hash;
+static bool user_hash_inited= false;
+
+/**
+  Initialize the user buffers.
+  @param param                        sizing parameters
+  @return 0 on success
+*/
+int init_user(const PFS_global_param *param)
+{
+  uint index;
+
+  user_max= param->m_user_sizing;
+
+  user_array= NULL;
+  user_instr_class_waits_array= NULL;
+  user_instr_class_stages_array= NULL;
+  user_instr_class_statements_array= NULL;
+  uint waits_sizing= user_max * wait_class_max;
+  uint stages_sizing= user_max * stage_class_max;
+  uint statements_sizing= user_max * statement_class_max;
+
+  if (user_max > 0)
+  {
+    user_array= PFS_MALLOC_ARRAY(user_max, PFS_user,
+                                 MYF(MY_ZEROFILL));
+    if (unlikely(user_array == NULL))
+      return 1;
+  }
+
+  if (waits_sizing > 0)
+  {
+    user_instr_class_waits_array=
+      PFS_connection_slice::alloc_waits_slice(waits_sizing);
+    if (unlikely(user_instr_class_waits_array == NULL))
+      return 1;
+  }
+
+  if (stages_sizing > 0)
+  {
+    user_instr_class_stages_array=
+      PFS_connection_slice::alloc_stages_slice(stages_sizing);
+    if (unlikely(user_instr_class_stages_array == NULL))
+      return 1;
+  }
+
+  if (statements_sizing > 0)
+  {
+    user_instr_class_statements_array=
+      PFS_connection_slice::alloc_statements_slice(statements_sizing);
+    if (unlikely(user_instr_class_statements_array == NULL))
+      return 1;
+  }
+
+  for (index= 0; index < user_max; index++)
+  {
+    user_array[index].m_instr_class_waits_stats=
+      &user_instr_class_waits_array[index * wait_class_max];
+    user_array[index].m_instr_class_stages_stats=
+      &user_instr_class_stages_array[index * stage_class_max];
+    user_array[index].m_instr_class_statements_stats=
+      &user_instr_class_statements_array[index * statement_class_max];
+  }
+
+  return 0;
+}
+
+/** Cleanup all the user buffers. */
+void cleanup_user(void)
+{
+  pfs_free(user_array);
+  user_array= NULL;
+  pfs_free(user_instr_class_waits_array);
+  user_instr_class_waits_array= NULL;
+  pfs_free(user_instr_class_stages_array);
+  user_instr_class_stages_array= NULL;
+  pfs_free(user_instr_class_statements_array);
+  user_instr_class_statements_array= NULL;
+  user_max= 0;
+}
+
+C_MODE_START
+static uchar *user_hash_get_key(const uchar *entry, size_t *length,
+                                my_bool)
+{
+  const PFS_user * const *typed_entry;
+  const PFS_user *user;
+  const void *result;
+  typed_entry= reinterpret_cast<const PFS_user* const *> (entry);
+  DBUG_ASSERT(typed_entry != NULL);
+  user= *typed_entry;
+  DBUG_ASSERT(user != NULL);
+  *length= user->m_key.m_key_length;
+  result= user->m_key.m_hash_key;
+  return const_cast<uchar*> (reinterpret_cast<const uchar*> (result));
+}
+C_MODE_END
+
+/**
+  Initialize the user hash.
+  @return 0 on success
+*/
+int init_user_hash(void)
+{
+  if (! user_hash_inited)
+  {
+    lf_hash_init(&user_hash, sizeof(PFS_user*), LF_HASH_UNIQUE,
+                 0, 0, user_hash_get_key, &my_charset_bin);
+    user_hash_inited= true;
+  }
+  return 0;
+}
+
+/** Cleanup the user hash. */
+void cleanup_user_hash(void)
+{
+  if (user_hash_inited)
+  {
+    lf_hash_destroy(&user_hash);
+    user_hash_inited= false;
+  }
+}
+
+static LF_PINS* get_user_hash_pins(PFS_thread *thread)
+{
+  if (unlikely(thread->m_user_hash_pins == NULL))
+  {
+    if (! user_hash_inited)
+      return NULL;
+    thread->m_user_hash_pins= lf_hash_get_pins(&user_hash);
+  }
+  return thread->m_user_hash_pins;
+}
+
+static void set_user_key(PFS_user_key *key,
+                         const char *user, uint user_length)
+{
+  DBUG_ASSERT(user_length <= USERNAME_LENGTH);
+
+  char *ptr= &key->m_hash_key[0];
+  if (user_length > 0)
+  {
+    memcpy(ptr, user, user_length);
+    ptr+= user_length;
+  }
+  ptr[0]= 0;
+  ptr++;
+  key->m_key_length= ptr - &key->m_hash_key[0];
+}
+
+PFS_user *
+find_or_create_user(PFS_thread *thread,
+                    const char *username, uint username_length)
+{
+  if (user_max == 0)
+  {
+    user_lost++;
+    return NULL;
+  }
+
+  LF_PINS *pins= get_user_hash_pins(thread);
+  if (unlikely(pins == NULL))
+  {
+    user_lost++;
+    return NULL;
+  }
+
+  PFS_user_key key;
+  set_user_key(&key, username, username_length);
+
+  PFS_user **entry;
+  uint retry_count= 0;
+  const uint retry_max= 3;
+
+search:
+  entry= reinterpret_cast<PFS_user**>
+    (lf_hash_search(&user_hash, pins,
+                    key.m_hash_key, key.m_key_length));
+  if (entry && (entry != MY_ERRPTR))
+  {
+    PFS_user *pfs;
+    pfs= *entry;
+    pfs->inc_refcount();
+    lf_hash_search_unpin(pins);
+    return pfs;
+  }
+
+  lf_hash_search_unpin(pins);
+
+  PFS_scan scan;
+  uint random= randomized_index(username, user_max);
+
+  for (scan.init(random, user_max);
+       scan.has_pass();
+       scan.next_pass())
+  {
+    PFS_user *pfs= user_array + scan.first();
+    PFS_user *pfs_last= user_array + scan.last();
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if (pfs->m_lock.is_free())
+      {
+        if (pfs->m_lock.free_to_dirty())
+        {
+          pfs->m_key= key;
+          if (username_length > 0)
+            pfs->m_username= &pfs->m_key.m_hash_key[0];
+          else
+            pfs->m_username= NULL;
+          pfs->m_username_length= username_length;
+
+          pfs->init_refcount();
+          pfs->reset_stats();
+          pfs->m_disconnected_count= 0;
+
+          int res;
+          res= lf_hash_insert(&user_hash, pins, &pfs);
+          if (likely(res == 0))
+          {
+            pfs->m_lock.dirty_to_allocated();
+            return pfs;
+          }
+
+          pfs->m_lock.dirty_to_free();
+
+          if (res > 0)
+          {
+            if (++retry_count > retry_max)
+            {
+              user_lost++;
+              return NULL;
+            }
+            goto search;
+          }
+
+          user_lost++;
+          return NULL;
+        }
+      }
+    }
+  }
+
+  user_lost++;
+  return NULL;
+}
+
+void PFS_user::aggregate()
+{
+  aggregate_waits();
+  aggregate_stages();
+  aggregate_statements();
+  aggregate_stats();
+}
+
+void PFS_user::aggregate_waits()
+{
+  /* No parent to aggregate to, clean the stats */
+  reset_waits_stats();
+}
+
+void PFS_user::aggregate_stages()
+{
+  /* No parent to aggregate to, clean the stats */
+  reset_stages_stats();
+}
+
+void PFS_user::aggregate_statements()
+{
+  /* No parent to aggregate to, clean the stats */
+  reset_statements_stats();
+}
+
+void PFS_user::aggregate_stats()
+{
+  /* No parent to aggregate to, clean the stats */
+  m_disconnected_count= 0;
+}
+
+void PFS_user::release()
+{
+  dec_refcount();
+}
+
+PFS_user *sanitize_user(PFS_user *unsafe)
+{
+  if ((&user_array[0] <= unsafe) &&
+      (unsafe < &user_array[user_max]))
+    return unsafe;
+  return NULL;
+}
+
+void purge_user(PFS_thread *thread, PFS_user *user)
+{
+  LF_PINS *pins= get_user_hash_pins(thread);
+  if (unlikely(pins == NULL))
+    return;
+
+  PFS_user **entry;
+  entry= reinterpret_cast<PFS_user**>
+    (lf_hash_search(&user_hash, pins,
+                    user->m_key.m_hash_key, user->m_key.m_key_length));
+  if (entry && (entry != MY_ERRPTR))
+  {
+    DBUG_ASSERT(*entry == user);
+    if (user->get_refcount() == 0)
+    {
+      lf_hash_delete(&user_hash, pins,
+                     user->m_key.m_hash_key, user->m_key.m_key_length);
+      user->m_lock.allocated_to_free();
+    }
+  }
+
+  lf_hash_search_unpin(pins);
+}
+
+/** Purge non connected users, reset stats of connected users. */
+void purge_all_user(void)
+{
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return;
+
+  PFS_user *pfs= user_array;
+  PFS_user *pfs_last= user_array + user_max;
+
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+    {
+      pfs->aggregate();
+      if (pfs->get_refcount() == 0)
+        purge_user(thread, pfs);
+    }
+  }
+}
+
+/** @} */
diff --git a/storage/perfschema/pfs_user.h b/storage/perfschema/pfs_user.h
new file mode 100644
index 00000000000..0f937c6c927
--- /dev/null
+++ b/storage/perfschema/pfs_user.h
@@ -0,0 +1,113 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef PFS_USER_H
+#define PFS_USER_H
+
+/**
+  @file storage/perfschema/pfs_user.h
+  Performance schema user (declarations).
+*/
+
+#include "pfs_lock.h"
+#include "lf.h"
+#include "pfs_con_slice.h"
+
+struct PFS_global_param;
+struct PFS_thread;
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+struct PFS_user_key
+{
+  /**
+    Hash search key.
+    This has to be a string for LF_HASH,
+    the format is "<username><0x00>"
+  */
+  char m_hash_key[USERNAME_LENGTH + 1];
+  uint m_key_length;
+};
+
+struct PFS_user : public PFS_connection_slice
+{
+public:
+  inline void init_refcount(void)
+  {
+    PFS_atomic::store_32(& m_refcount, 1);
+  }
+
+  inline int get_refcount(void)
+  {
+    return PFS_atomic::load_32(& m_refcount);
+  }
+
+  inline void inc_refcount(void)
+  {
+    PFS_atomic::add_32(& m_refcount, 1);
+  }
+
+  inline void dec_refcount(void)
+  {
+    PFS_atomic::add_32(& m_refcount, -1);
+  }
+
+  void aggregate(void);
+  void aggregate_waits(void);
+  void aggregate_stages(void);
+  void aggregate_statements(void);
+  void aggregate_stats(void);
+  void release(void);
+
+  /** Internal lock. */
+  pfs_lock m_lock;
+  PFS_user_key m_key;
+  const char *m_username;
+  uint m_username_length;
+
+  ulonglong m_disconnected_count;
+
+private:
+  int m_refcount;
+};
+
+int init_user(const PFS_global_param *param);
+void cleanup_user(void);
+int init_user_hash(void);
+void cleanup_user_hash(void);
+
+PFS_user *
+find_or_create_user(PFS_thread *thread,
+                    const char *username, uint username_length);
+
+PFS_user *sanitize_user(PFS_user *unsafe);
+void purge_all_user(void);
+
+
+/* For iterators and show status. */
+
+extern ulong user_max;
+extern ulong user_lost;
+
+/* Exposing the data directly, for iterators. */
+
+extern PFS_user *user_array;
+
+/** @} */
+#endif
+
diff --git a/storage/perfschema/pfs_visitor.cc b/storage/perfschema/pfs_visitor.cc
new file mode 100644
index 00000000000..fe2b16a2f76
--- /dev/null
+++ b/storage/perfschema/pfs_visitor.cc
@@ -0,0 +1,1161 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include "my_global.h"
+#include "my_sys.h"
+#include "pfs_visitor.h"
+#include "pfs_instr.h"
+#include "pfs_instr_class.h"
+#include "pfs_user.h"
+#include "pfs_host.h"
+#include "pfs_account.h"
+
+/**
+  @file storage/perfschema/pfs_visitor.cc
+  Visitors (implementation).
+*/
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+/** Connection iterator */
+void PFS_connection_iterator::visit_global(bool with_hosts, bool with_users,
+                                           bool with_accounts, bool with_threads,
+                                           PFS_connection_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_global();
+
+  if (with_hosts)
+  {
+    PFS_host *pfs= host_array;
+    PFS_host *pfs_last= pfs + host_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if (pfs->m_lock.is_populated())
+        visitor->visit_host(pfs);
+    }
+  }
+
+  if (with_users)
+  {
+    PFS_user *pfs= user_array;
+    PFS_user *pfs_last= pfs + user_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if (pfs->m_lock.is_populated())
+        visitor->visit_user(pfs);
+    }
+  }
+
+  if (with_accounts)
+  {
+    PFS_account *pfs= account_array;
+    PFS_account *pfs_last= pfs + account_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if (pfs->m_lock.is_populated())
+        visitor->visit_account(pfs);
+    }
+  }
+
+  if (with_threads)
+  {
+    PFS_thread *pfs= thread_array;
+    PFS_thread *pfs_last= pfs + thread_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if (pfs->m_lock.is_populated())
+        visitor->visit_thread(pfs);
+    }
+  }
+}
+
+void PFS_connection_iterator::visit_host(PFS_host *host,
+                                         bool with_accounts, bool with_threads,
+                                         PFS_connection_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_host(host);
+
+  if (with_accounts)
+  {
+    PFS_account *pfs= account_array;
+    PFS_account *pfs_last= pfs + account_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if ((pfs->m_host == host) && pfs->m_lock.is_populated())
+      {
+        visitor->visit_account(pfs);
+      }
+    }
+  }
+
+  if (with_threads)
+  {
+    PFS_thread *pfs= thread_array;
+    PFS_thread *pfs_last= pfs + thread_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if (pfs->m_lock.is_populated())
+      {
+        PFS_account *safe_account= sanitize_account(pfs->m_account);
+        if ((safe_account != NULL) && (safe_account->m_host == host))
+        {
+          /*
+            If the thread belongs to a known user@host that belongs to this host,
+            process it.
+          */
+          visitor->visit_thread(pfs);
+        }
+        else if (pfs->m_host == host)
+        {
+          /*
+            If the thread belongs to a 'lost' user@host that belong to this host,
+            process it.
+          */
+          visitor->visit_thread(pfs);
+        }
+      }
+    }
+  }
+}
+
+void PFS_connection_iterator::visit_user(PFS_user *user,
+                                         bool with_accounts, bool with_threads,
+                                         PFS_connection_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_user(user);
+
+  if (with_accounts)
+  {
+    PFS_account *pfs= account_array;
+    PFS_account *pfs_last= pfs + account_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if ((pfs->m_user == user) && pfs->m_lock.is_populated())
+      {
+        visitor->visit_account(pfs);
+      }
+    }
+  }
+
+  if (with_threads)
+  {
+    PFS_thread *pfs= thread_array;
+    PFS_thread *pfs_last= pfs + thread_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if (pfs->m_lock.is_populated())
+      {
+        PFS_account *safe_account= sanitize_account(pfs->m_account);
+        if ((safe_account != NULL) && (safe_account->m_user == user))
+        {
+          /*
+            If the thread belongs to a known user@host that belongs to this user,
+            process it.
+          */
+          visitor->visit_thread(pfs);
+        }
+        else if (pfs->m_user == user)
+        {
+          /*
+            If the thread belongs to a 'lost' user@host that belong to this user,
+            process it.
+          */
+          visitor->visit_thread(pfs);
+        }
+      }
+    }
+  }
+}
+
+void PFS_connection_iterator::visit_account(PFS_account *account,
+                                              bool with_threads,
+                                              PFS_connection_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_account(account);
+
+  if (with_threads)
+  {
+    PFS_thread *pfs= thread_array;
+    PFS_thread *pfs_last= pfs + thread_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if ((pfs->m_account == account) && pfs->m_lock.is_populated())
+      {
+        visitor->visit_thread(pfs);
+      }
+    }
+  }
+}
+
+void PFS_instance_iterator::visit_all(PFS_instance_visitor *visitor)
+{
+  visit_all_mutex(visitor);
+  visit_all_rwlock(visitor);
+  visit_all_cond(visitor);
+  visit_all_file(visitor);
+}
+
+void PFS_instance_iterator::visit_all_mutex(PFS_instance_visitor *visitor)
+{
+  visit_all_mutex_classes(visitor);
+  visit_all_mutex_instances(visitor);
+}
+
+void PFS_instance_iterator::visit_all_mutex_classes(PFS_instance_visitor *visitor)
+{
+  PFS_mutex_class *pfs= mutex_class_array;
+  PFS_mutex_class *pfs_last= pfs + mutex_class_max;
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_name_length != 0)
+    {
+      visitor->visit_mutex_class(pfs);
+    }
+  }
+}
+
+void PFS_instance_iterator::visit_all_mutex_instances(PFS_instance_visitor *visitor)
+{
+  PFS_mutex *pfs= mutex_array;
+  PFS_mutex *pfs_last= pfs + mutex_max;
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+    {
+      visitor->visit_mutex(pfs);
+    }
+  }
+}
+
+void PFS_instance_iterator::visit_all_rwlock(PFS_instance_visitor *visitor)
+{
+  visit_all_rwlock_classes(visitor);
+  visit_all_rwlock_instances(visitor);
+}
+
+void PFS_instance_iterator::visit_all_rwlock_classes(PFS_instance_visitor *visitor)
+{
+  PFS_rwlock_class *pfs= rwlock_class_array;
+  PFS_rwlock_class *pfs_last= pfs + rwlock_class_max;
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_name_length != 0)
+    {
+      visitor->visit_rwlock_class(pfs);
+    }
+  }
+}
+
+void PFS_instance_iterator::visit_all_rwlock_instances(PFS_instance_visitor *visitor)
+{
+  PFS_rwlock *pfs= rwlock_array;
+  PFS_rwlock *pfs_last= pfs + rwlock_max;
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+    {
+      visitor->visit_rwlock(pfs);
+    }
+  }
+}
+
+void PFS_instance_iterator::visit_all_cond(PFS_instance_visitor *visitor)
+{
+  visit_all_cond_classes(visitor);
+  visit_all_cond_instances(visitor);
+}
+
+void PFS_instance_iterator::visit_all_cond_classes(PFS_instance_visitor *visitor)
+{
+  PFS_cond_class *pfs= cond_class_array;
+  PFS_cond_class *pfs_last= pfs + cond_class_max;
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_name_length != 0)
+    {
+      visitor->visit_cond_class(pfs);
+    }
+  }
+}
+
+void PFS_instance_iterator::visit_all_cond_instances(PFS_instance_visitor *visitor)
+{
+  PFS_cond *pfs= cond_array;
+  PFS_cond *pfs_last= pfs + cond_max;
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+    {
+      visitor->visit_cond(pfs);
+    }
+  }
+}
+
+void PFS_instance_iterator::visit_all_file(PFS_instance_visitor *visitor)
+{
+  visit_all_file_classes(visitor);
+  visit_all_file_instances(visitor);
+}
+
+void PFS_instance_iterator::visit_all_file_classes(PFS_instance_visitor *visitor)
+{
+  PFS_file_class *pfs= file_class_array;
+  PFS_file_class *pfs_last= pfs + file_class_max;
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_name_length != 0)
+    {
+      visitor->visit_file_class(pfs);
+    }
+  }
+}
+
+void PFS_instance_iterator::visit_all_file_instances(PFS_instance_visitor *visitor)
+{
+  PFS_file *pfs= file_array;
+  PFS_file *pfs_last= pfs + file_max;
+  for ( ; pfs < pfs_last; pfs++)
+  {
+    if (pfs->m_lock.is_populated())
+    {
+      visitor->visit_file(pfs);
+    }
+  }
+}
+
+/** Instance iterator */
+
+void PFS_instance_iterator::visit_mutex_instances(PFS_mutex_class *klass,
+                                                  PFS_instance_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_mutex_class(klass);
+
+  if (klass->is_singleton())
+  {
+    PFS_mutex *pfs= sanitize_mutex(klass->m_singleton);
+    if (likely(pfs != NULL))
+    {
+      if (likely(pfs->m_lock.is_populated()))
+      {
+        visitor->visit_mutex(pfs);
+      }
+    }
+  }
+  else
+  {
+    PFS_mutex *pfs= mutex_array;
+    PFS_mutex *pfs_last= pfs + mutex_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if ((pfs->m_class == klass) && pfs->m_lock.is_populated())
+      {
+        visitor->visit_mutex(pfs);
+      }
+    }
+  }
+}
+
+void PFS_instance_iterator::visit_rwlock_instances(PFS_rwlock_class *klass,
+                                                   PFS_instance_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_rwlock_class(klass);
+
+  if (klass->is_singleton())
+  {
+    PFS_rwlock *pfs= sanitize_rwlock(klass->m_singleton);
+    if (likely(pfs != NULL))
+    {
+      if (likely(pfs->m_lock.is_populated()))
+      {
+        visitor->visit_rwlock(pfs);
+      }
+    }
+  }
+  else
+  {
+    PFS_rwlock *pfs= rwlock_array;
+    PFS_rwlock *pfs_last= pfs + rwlock_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if ((pfs->m_class == klass) && pfs->m_lock.is_populated())
+      {
+        visitor->visit_rwlock(pfs);
+      }
+    }
+  }
+}
+
+void PFS_instance_iterator::visit_cond_instances(PFS_cond_class *klass,
+                                                 PFS_instance_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_cond_class(klass);
+
+  if (klass->is_singleton())
+  {
+    PFS_cond *pfs= sanitize_cond(klass->m_singleton);
+    if (likely(pfs != NULL))
+    {
+      if (likely(pfs->m_lock.is_populated()))
+      {
+        visitor->visit_cond(pfs);
+      }
+    }
+  }
+  else
+  {
+    PFS_cond *pfs= cond_array;
+    PFS_cond *pfs_last= pfs + cond_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if ((pfs->m_class == klass) && pfs->m_lock.is_populated())
+      {
+        visitor->visit_cond(pfs);
+      }
+    }
+  }
+}
+
+void PFS_instance_iterator::visit_file_instances(PFS_file_class *klass,
+                                                 PFS_instance_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_file_class(klass);
+
+  if (klass->is_singleton())
+  {
+    PFS_file *pfs= sanitize_file(klass->m_singleton);
+    if (likely(pfs != NULL))
+    {
+      if (likely(pfs->m_lock.is_populated()))
+      {
+        visitor->visit_file(pfs);
+      }
+    }
+  }
+  else
+  {
+    PFS_file *pfs= file_array;
+    PFS_file *pfs_last= pfs + file_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if ((pfs->m_class == klass) && pfs->m_lock.is_populated())
+      {
+        visitor->visit_file(pfs);
+      }
+    }
+  }
+}
+
+/** Socket instance iterator visting a socket class and all instances */
+
+void PFS_instance_iterator::visit_socket_instances(PFS_socket_class *klass,
+                                                   PFS_instance_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_socket_class(klass);
+
+  if (klass->is_singleton())
+  {
+    PFS_socket *pfs= sanitize_socket(klass->m_singleton);
+    if (likely(pfs != NULL))
+    {
+      if (likely(pfs->m_lock.is_populated()))
+      {
+        visitor->visit_socket(pfs);
+      }
+    }
+  }
+  else
+  {
+    PFS_socket *pfs= socket_array;
+    PFS_socket *pfs_last= pfs + socket_max;
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if ((pfs->m_class == klass) && pfs->m_lock.is_populated())
+      {
+        visitor->visit_socket(pfs);
+      }
+    }
+  }
+}
+
+/** Socket instance iterator visting sockets owned by PFS_thread. */
+
+void PFS_instance_iterator::visit_socket_instances(PFS_socket_class *klass,
+                                                   PFS_instance_visitor *visitor,
+                                                   PFS_thread *thread,
+                                                   bool visit_class)
+{
+  DBUG_ASSERT(visitor != NULL);
+  DBUG_ASSERT(thread != NULL);
+
+  if (visit_class)
+    visitor->visit_socket_class(klass);
+
+  if (klass->is_singleton())
+  {
+    PFS_socket *pfs= sanitize_socket(klass->m_singleton);
+    if (likely(pfs != NULL))
+    {
+      if (unlikely(pfs->m_thread_owner == thread))
+        visitor->visit_socket(pfs);
+    }
+  }
+  else
+  {
+    /* Get current socket stats from each socket instance owned by this thread */
+    PFS_socket *pfs= socket_array;
+    PFS_socket *pfs_last= pfs + socket_max;
+
+    for ( ; pfs < pfs_last; pfs++)
+    {
+      if (unlikely((pfs->m_class == klass) &&
+                   (pfs->m_thread_owner == thread)))
+      {
+        visitor->visit_socket(pfs);
+      }
+    }
+  }
+}
+
+/** Generic instance iterator with PFS_thread as matching criteria */
+
+void PFS_instance_iterator::visit_instances(PFS_instr_class *klass,
+                                            PFS_instance_visitor *visitor,
+                                            PFS_thread *thread,
+                                            bool visit_class)
+{
+  DBUG_ASSERT(visitor != NULL);
+  DBUG_ASSERT(klass != NULL);
+
+  switch (klass->m_type)
+  {
+  case PFS_CLASS_SOCKET:
+    {
+    PFS_socket_class *socket_class= reinterpret_cast<PFS_socket_class*>(klass);
+    PFS_instance_iterator::visit_socket_instances(socket_class, visitor,
+                                                  thread, visit_class);
+    }
+    break;
+  default:
+    break;
+  }
+}
+
+/** Object iterator */
+void PFS_object_iterator::visit_all(PFS_object_visitor *visitor)
+{
+  visit_all_tables(visitor);
+}
+
+void PFS_object_iterator::visit_all_tables(PFS_object_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_global();
+
+  /* For all the table shares ... */
+  PFS_table_share *share= table_share_array;
+  PFS_table_share *share_last= table_share_array + table_share_max;
+  for ( ; share < share_last; share++)
+  {
+    if (share->m_lock.is_populated())
+    {
+      visitor->visit_table_share(share);
+    }
+  }
+
+  /* For all the table handles ... */
+  PFS_table *table= table_array;
+  PFS_table *table_last= table_array + table_max;
+  for ( ; table < table_last; table++)
+  {
+    if (table->m_lock.is_populated())
+    {
+      visitor->visit_table(table);
+    }
+  }
+}
+
+void PFS_object_iterator::visit_tables(PFS_table_share *share,
+                                       PFS_object_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_table_share(share);
+
+  /* For all the table handles ... */
+  PFS_table *table= table_array;
+  PFS_table *table_last= table_array + table_max;
+  for ( ; table < table_last; table++)
+  {
+    if ((table->m_share == share) && table->m_lock.is_populated())
+    {
+      visitor->visit_table(table);
+    }
+  }
+}
+
+void PFS_object_iterator::visit_table_indexes(PFS_table_share *share,
+                                              uint index,
+                                              PFS_object_visitor *visitor)
+{
+  DBUG_ASSERT(visitor != NULL);
+
+  visitor->visit_table_share_index(share, index);
+
+  /* For all the table handles ... */
+  PFS_table *table= table_array;
+  PFS_table *table_last= table_array + table_max;
+  for ( ; table < table_last; table++)
+  {
+    if ((table->m_share == share) && table->m_lock.is_populated())
+    {
+      visitor->visit_table_index(table, index);
+    }
+  }
+}
+
+/** Connection wait visitor */
+
+PFS_connection_wait_visitor
+::PFS_connection_wait_visitor(PFS_instr_class *klass)
+{
+  m_index= klass->m_event_name_index;
+}
+
+PFS_connection_wait_visitor::~PFS_connection_wait_visitor()
+{}
+
+void PFS_connection_wait_visitor::visit_global()
+{
+  /*
+    This visitor is used only for idle instruments.
+    For waits, do not sum by connection but by instances,
+    it is more efficient.
+  */
+  DBUG_ASSERT(m_index == global_idle_class.m_event_name_index);
+  m_stat.aggregate(& global_instr_class_waits_array[m_index]);
+}
+
+void PFS_connection_wait_visitor::visit_host(PFS_host *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_waits_stats[m_index]);
+}
+
+void PFS_connection_wait_visitor::visit_user(PFS_user *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_waits_stats[m_index]);
+}
+
+void PFS_connection_wait_visitor::visit_account(PFS_account *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_waits_stats[m_index]);
+}
+
+void PFS_connection_wait_visitor::visit_thread(PFS_thread *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_waits_stats[m_index]);
+}
+
+PFS_connection_all_wait_visitor
+::PFS_connection_all_wait_visitor()
+{}
+
+PFS_connection_all_wait_visitor::~PFS_connection_all_wait_visitor()
+{}
+
+void PFS_connection_all_wait_visitor::visit_global()
+{
+  /* Sum by instances, not by connection */
+  DBUG_ASSERT(false);
+}
+
+void PFS_connection_all_wait_visitor::visit_connection_slice(PFS_connection_slice *pfs)
+{
+  PFS_single_stat *stat= pfs->m_instr_class_waits_stats;
+  PFS_single_stat *stat_last= stat + wait_class_max;
+  for ( ; stat < stat_last; stat++)
+  {
+    m_stat.aggregate(stat);
+  }
+}
+
+void PFS_connection_all_wait_visitor::visit_host(PFS_host *pfs)
+{
+  visit_connection_slice(pfs);
+}
+
+void PFS_connection_all_wait_visitor::visit_user(PFS_user *pfs)
+{
+  visit_connection_slice(pfs);
+}
+
+void PFS_connection_all_wait_visitor::visit_account(PFS_account *pfs)
+{
+  visit_connection_slice(pfs);
+}
+
+void PFS_connection_all_wait_visitor::visit_thread(PFS_thread *pfs)
+{
+  visit_connection_slice(pfs);
+}
+
+PFS_connection_stage_visitor::PFS_connection_stage_visitor(PFS_stage_class *klass)
+{
+  m_index= klass->m_event_name_index;
+}
+
+PFS_connection_stage_visitor::~PFS_connection_stage_visitor()
+{}
+
+void PFS_connection_stage_visitor::visit_global()
+{
+  m_stat.aggregate(& global_instr_class_stages_array[m_index]);
+}
+
+void PFS_connection_stage_visitor::visit_host(PFS_host *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_stages_stats[m_index]);
+}
+
+void PFS_connection_stage_visitor::visit_user(PFS_user *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_stages_stats[m_index]);
+}
+
+void PFS_connection_stage_visitor::visit_account(PFS_account *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_stages_stats[m_index]);
+}
+
+void PFS_connection_stage_visitor::visit_thread(PFS_thread *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_stages_stats[m_index]);
+}
+
+PFS_connection_statement_visitor
+::PFS_connection_statement_visitor(PFS_statement_class *klass)
+{
+  m_index= klass->m_event_name_index;
+}
+
+PFS_connection_statement_visitor::~PFS_connection_statement_visitor()
+{}
+
+void PFS_connection_statement_visitor::visit_global()
+{
+  m_stat.aggregate(& global_instr_class_statements_array[m_index]);
+}
+
+void PFS_connection_statement_visitor::visit_host(PFS_host *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_statements_stats[m_index]);
+}
+
+void PFS_connection_statement_visitor::visit_user(PFS_user *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_statements_stats[m_index]);
+}
+
+void PFS_connection_statement_visitor::visit_account(PFS_account *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_statements_stats[m_index]);
+}
+
+void PFS_connection_statement_visitor::visit_thread(PFS_thread *pfs)
+{
+  m_stat.aggregate(& pfs->m_instr_class_statements_stats[m_index]);
+}
+
+/** Instance wait visitor */
+PFS_connection_all_statement_visitor
+::PFS_connection_all_statement_visitor()
+{}
+
+PFS_connection_all_statement_visitor::~PFS_connection_all_statement_visitor()
+{}
+
+void PFS_connection_all_statement_visitor::visit_global()
+{
+  PFS_statement_stat *stat= global_instr_class_statements_array;
+  PFS_statement_stat *stat_last= stat + statement_class_max;
+  for ( ; stat < stat_last; stat++)
+  {
+    m_stat.aggregate(stat);
+  }
+}
+
+void PFS_connection_all_statement_visitor::visit_connection_slice(PFS_connection_slice *pfs)
+{
+  PFS_statement_stat *stat= pfs->m_instr_class_statements_stats;
+  PFS_statement_stat *stat_last= stat + statement_class_max;
+  for ( ; stat < stat_last; stat++)
+  {
+    m_stat.aggregate(stat);
+  }
+}
+
+void PFS_connection_all_statement_visitor::visit_host(PFS_host *pfs)
+{
+  visit_connection_slice(pfs);
+}
+
+void PFS_connection_all_statement_visitor::visit_user(PFS_user *pfs)
+{
+  visit_connection_slice(pfs);
+}
+
+void PFS_connection_all_statement_visitor::visit_account(PFS_account *pfs)
+{
+  visit_connection_slice(pfs);
+}
+
+void PFS_connection_all_statement_visitor::visit_thread(PFS_thread *pfs)
+{
+  visit_connection_slice(pfs);
+}
+
+PFS_connection_stat_visitor::PFS_connection_stat_visitor()
+{}
+
+PFS_connection_stat_visitor::~PFS_connection_stat_visitor()
+{}
+
+void PFS_connection_stat_visitor::visit_global()
+{}
+
+void PFS_connection_stat_visitor::visit_host(PFS_host *pfs)
+{
+  m_stat.aggregate_disconnected(pfs->m_disconnected_count);
+}
+
+void PFS_connection_stat_visitor::visit_user(PFS_user *pfs)
+{
+  m_stat.aggregate_disconnected(pfs->m_disconnected_count);
+}
+
+void PFS_connection_stat_visitor::visit_account(PFS_account *pfs)
+{
+  m_stat.aggregate_disconnected(pfs->m_disconnected_count);
+}
+
+void PFS_connection_stat_visitor::visit_thread(PFS_thread *)
+{
+  m_stat.aggregate_active(1);
+}
+
+PFS_instance_wait_visitor::PFS_instance_wait_visitor()
+{
+}
+
+PFS_instance_wait_visitor::~PFS_instance_wait_visitor()
+{}
+
+void PFS_instance_wait_visitor::visit_mutex_class(PFS_mutex_class *pfs) 
+{
+  uint index= pfs->m_event_name_index;
+  m_stat.aggregate(& global_instr_class_waits_array[index]);
+}
+
+void PFS_instance_wait_visitor::visit_rwlock_class(PFS_rwlock_class *pfs) 
+{
+  uint index= pfs->m_event_name_index;
+  m_stat.aggregate(& global_instr_class_waits_array[index]);
+}
+
+void PFS_instance_wait_visitor::visit_cond_class(PFS_cond_class *pfs) 
+{
+  uint index= pfs->m_event_name_index;
+  m_stat.aggregate(& global_instr_class_waits_array[index]);
+}
+
+void PFS_instance_wait_visitor::visit_file_class(PFS_file_class *pfs) 
+{
+  uint index= pfs->m_event_name_index;
+  m_stat.aggregate(& global_instr_class_waits_array[index]);
+}
+
+void PFS_instance_wait_visitor::visit_socket_class(PFS_socket_class *pfs) 
+{
+  /* Collect global wait stats */
+  uint index= pfs->m_event_name_index;
+  m_stat.aggregate(&global_instr_class_waits_array[index]);
+
+  /* If deferred, then pull wait stats directly from the socket class. */
+  if (pfs->is_deferred())
+    pfs->m_socket_stat.m_io_stat.sum_waits(&m_stat);
+}
+
+void PFS_instance_wait_visitor::visit_mutex(PFS_mutex *pfs) 
+{
+  m_stat.aggregate(& pfs->m_wait_stat);
+}
+
+void PFS_instance_wait_visitor::visit_rwlock(PFS_rwlock *pfs) 
+{
+  m_stat.aggregate(& pfs->m_wait_stat);
+}
+
+void PFS_instance_wait_visitor::visit_cond(PFS_cond *pfs) 
+{
+  m_stat.aggregate(& pfs->m_wait_stat);
+}
+
+void PFS_instance_wait_visitor::visit_file(PFS_file *pfs) 
+{
+  /* Combine per-operation file wait stats before aggregating */
+  PFS_single_stat stat;
+  pfs->m_file_stat.m_io_stat.sum_waits(&stat);
+  m_stat.aggregate(&stat);
+}
+
+void PFS_instance_wait_visitor::visit_socket(PFS_socket *pfs) 
+{
+  /* Combine per-operation socket wait stats before aggregating */
+  PFS_single_stat stat;
+  pfs->m_socket_stat.m_io_stat.sum_waits(&stat);
+  m_stat.aggregate(&stat);
+}
+
+/** Table IO wait visitor */
+
+PFS_object_wait_visitor::PFS_object_wait_visitor()
+{}
+
+PFS_object_wait_visitor::~PFS_object_wait_visitor()
+{}
+
+void PFS_object_wait_visitor::visit_global()
+{
+  uint index;
+
+  index= global_table_io_class.m_event_name_index;
+  m_stat.aggregate(& global_instr_class_waits_array[index]);
+
+  index= global_table_lock_class.m_event_name_index;
+  m_stat.aggregate(& global_instr_class_waits_array[index]);
+}
+
+void PFS_object_wait_visitor::visit_table_share(PFS_table_share *pfs)
+{
+  pfs->m_table_stat.sum(& m_stat);
+}
+
+void PFS_object_wait_visitor::visit_table(PFS_table *pfs)
+{
+  pfs->m_table_stat.sum(& m_stat);
+}
+
+PFS_table_io_wait_visitor::PFS_table_io_wait_visitor()
+{}
+
+PFS_table_io_wait_visitor::~PFS_table_io_wait_visitor()
+{}
+
+void PFS_table_io_wait_visitor::visit_global()
+{
+  uint index= global_table_io_class.m_event_name_index;
+  m_stat.aggregate(& global_instr_class_waits_array[index]);
+}
+
+void PFS_table_io_wait_visitor::visit_table_share(PFS_table_share *pfs)
+{
+  PFS_table_io_stat io_stat;
+  uint index;
+
+  /* Aggregate index stats */
+  for (index= 0; index < pfs->m_key_count; index++)
+    io_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]);
+
+  /* Aggregate global stats */
+  io_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_KEY]);
+
+  io_stat.sum(& m_stat);
+}
+
+void PFS_table_io_wait_visitor::visit_table(PFS_table *pfs)
+{
+  PFS_table_share *safe_share= sanitize_table_share(pfs->m_share);
+
+  if (likely(safe_share != NULL))
+  {
+    PFS_table_io_stat io_stat;
+    uint index;
+
+    /* Aggregate index stats */
+    for (index= 0; index < safe_share->m_key_count; index++)
+      io_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]);
+
+    /* Aggregate global stats */
+    io_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_KEY]);
+
+    io_stat.sum(& m_stat);
+  }
+}
+
+/** Table IO stat visitor */
+
+PFS_table_io_stat_visitor::PFS_table_io_stat_visitor()
+{}
+
+PFS_table_io_stat_visitor::~PFS_table_io_stat_visitor()
+{}
+
+void PFS_table_io_stat_visitor::visit_table_share(PFS_table_share *pfs)
+{
+  uint index;
+
+  /* Aggregate index stats */
+  for (index= 0; index < pfs->m_key_count; index++)
+    m_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]);
+
+  /* Aggregate global stats */
+  m_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_KEY]);
+}
+
+void PFS_table_io_stat_visitor::visit_table(PFS_table *pfs)
+{
+  PFS_table_share *safe_share= sanitize_table_share(pfs->m_share);
+
+  if (likely(safe_share != NULL))
+  {
+    uint index;
+
+    /* Aggregate index stats */
+    for (index= 0; index < safe_share->m_key_count; index++)
+      m_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]);
+
+    /* Aggregate global stats */
+    m_stat.aggregate(& pfs->m_table_stat.m_index_stat[MAX_KEY]);
+  }
+}
+
+/** Index IO stat visitor */
+
+PFS_index_io_stat_visitor::PFS_index_io_stat_visitor()
+{}
+
+PFS_index_io_stat_visitor::~PFS_index_io_stat_visitor()
+{}
+
+void PFS_index_io_stat_visitor::visit_table_share_index(PFS_table_share *pfs, uint index)
+{
+  m_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]);
+}
+
+void PFS_index_io_stat_visitor::visit_table_index(PFS_table *pfs, uint index)
+{
+  m_stat.aggregate(& pfs->m_table_stat.m_index_stat[index]);
+}
+
+/** Table lock wait visitor */
+
+PFS_table_lock_wait_visitor::PFS_table_lock_wait_visitor()
+{}
+
+PFS_table_lock_wait_visitor::~PFS_table_lock_wait_visitor()
+{}
+
+void PFS_table_lock_wait_visitor::visit_global()
+{
+  uint index= global_table_lock_class.m_event_name_index;
+  m_stat.aggregate(& global_instr_class_waits_array[index]);
+}
+
+void PFS_table_lock_wait_visitor::visit_table_share(PFS_table_share *pfs)
+{
+  pfs->m_table_stat.sum_lock(& m_stat);
+}
+
+void PFS_table_lock_wait_visitor::visit_table(PFS_table *pfs)
+{
+  pfs->m_table_stat.sum_lock(& m_stat);
+}
+
+/** Table lock stat visitor */
+
+PFS_table_lock_stat_visitor::PFS_table_lock_stat_visitor()
+{}
+
+PFS_table_lock_stat_visitor::~PFS_table_lock_stat_visitor()
+{}
+
+void PFS_table_lock_stat_visitor::visit_table_share(PFS_table_share *pfs)
+{
+  m_stat.aggregate(& pfs->m_table_stat.m_lock_stat);
+}
+
+void PFS_table_lock_stat_visitor::visit_table(PFS_table *pfs)
+{
+  m_stat.aggregate(& pfs->m_table_stat.m_lock_stat);
+}
+
+PFS_instance_socket_io_stat_visitor::PFS_instance_socket_io_stat_visitor()
+{}
+
+PFS_instance_socket_io_stat_visitor::~PFS_instance_socket_io_stat_visitor()
+{}
+
+void PFS_instance_socket_io_stat_visitor::visit_socket_class(PFS_socket_class *pfs) 
+{
+  /* Aggregate wait times, event counts and byte counts */
+  m_socket_io_stat.aggregate(&pfs->m_socket_stat.m_io_stat);
+}
+
+void PFS_instance_socket_io_stat_visitor::visit_socket(PFS_socket *pfs) 
+{
+  /* Aggregate wait times, event counts and byte counts */
+  m_socket_io_stat.aggregate(&pfs->m_socket_stat.m_io_stat);
+}
+
+
+PFS_instance_file_io_stat_visitor::PFS_instance_file_io_stat_visitor()
+{}
+
+PFS_instance_file_io_stat_visitor::~PFS_instance_file_io_stat_visitor()
+{}
+
+void PFS_instance_file_io_stat_visitor::visit_file_class(PFS_file_class *pfs) 
+{
+  /* Aggregate wait times, event counts and byte counts */
+  m_file_io_stat.aggregate(&pfs->m_file_stat.m_io_stat);
+}
+
+void PFS_instance_file_io_stat_visitor::visit_file(PFS_file *pfs) 
+{
+  /* Aggregate wait times, event counts and byte counts */
+  m_file_io_stat.aggregate(&pfs->m_file_stat.m_io_stat);
+}
+/** @} */
diff --git a/storage/perfschema/pfs_visitor.h b/storage/perfschema/pfs_visitor.h
new file mode 100644
index 00000000000..4ec63d00636
--- /dev/null
+++ b/storage/perfschema/pfs_visitor.h
@@ -0,0 +1,568 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef PFS_VISITOR_H
+#define PFS_VISITOR_H
+
+#include "pfs_stat.h"
+
+/**
+  @file storage/perfschema/pfs_visitor.h
+  Visitors (declarations).
+*/
+
+/**
+  @addtogroup Performance_schema_buffers
+  @{
+*/
+
+struct PFS_user;
+struct PFS_account;
+struct PFS_host;
+struct PFS_thread;
+struct PFS_instr_class;
+struct PFS_mutex_class;
+struct PFS_rwlock_class;
+struct PFS_cond_class;
+struct PFS_file_class;
+struct PFS_socket_class;
+struct PFS_table_share;
+struct PFS_mutex;
+struct PFS_rwlock;
+struct PFS_cond;
+struct PFS_file;
+struct PFS_table;
+struct PFS_stage_class;
+struct PFS_statement_class;
+struct PFS_socket;
+struct PFS_connection_slice;
+
+/**
+  Interface class to visit groups of connections.
+  @sa PFS_connection_iterator
+*/
+class PFS_connection_visitor
+{
+public:
+  PFS_connection_visitor() {}
+  virtual ~PFS_connection_visitor() {}
+  /** Visit all connections. */
+  virtual void visit_global() {}
+  /** Visit all connections of a host. */
+  virtual void visit_host(PFS_host *pfs) {}
+  /** Visit all connections of a user+host. */
+  virtual void visit_account(PFS_account *pfs) {}
+  /** Visit all connections of a user. */
+  virtual void visit_user(PFS_user *pfs) {}
+  /** Visit a thread. */
+  virtual void visit_thread(PFS_thread *pfs) {}
+};
+
+/**
+  Iterator over groups of connections.
+  @sa PFS_connection_visitor
+*/
+class PFS_connection_iterator
+{
+public:
+  /**
+    Visit all connections.
+    @param with_hosts when true, visit also all hosts.
+    @param with_users when true, visit also all users.
+    @param with_accounts when true, visit also all user+host.
+    @param with_threads when true, visit also all threads.
+    @param visitor the visitor to call
+  */
+  static void visit_global(bool with_hosts, bool with_users,
+                           bool with_accounts, bool with_threads,
+                           PFS_connection_visitor *visitor);
+  /**
+    Visit all connections of a host.
+    @param host the host to visit.
+    @param with_accounts when true, visit also all related user+host.
+    @param with_threads when true, visit also all related threads.
+    @param visitor the visitor to call
+  */
+  static void visit_host(PFS_host *host, bool with_accounts, bool with_threads,
+                         PFS_connection_visitor *visitor);
+  /**
+    Visit all connections of a user.
+    @param user the user to visit.
+    @param with_accounts when true, visit also all related user+host.
+    @param with_threads when true, visit also all related threads.
+    @param visitor the visitor to call
+  */
+  static void visit_user(PFS_user *user, bool with_accounts, bool with_threads,
+                         PFS_connection_visitor *visitor);
+  /**
+    Visit all connections of a user+host.
+    @param account the user+host to visit.
+    @param with_threads when true, visit also all related threads.
+    @param visitor the visitor to call
+  */
+  static void visit_account(PFS_account *account, bool with_threads,
+                              PFS_connection_visitor *visitor);
+  /**
+    Visit a thread or connection.
+    @param thread the thread to visit.
+    @param visitor the visitor to call
+  */
+  static inline void visit_thread(PFS_thread *thread,
+                                  PFS_connection_visitor *visitor)
+  { visitor->visit_thread(thread); }
+};
+
+/**
+  Interface class to visit groups of instrumentation point instances.
+  @sa PFS_instance_iterator
+*/
+class PFS_instance_visitor
+{
+public:
+  PFS_instance_visitor() {}
+  virtual ~PFS_instance_visitor() {}
+  /** Visit a mutex class. */
+  virtual void visit_mutex_class(PFS_mutex_class *pfs) {}
+  /** Visit a rwlock class. */
+  virtual void visit_rwlock_class(PFS_rwlock_class *pfs) {}
+  /** Visit a cond class. */
+  virtual void visit_cond_class(PFS_cond_class *pfs) {}
+  /** Visit a file class. */
+  virtual void visit_file_class(PFS_file_class *pfs) {}
+  /** Visit a socket class. */
+  virtual void visit_socket_class(PFS_socket_class *pfs) {}
+  /** Visit a mutex instance. */
+  virtual void visit_mutex(PFS_mutex *pfs) {}
+  /** Visit a rwlock instance. */
+  virtual void visit_rwlock(PFS_rwlock *pfs) {}
+  /** Visit a cond instance. */
+  virtual void visit_cond(PFS_cond *pfs) {}
+  /** Visit a file instance. */
+  virtual void visit_file(PFS_file *pfs) {}
+  /** Visit a socket instance. */
+  virtual void visit_socket(PFS_socket *pfs) {}
+};
+
+/**
+  Iterator over groups of instrumentation point instances.
+  @sa PFS_instance_visitor
+*/
+class PFS_instance_iterator
+{
+public:
+  static void visit_all(PFS_instance_visitor *visitor);
+  static void visit_all_mutex(PFS_instance_visitor *visitor);
+  static void visit_all_mutex_classes(PFS_instance_visitor *visitor);
+  static void visit_all_mutex_instances(PFS_instance_visitor *visitor);
+  static void visit_all_rwlock(PFS_instance_visitor *visitor);
+  static void visit_all_rwlock_classes(PFS_instance_visitor *visitor);
+  static void visit_all_rwlock_instances(PFS_instance_visitor *visitor);
+  static void visit_all_cond(PFS_instance_visitor *visitor);
+  static void visit_all_cond_classes(PFS_instance_visitor *visitor);
+  static void visit_all_cond_instances(PFS_instance_visitor *visitor);
+  static void visit_all_file(PFS_instance_visitor *visitor);
+  static void visit_all_file_classes(PFS_instance_visitor *visitor);
+  static void visit_all_file_instances(PFS_instance_visitor *visitor);
+
+  /**
+    Visit a mutex class and related instances.
+    @param klass the klass to visit.
+    @param visitor the visitor to call
+  */
+  static void visit_mutex_instances(PFS_mutex_class *klass,
+                                    PFS_instance_visitor *visitor);
+  /**
+    Visit a rwlock class and related instances.
+    @param klass the klass to visit.
+    @param visitor the visitor to call
+  */
+  static void visit_rwlock_instances(PFS_rwlock_class *klass,
+                                     PFS_instance_visitor *visitor);
+  /**
+    Visit a cond class and related instances.
+    @param klass the klass to visit.
+    @param visitor the visitor to call
+  */
+  static void visit_cond_instances(PFS_cond_class *klass,
+                                   PFS_instance_visitor *visitor);
+  /**
+    Visit a file class and related instances.
+    @param klass the klass to visit.
+    @param visitor the visitor to call
+  */
+  static void visit_file_instances(PFS_file_class *klass,
+                                   PFS_instance_visitor *visitor);
+  /**
+    Visit a socket class and related instances.
+    @param klass the klass to visit.
+    @param visitor the visitor to call
+  */
+  static void visit_socket_instances(PFS_socket_class *klass,
+                                     PFS_instance_visitor *visitor);
+  /**
+    Visit a socket class and related instances.
+    @param klass the klass to visit.
+    @param visitor the visitor to call
+    @param thread the owning thread to match
+    @param visit_class if true then visit the socket class
+  */
+  static void visit_socket_instances(PFS_socket_class *klass,
+                                     PFS_instance_visitor *visitor,
+                                     PFS_thread *thread,
+                                     bool visit_class= true);
+  /**
+    Visit an instrument class and related instances.
+    @param klass the klass to visit.
+    @param visitor the visitor to call
+    @param thread comparison criteria
+    @param visit_class if true then visit the class
+  */
+  static void visit_instances(PFS_instr_class *klass,
+                              PFS_instance_visitor *visitor,
+                              PFS_thread *thread,
+                              bool visit_class= true);
+};
+
+/**
+  Interface class to visit groups of SQL objects.
+  @sa PFS_object_iterator
+*/
+class PFS_object_visitor
+{
+public:
+  PFS_object_visitor() {}
+  virtual ~PFS_object_visitor() {}
+  /** Visit global data. */
+  virtual void visit_global() {}
+  /** Visit a table share. */
+  virtual void visit_table_share(PFS_table_share *pfs) {}
+  /** Visit a table share index. */
+  virtual void visit_table_share_index(PFS_table_share *pfs, uint index) {}
+  /** Visit a table. */
+  virtual void visit_table(PFS_table *pfs) {}
+  /** Visit a table index. */
+  virtual void visit_table_index(PFS_table *pfs, uint index) {}
+};
+
+/**
+  Iterator over groups of SQL objects.
+  @sa PFS_object_visitor
+*/
+class PFS_object_iterator
+{
+public:
+  /** Visit all objects. */
+  static void visit_all(PFS_object_visitor *visitor);
+  /** Visit all tables and related handles. */
+  static void visit_all_tables(PFS_object_visitor *visitor);
+  /** Visit a table and related table handles. */
+  static void visit_tables(PFS_table_share *share,
+                           PFS_object_visitor *visitor);
+  /** Visit a table index and related table handles indexes. */
+  static void visit_table_indexes(PFS_table_share *share,
+                                  uint index,
+                                  PFS_object_visitor *visitor);
+};
+
+/**
+  A concrete connection visitor that aggregates
+  wait statistics for a given event_name.
+*/
+class PFS_connection_wait_visitor : public PFS_connection_visitor
+{
+public:
+  /** Constructor. */
+  PFS_connection_wait_visitor(PFS_instr_class *klass);
+  virtual ~PFS_connection_wait_visitor();
+  virtual void visit_global();
+  virtual void visit_host(PFS_host *pfs);
+  virtual void visit_account(PFS_account *pfs);
+  virtual void visit_user(PFS_user *pfs);
+  virtual void visit_thread(PFS_thread *pfs);
+
+  /** EVENT_NAME instrument index. */
+  uint m_index;
+  /** Wait statistic collected. */
+  PFS_single_stat m_stat;
+};
+
+/**
+  A concrete connection visitor that aggregates
+  wait statistics for all events.
+*/
+class PFS_connection_all_wait_visitor : public PFS_connection_visitor
+{
+public:
+  /** Constructor. */
+  PFS_connection_all_wait_visitor();
+  virtual ~PFS_connection_all_wait_visitor();
+  virtual void visit_global();
+  virtual void visit_host(PFS_host *pfs);
+  virtual void visit_account(PFS_account *pfs);
+  virtual void visit_user(PFS_user *pfs);
+  virtual void visit_thread(PFS_thread *pfs);
+
+  /** Wait statistic collected. */
+  PFS_single_stat m_stat;
+
+private:
+  void visit_connection_slice(PFS_connection_slice *pfs);
+};
+
+/**
+  A concrete connection visitor that aggregates
+  stage statistics.
+*/
+class PFS_connection_stage_visitor : public PFS_connection_visitor
+{
+public:
+  /** Constructor. */
+  PFS_connection_stage_visitor(PFS_stage_class *klass);
+  virtual ~PFS_connection_stage_visitor();
+  virtual void visit_global();
+  virtual void visit_host(PFS_host *pfs);
+  virtual void visit_account(PFS_account *pfs);
+  virtual void visit_user(PFS_user *pfs);
+  virtual void visit_thread(PFS_thread *pfs);
+
+  /** EVENT_NAME instrument index. */
+  uint m_index;
+  /** Stage statistic collected. */
+  PFS_stage_stat m_stat;
+};
+
+/**
+  A concrete connection visitor that aggregates
+  statement statistics for a given event_name.
+*/
+class PFS_connection_statement_visitor : public PFS_connection_visitor
+{
+public:
+  /** Constructor. */
+  PFS_connection_statement_visitor(PFS_statement_class *klass);
+  virtual ~PFS_connection_statement_visitor();
+  virtual void visit_global();
+  virtual void visit_host(PFS_host *pfs);
+  virtual void visit_account(PFS_account *pfs);
+  virtual void visit_user(PFS_user *pfs);
+  virtual void visit_thread(PFS_thread *pfs);
+
+  /** EVENT_NAME instrument index. */
+  uint m_index;
+  /** Statement statistic collected. */
+  PFS_statement_stat m_stat;
+};
+
+/**
+  A concrete connection visitor that aggregates
+  statement statistics for all events.
+*/
+class PFS_connection_all_statement_visitor : public PFS_connection_visitor
+{
+public:
+  /** Constructor. */
+  PFS_connection_all_statement_visitor();
+  virtual ~PFS_connection_all_statement_visitor();
+  virtual void visit_global();
+  virtual void visit_host(PFS_host *pfs);
+  virtual void visit_account(PFS_account *pfs);
+  virtual void visit_user(PFS_user *pfs);
+  virtual void visit_thread(PFS_thread *pfs);
+
+  /** Statement statistic collected. */
+  PFS_statement_stat m_stat;
+
+private:
+  void visit_connection_slice(PFS_connection_slice *pfs);
+};
+
+/**
+  A concrete connection visitor that aggregates
+  connection statistics.
+*/
+class PFS_connection_stat_visitor : public PFS_connection_visitor
+{
+public:
+  /** Constructor. */
+  PFS_connection_stat_visitor();
+  virtual ~PFS_connection_stat_visitor();
+  virtual void visit_global();
+  virtual void visit_host(PFS_host *pfs);
+  virtual void visit_account(PFS_account *pfs);
+  virtual void visit_user(PFS_user *pfs);
+  virtual void visit_thread(PFS_thread *pfs);
+
+  /** Connection statistic collected. */
+  PFS_connection_stat m_stat;
+};
+
+/**
+  A concrete instance visitor that aggregates
+  wait statistics.
+*/
+class PFS_instance_wait_visitor : public PFS_instance_visitor
+{
+public:
+  PFS_instance_wait_visitor();
+  virtual ~PFS_instance_wait_visitor();
+  virtual void visit_mutex_class(PFS_mutex_class *pfs);
+  virtual void visit_rwlock_class(PFS_rwlock_class *pfs);
+  virtual void visit_cond_class(PFS_cond_class *pfs);
+  virtual void visit_file_class(PFS_file_class *pfs);
+  virtual void visit_socket_class(PFS_socket_class *pfs);
+  virtual void visit_mutex(PFS_mutex *pfs);
+  virtual void visit_rwlock(PFS_rwlock *pfs);
+  virtual void visit_cond(PFS_cond *pfs);
+  virtual void visit_file(PFS_file *pfs);
+  virtual void visit_socket(PFS_socket *pfs);
+
+  /** Wait statistic collected. */
+  PFS_single_stat m_stat;
+};
+
+/**
+  A concrete object visitor that aggregates
+  object wait statistics.
+*/
+class PFS_object_wait_visitor : public PFS_object_visitor
+{
+public:
+  PFS_object_wait_visitor();
+  virtual ~PFS_object_wait_visitor();
+  virtual void visit_global();
+  virtual void visit_table_share(PFS_table_share *pfs);
+  virtual void visit_table(PFS_table *pfs);
+
+  /** Object wait statistic collected. */
+  PFS_single_stat m_stat;
+};
+
+/**
+  A concrete object visitor that aggregates
+  table io wait statistics.
+*/
+class PFS_table_io_wait_visitor : public PFS_object_visitor
+{
+public:
+  PFS_table_io_wait_visitor();
+  virtual ~PFS_table_io_wait_visitor();
+  virtual void visit_global();
+  virtual void visit_table_share(PFS_table_share *pfs);
+  virtual void visit_table(PFS_table *pfs);
+
+  /** Table io wait statistic collected. */
+  PFS_single_stat m_stat;
+};
+
+/**
+  A concrete object visitor that aggregates
+  table io statistics.
+*/
+class PFS_table_io_stat_visitor : public PFS_object_visitor
+{
+public:
+  PFS_table_io_stat_visitor();
+  virtual ~PFS_table_io_stat_visitor();
+  virtual void visit_table_share(PFS_table_share *pfs);
+  virtual void visit_table(PFS_table *pfs);
+
+  /** Table io statistic collected. */
+  PFS_table_io_stat m_stat;
+};
+
+/**
+  A concrete object visitor that aggregates
+  index io statistics.
+*/
+class PFS_index_io_stat_visitor : public PFS_object_visitor
+{
+public:
+  PFS_index_io_stat_visitor();
+  virtual ~PFS_index_io_stat_visitor();
+  virtual void visit_table_share_index(PFS_table_share *pfs, uint index);
+  virtual void visit_table_index(PFS_table *pfs, uint index);
+
+  /** Index io statistic collected. */
+  PFS_table_io_stat m_stat;
+};
+
+/**
+  A concrete object visitor that aggregates
+  table lock wait statistics.
+*/
+class PFS_table_lock_wait_visitor : public PFS_object_visitor
+{
+public:
+  PFS_table_lock_wait_visitor();
+  virtual ~PFS_table_lock_wait_visitor();
+  virtual void visit_global();
+  virtual void visit_table_share(PFS_table_share *pfs);
+  virtual void visit_table(PFS_table *pfs);
+
+  /** Table lock wait statistic collected. */
+  PFS_single_stat m_stat;
+};
+
+/**
+  A concrete object visitor that aggregates
+  table lock statistics.
+*/
+class PFS_table_lock_stat_visitor : public PFS_object_visitor
+{
+public:
+  PFS_table_lock_stat_visitor();
+  virtual ~PFS_table_lock_stat_visitor();
+  virtual void visit_table_share(PFS_table_share *pfs);
+  virtual void visit_table(PFS_table *pfs);
+
+  /** Table lock statistic collected. */
+  PFS_table_lock_stat m_stat;
+};
+
+/**
+  A concrete instance visitor that aggregates
+  socket wait and byte count statistics.
+*/
+class PFS_instance_socket_io_stat_visitor : public PFS_instance_visitor
+{
+public:
+  PFS_instance_socket_io_stat_visitor();
+  virtual ~PFS_instance_socket_io_stat_visitor();
+  virtual void visit_socket_class(PFS_socket_class *pfs);
+  virtual void visit_socket(PFS_socket *pfs);
+
+  /** Wait and byte count statistics collected. */
+  PFS_socket_io_stat m_socket_io_stat;
+};
+
+/**
+  A concrete instance visitor that aggregates
+  file wait and byte count statistics.
+*/
+class PFS_instance_file_io_stat_visitor : public PFS_instance_visitor
+{
+public:
+  PFS_instance_file_io_stat_visitor();
+  virtual ~PFS_instance_file_io_stat_visitor();
+  virtual void visit_file_class(PFS_file_class *pfs);
+  virtual void visit_file(PFS_file *pfs);
+
+  /** Wait and byte count statistics collected. */
+  PFS_file_io_stat m_file_io_stat;
+};
+
+/** @} */
+#endif
+
diff --git a/storage/perfschema/table_accounts.cc b/storage/perfschema/table_accounts.cc
new file mode 100644
index 00000000000..92a5d264a44
--- /dev/null
+++ b/storage/perfschema/table_accounts.cc
@@ -0,0 +1,148 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "table_accounts.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_accounts::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("USER") },
+    { C_STRING_WITH_LEN("char(16)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("HOST") },
+    { C_STRING_WITH_LEN("char(60)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("CURRENT_CONNECTIONS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("TOTAL_CONNECTIONS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_accounts::m_field_def=
+{ 4, field_types };
+
+PFS_engine_table_share
+table_accounts::m_share=
+{
+  { C_STRING_WITH_LEN("accounts") },
+  &pfs_truncatable_acl,
+  &table_accounts::create,
+  NULL, /* write_row */
+  table_accounts::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table* table_accounts::create()
+{
+  return new table_accounts();
+}
+
+int
+table_accounts::delete_all_rows(void)
+{
+  reset_events_waits_by_thread();
+  reset_events_waits_by_account();
+  reset_events_stages_by_thread();
+  reset_events_stages_by_account();
+  reset_events_statements_by_thread();
+  reset_events_statements_by_account();
+  purge_all_account();
+  return 0;
+}
+
+table_accounts::table_accounts()
+  : cursor_by_account(& m_share),
+  m_row_exists(false)
+{}
+
+void table_accounts::make_row(PFS_account *pfs)
+{
+  pfs_lock lock;
+
+  m_row_exists= false;
+  pfs->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_account.make_row(pfs))
+    return;
+
+  PFS_connection_stat_visitor visitor;
+  PFS_connection_iterator::visit_account(pfs, true, & visitor);
+
+  if (! pfs->m_lock.end_optimistic_lock(& lock))
+    return;
+
+  m_row.m_connection_stat.set(& visitor.m_stat);
+  m_row_exists= true;
+}
+
+int table_accounts::read_row_values(TABLE *table,
+                                      unsigned char *buf,
+                                      Field **fields,
+                                      bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+      case 1: /* HOST */
+        m_row.m_account.set_field(f->field_index, f);
+        break;
+      case 2: /* CURRENT_CONNECTIONS */
+      case 3: /* TOTAL_CONNECTIONS */
+        m_row.m_connection_stat.set_field(f->field_index - 2, f);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+  return 0;
+}
+
diff --git a/storage/perfschema/table_accounts.h b/storage/perfschema/table_accounts.h
new file mode 100644
index 00000000000..232cb9d9b36
--- /dev/null
+++ b/storage/perfschema/table_accounts.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_ACCOUNTS_H
+#define TABLE_ACCOUNTS_H
+
+#include "pfs_column_types.h"
+#include "cursor_by_account.h"
+#include "table_helper.h"
+
+struct PFS_account;
+
+/**
+  \addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of PERFORMANCE_SCHEMA.ACCOUNTS.
+*/
+struct row_accounts
+{
+  /** Column USER, HOST. */
+  PFS_account_row m_account;
+  /** Columns CURRENT_CONNECTIONS, TOTAL_CONNECTIONS. */
+  PFS_connection_stat_row m_connection_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.ACCOUNTS. */
+class table_accounts : public cursor_by_account
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  /** Table builder */
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+
+protected:
+  table_accounts();
+
+public:
+  ~table_accounts()
+  {}
+
+private:
+  virtual void make_row(PFS_account *pfs);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_accounts m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_all_instr.cc b/storage/perfschema/table_all_instr.cc
index f29a006107a..ae8988d3a93 100644
--- a/storage/perfschema/table_all_instr.cc
+++ b/storage/perfschema/table_all_instr.cc
@@ -40,6 +40,7 @@ int table_all_instr::rnd_next(void)
   PFS_rwlock *rwlock;
   PFS_cond *cond;
   PFS_file *file;
+  PFS_socket *socket;
 
   for (m_pos.set_at(&m_next_pos);
        m_pos.has_more_view();
@@ -94,6 +95,18 @@ int table_all_instr::rnd_next(void)
         }
       }
       break;
+    case pos_all_instr::VIEW_SOCKET:
+      for ( ; m_pos.m_index_2 < socket_max; m_pos.m_index_2++)
+      {
+        socket= &socket_array[m_pos.m_index_2];
+        if (socket->m_lock.is_populated())
+        {
+          make_socket_row(socket);
+          m_next_pos.set_after(&m_pos);
+          return 0;
+        }
+      }
+      break;
     }
   }
 
@@ -106,6 +119,7 @@ int table_all_instr::rnd_pos(const void *pos)
   PFS_rwlock *rwlock;
   PFS_cond *cond;
   PFS_file *file;
+  PFS_socket *socket;
 
   set_position(pos);
 
@@ -146,115 +160,12 @@ int table_all_instr::rnd_pos(const void *pos)
       return 0;
     }
     break;
-  }
-
-  return HA_ERR_RECORD_DELETED;
-}
-
-table_all_instr_class::table_all_instr_class
-(const PFS_engine_table_share *share)
-  : PFS_engine_table(share, &m_pos),
-    m_pos(), m_next_pos()
-{}
-
-void table_all_instr_class::reset_position(void)
-{
-  m_pos.reset();
-  m_next_pos.reset();
-}
-
-int table_all_instr_class::rnd_next(void)
-{
-  PFS_mutex_class *mutex_class;
-  PFS_rwlock_class *rwlock_class;
-  PFS_cond_class *cond_class;
-  PFS_file_class *file_class;
-
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_view();
-       m_pos.next_view())
-  {
-    switch (m_pos.m_index_1) {
-    case pos_all_instr_class::VIEW_MUTEX:
-      mutex_class= find_mutex_class(m_pos.m_index_2);
-      if (mutex_class)
-      {
-        make_instr_row(mutex_class);
-        m_next_pos.set_after(&m_pos);
-        return 0;
-      }
-      break;
-    case pos_all_instr_class::VIEW_RWLOCK:
-      rwlock_class= find_rwlock_class(m_pos.m_index_2);
-      if (rwlock_class)
-      {
-        make_instr_row(rwlock_class);
-        m_next_pos.set_after(&m_pos);
-        return 0;
-      }
-      break;
-    case pos_all_instr_class::VIEW_COND:
-      cond_class= find_cond_class(m_pos.m_index_2);
-      if (cond_class)
-      {
-        make_instr_row(cond_class);
-        m_next_pos.set_after(&m_pos);
-        return 0;
-      }
-      break;
-    case pos_all_instr_class::VIEW_FILE:
-      file_class= find_file_class(m_pos.m_index_2);
-      if (file_class)
-      {
-        make_instr_row(file_class);
-        m_next_pos.set_after(&m_pos);
-        return 0;
-      }
-      break;
-    }
-  }
-
-  return HA_ERR_END_OF_FILE;
-}
-
-int table_all_instr_class::rnd_pos(const void *pos)
-{
-  PFS_mutex_class *mutex_class;
-  PFS_rwlock_class *rwlock_class;
-  PFS_cond_class *cond_class;
-  PFS_file_class *file_class;
-
-  set_position(pos);
-  switch (m_pos.m_index_1) {
-  case pos_all_instr_class::VIEW_MUTEX:
-    mutex_class= find_mutex_class(m_pos.m_index_2);
-    if (mutex_class)
-    {
-      make_instr_row(mutex_class);
-      return 0;
-    }
-    break;
-  case pos_all_instr_class::VIEW_RWLOCK:
-    rwlock_class= find_rwlock_class(m_pos.m_index_2);
-    if (rwlock_class)
+  case pos_all_instr::VIEW_SOCKET:
+    DBUG_ASSERT(m_pos.m_index_2 < socket_max);
+    socket= &socket_array[m_pos.m_index_2];
+    if (socket->m_lock.is_populated())
     {
-      make_instr_row(rwlock_class);
-      return 0;
-    }
-    break;
-  case pos_all_instr_class::VIEW_COND:
-    cond_class= find_cond_class(m_pos.m_index_2);
-    if (cond_class)
-    {
-      make_instr_row(cond_class);
-      return 0;
-    }
-    break;
-  case pos_all_instr_class::VIEW_FILE:
-    file_class= find_file_class(m_pos.m_index_2);
-    if (file_class)
-    {
-      make_instr_row(file_class);
+      make_socket_row(socket);
       return 0;
     }
     break;
@@ -262,4 +173,3 @@ int table_all_instr_class::rnd_pos(const void *pos)
 
   return HA_ERR_RECORD_DELETED;
 }
-
diff --git a/storage/perfschema/table_all_instr.h b/storage/perfschema/table_all_instr.h
index 6e404659030..f78f8cee3ba 100644
--- a/storage/perfschema/table_all_instr.h
+++ b/storage/perfschema/table_all_instr.h
@@ -24,88 +24,29 @@
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
 #include "pfs_engine_table.h"
+#include "table_helper.h"
 
 /**
   @addtogroup Performance_schema_tables
   @{
 */
 
-/** Position of a cursor on table_all_instr_class. */
-struct pos_all_instr_class : public PFS_double_index,
-                             public PFS_instrument_view_constants
-{
-  pos_all_instr_class()
-    : PFS_double_index(VIEW_MUTEX, 1)
-  {}
-
-  inline void reset(void)
-  {
-    m_index_1= VIEW_MUTEX;
-    m_index_2= 1;
-  }
-
-  inline bool has_more_view(void)
-  { return (m_index_1 <= VIEW_FILE); }
-
-  inline void next_view(void)
-  {
-    m_index_1++;
-    /* Instrument keys start at 1, not 0. */
-    m_index_2= 1;
-  }
-};
-
-/**
-  Abstract table, a union of all instrumentations class metadata.
-  This table is a union of:
-  - a view on all mutex classes,
-  - a view on all rwlock classes,
-  - a view on all cond classes,
-  - a view on all file classes
-*/
-class table_all_instr_class : public PFS_engine_table
-{
-public:
-  virtual int rnd_next();
-  virtual int rnd_pos(const void *pos);
-  virtual void reset_position(void);
-
-protected:
-  table_all_instr_class(const PFS_engine_table_share *share);
-
-public:
-  ~table_all_instr_class()
-  {}
-
-protected:
-  /**
-    Build a row.
-    @param klass                      the instrument class
-  */
-  virtual void make_instr_row(PFS_instr_class *klass)= 0;
-
-  /** Current position. */
-  pos_all_instr_class m_pos;
-  /** Next position. */
-  pos_all_instr_class m_next_pos;
-};
-
 /** Position of a cursor on table_all_instr. */
 struct pos_all_instr : public PFS_double_index,
                        public PFS_instrument_view_constants
 {
   pos_all_instr()
-    : PFS_double_index(VIEW_MUTEX, 0)
+    : PFS_double_index(FIRST_VIEW, 0)
   {}
 
   inline void reset(void)
   {
-    m_index_1= VIEW_MUTEX;
+    m_index_1= FIRST_VIEW;
     m_index_2= 0;
   }
 
   inline bool has_more_view(void)
-  { return (m_index_1 <= VIEW_FILE); }
+  { return (m_index_1 <= LAST_VIEW); }
 
   inline void next_view(void)
   {
@@ -120,7 +61,8 @@ struct pos_all_instr : public PFS_double_index,
   - a view on all mutex instances,
   - a view on all rwlock instances,
   - a view on all cond instances,
-  - a view on all file instances
+  - a view on all file instances,
+  - a view on all socket instances
 */
 class table_all_instr : public PFS_engine_table
 {
@@ -157,6 +99,11 @@ protected:
     @param pfs                        the file instance
   */
   virtual void make_file_row(PFS_file *pfs)= 0;
+  /**
+    Build a row in the socket instance view.
+    @param pfs                        the socket instance
+  */
+  virtual void make_socket_row(PFS_socket *pfs)= 0;
 
   /** Current position. */
   pos_all_instr m_pos;
diff --git a/storage/perfschema/table_esgs_by_account_by_event_name.cc b/storage/perfschema/table_esgs_by_account_by_event_name.cc
new file mode 100644
index 00000000000..e36a0d0d00a
--- /dev/null
+++ b/storage/perfschema/table_esgs_by_account_by_event_name.cc
@@ -0,0 +1,233 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_esgs_by_account_by_event_name.cc
+  Table EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_esgs_by_account_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_esgs_by_account_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("USER") },
+    { C_STRING_WITH_LEN("char(16)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("HOST") },
+    { C_STRING_WITH_LEN("char(60)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_esgs_by_account_by_event_name::m_field_def=
+{ 8, field_types };
+
+PFS_engine_table_share
+table_esgs_by_account_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_stages_summary_by_account_by_event_name") },
+  &pfs_truncatable_acl,
+  table_esgs_by_account_by_event_name::create,
+  NULL, /* write_row */
+  table_esgs_by_account_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_esgs_by_account_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_esgs_by_account_by_event_name::create(void)
+{
+  return new table_esgs_by_account_by_event_name();
+}
+
+int
+table_esgs_by_account_by_event_name::delete_all_rows(void)
+{
+  reset_events_stages_by_thread();
+  reset_events_stages_by_account();
+  return 0;
+}
+
+table_esgs_by_account_by_event_name::table_esgs_by_account_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_esgs_by_account_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_esgs_by_account_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(stage_timer);
+  return 0;
+}
+
+int table_esgs_by_account_by_event_name::rnd_next(void)
+{
+  PFS_account *account;
+  PFS_stage_class *stage_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_account();
+       m_pos.next_account())
+  {
+    account= &account_array[m_pos.m_index_1];
+    if (account->m_lock.is_populated())
+    {
+      stage_class= find_stage_class(m_pos.m_index_2);
+      if (stage_class)
+      {
+        make_row(account, stage_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esgs_by_account_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_account *account;
+  PFS_stage_class *stage_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < account_max);
+
+  account= &account_array[m_pos.m_index_1];
+  if (! account->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  stage_class= find_stage_class(m_pos.m_index_2);
+  if (stage_class)
+  {
+    make_row(account, stage_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_esgs_by_account_by_event_name
+::make_row(PFS_account *account, PFS_stage_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  account->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_account.make_row(account))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_stage_visitor visitor(klass);
+  PFS_connection_iterator::visit_account(account, true, & visitor);
+
+  if (! account->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+
+int table_esgs_by_account_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+      case 1: /* HOST */
+        m_row.m_account.set_field(f->field_index, f);
+        break;
+      case 2: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 3, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 3, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esgs_by_account_by_event_name.h b/storage/perfschema/table_esgs_by_account_by_event_name.h
new file mode 100644
index 00000000000..c20f129de1e
--- /dev/null
+++ b/storage/perfschema/table_esgs_by_account_by_event_name.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_ESGS_BY_ACCOUNT_BY_EVENT_NAME_H
+#define TABLE_ESGS_BY_ACCOUNT_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_esgs_by_account_by_event_name.h
+  Table EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+*/
+struct row_esgs_by_account_by_event_name
+{
+  /** Column USER, HOST. */
+  PFS_account_row m_account;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stage_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+  Index 1 on user@host (0 based)
+  Index 2 on stage class (1 based)
+*/
+struct pos_esgs_by_account_by_event_name
+: public PFS_double_index
+{
+  pos_esgs_by_account_by_event_name()
+    : PFS_double_index(0, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline bool has_more_account(void)
+  { return (m_index_1 < account_max); }
+
+  inline void next_account(void)
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME. */
+class table_esgs_by_account_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_esgs_by_account_by_event_name();
+
+public:
+  ~table_esgs_by_account_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_account *account, PFS_stage_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esgs_by_account_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_esgs_by_account_by_event_name m_pos;
+  /** Next position. */
+  pos_esgs_by_account_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_esgs_by_host_by_event_name.cc b/storage/perfschema/table_esgs_by_host_by_event_name.cc
new file mode 100644
index 00000000000..2357d899116
--- /dev/null
+++ b/storage/perfschema/table_esgs_by_host_by_event_name.cc
@@ -0,0 +1,229 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_esgs_by_host_by_event_name.cc
+  Table EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_esgs_by_host_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_account.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_esgs_by_host_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("HOST") },
+    { C_STRING_WITH_LEN("char(60)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_esgs_by_host_by_event_name::m_field_def=
+{ 7, field_types };
+
+PFS_engine_table_share
+table_esgs_by_host_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_stages_summary_by_host_by_event_name") },
+  &pfs_truncatable_acl,
+  table_esgs_by_host_by_event_name::create,
+  NULL, /* write_row */
+  table_esgs_by_host_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_esgs_by_host_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_esgs_by_host_by_event_name::create(void)
+{
+  return new table_esgs_by_host_by_event_name();
+}
+
+int
+table_esgs_by_host_by_event_name::delete_all_rows(void)
+{
+  reset_events_stages_by_thread();
+  reset_events_stages_by_account();
+  reset_events_stages_by_host();
+  return 0;
+}
+
+table_esgs_by_host_by_event_name::table_esgs_by_host_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_esgs_by_host_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_esgs_by_host_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(stage_timer);
+  return 0;
+}
+
+int table_esgs_by_host_by_event_name::rnd_next(void)
+{
+  PFS_host *host;
+  PFS_stage_class *stage_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_host();
+       m_pos.next_host())
+  {
+    host= &host_array[m_pos.m_index_1];
+    if (host->m_lock.is_populated())
+    {
+      stage_class= find_stage_class(m_pos.m_index_2);
+      if (stage_class)
+      {
+        make_row(host, stage_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esgs_by_host_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_host *host;
+  PFS_stage_class *stage_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < host_max);
+
+  host= &host_array[m_pos.m_index_1];
+  if (! host->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  stage_class= find_stage_class(m_pos.m_index_2);
+  if (stage_class)
+  {
+    make_row(host, stage_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_esgs_by_host_by_event_name
+::make_row(PFS_host *host, PFS_stage_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  host->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_host.make_row(host))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_stage_visitor visitor(klass);
+  PFS_connection_iterator::visit_host(host, true, true, & visitor);
+
+  if (! host->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+
+int table_esgs_by_host_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* HOST */
+        m_row.m_host.set_field(f);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 2, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 2, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esgs_by_host_by_event_name.h b/storage/perfschema/table_esgs_by_host_by_event_name.h
new file mode 100644
index 00000000000..223923f3026
--- /dev/null
+++ b/storage/perfschema/table_esgs_by_host_by_event_name.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_ESGS_BY_HOST_BY_EVENT_NAME_H
+#define TABLE_ESGS_BY_HOST_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_esgs_by_host_by_event_name.h
+  Table EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_host.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME.
+*/
+struct row_esgs_by_host_by_event_name
+{
+  /** Column HOST. */
+  PFS_host_row m_host;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stage_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME.
+  Index 1 on host (0 based)
+  Index 2 on stage class (1 based).
+*/
+struct pos_esgs_by_host_by_event_name
+: public PFS_double_index
+{
+  pos_esgs_by_host_by_event_name()
+    : PFS_double_index(0, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline bool has_more_host(void)
+  { return (m_index_1 < host_max); }
+
+  inline void next_host(void)
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME. */
+class table_esgs_by_host_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_esgs_by_host_by_event_name();
+
+public:
+  ~table_esgs_by_host_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_host *host, PFS_stage_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esgs_by_host_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_esgs_by_host_by_event_name m_pos;
+  /** Next position. */
+  pos_esgs_by_host_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_esgs_by_thread_by_event_name.cc b/storage/perfschema/table_esgs_by_thread_by_event_name.cc
new file mode 100644
index 00000000000..2a69ec24277
--- /dev/null
+++ b/storage/perfschema/table_esgs_by_thread_by_event_name.cc
@@ -0,0 +1,229 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_esgs_by_thread_by_event_name.cc
+  Table EVENTS_STAGES_SUMMARY_BY_HOST_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_esgs_by_thread_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_esgs_by_thread_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("THREAD_ID") },
+    { C_STRING_WITH_LEN("int(11)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_esgs_by_thread_by_event_name::m_field_def=
+{ 7, field_types };
+
+PFS_engine_table_share
+table_esgs_by_thread_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_stages_summary_by_thread_by_event_name") },
+  &pfs_truncatable_acl,
+  table_esgs_by_thread_by_event_name::create,
+  NULL, /* write_row */
+  table_esgs_by_thread_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_esgs_by_thread_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_esgs_by_thread_by_event_name::create(void)
+{
+  return new table_esgs_by_thread_by_event_name();
+}
+
+int
+table_esgs_by_thread_by_event_name::delete_all_rows(void)
+{
+  reset_events_stages_by_thread();
+  return 0;
+}
+
+table_esgs_by_thread_by_event_name::table_esgs_by_thread_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_esgs_by_thread_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_esgs_by_thread_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(stage_timer);
+  return 0;
+}
+
+int table_esgs_by_thread_by_event_name::rnd_next(void)
+{
+  PFS_thread *thread;
+  PFS_stage_class *stage_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_thread();
+       m_pos.next_thread())
+  {
+    thread= &thread_array[m_pos.m_index_1];
+
+    /*
+      Important note: the thread scan is the outer loop (index 1),
+      to minimize the number of calls to atomic operations.
+    */
+    if (thread->m_lock.is_populated())
+    {
+      stage_class= find_stage_class(m_pos.m_index_2);
+      if (stage_class)
+      {
+        make_row(thread, stage_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esgs_by_thread_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_thread *thread;
+  PFS_stage_class *stage_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
+
+  thread= &thread_array[m_pos.m_index_1];
+  if (! thread->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  stage_class= find_stage_class(m_pos.m_index_2);
+  if (stage_class)
+  {
+    make_row(thread, stage_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_esgs_by_thread_by_event_name
+::make_row(PFS_thread *thread, PFS_stage_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  /* Protect this reader against a thread termination */
+  thread->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_thread_internal_id= thread->m_thread_internal_id;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_stage_visitor visitor(klass);
+  PFS_connection_iterator::visit_thread(thread, & visitor);
+
+  if (thread->m_lock.end_optimistic_lock(&lock))
+    m_row_exists= true;
+
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+
+int table_esgs_by_thread_by_event_name
+::read_row_values(TABLE *table, unsigned char *, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 0);
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* THREAD_ID */
+        set_field_ulong(f, m_row.m_thread_internal_id);
+        break;
+      case 1: /* NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 2, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 2, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esgs_by_thread_by_event_name.h b/storage/perfschema/table_esgs_by_thread_by_event_name.h
new file mode 100644
index 00000000000..049c8997396
--- /dev/null
+++ b/storage/perfschema/table_esgs_by_thread_by_event_name.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_ESGS_BY_THREAD_BY_EVENT_NAME_H
+#define TABLE_ESGS_BY_THREAD_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_esgs_by_thread_by_event_name.h
+  Table EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME.
+*/
+struct row_esgs_by_thread_by_event_name
+{
+  /** Column THREAD_ID. */
+  ulong m_thread_internal_id;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stage_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME.
+  Index 1 on thread (0 based).
+  Index 2 on stage class (1 based).
+*/
+struct pos_esgs_by_thread_by_event_name
+: public PFS_double_index
+{
+  pos_esgs_by_thread_by_event_name()
+    : PFS_double_index(0, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline bool has_more_thread(void)
+  { return (m_index_1 < thread_max); }
+
+  inline void next_thread(void)
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+
+  inline void next_stage(void)
+  {
+    m_index_2++;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_THREAD_BY_EVENT_NAME. */
+class table_esgs_by_thread_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_esgs_by_thread_by_event_name();
+
+public:
+  ~table_esgs_by_thread_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_thread *thread, PFS_stage_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esgs_by_thread_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_esgs_by_thread_by_event_name m_pos;
+  /** Next position. */
+  pos_esgs_by_thread_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_esgs_by_user_by_event_name.cc b/storage/perfschema/table_esgs_by_user_by_event_name.cc
new file mode 100644
index 00000000000..c7aff6fdb8d
--- /dev/null
+++ b/storage/perfschema/table_esgs_by_user_by_event_name.cc
@@ -0,0 +1,229 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_esgs_by_user_by_event_name.cc
+  Table EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_esgs_by_user_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_account.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_esgs_by_user_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("USER") },
+    { C_STRING_WITH_LEN("char(16)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_esgs_by_user_by_event_name::m_field_def=
+{ 7, field_types };
+
+PFS_engine_table_share
+table_esgs_by_user_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_stages_summary_by_user_by_event_name") },
+  &pfs_truncatable_acl,
+  table_esgs_by_user_by_event_name::create,
+  NULL, /* write_row */
+  table_esgs_by_user_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_esgs_by_user_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_esgs_by_user_by_event_name::create(void)
+{
+  return new table_esgs_by_user_by_event_name();
+}
+
+int
+table_esgs_by_user_by_event_name::delete_all_rows(void)
+{
+  reset_events_stages_by_thread();
+  reset_events_stages_by_account();
+  reset_events_stages_by_user();
+  return 0;
+}
+
+table_esgs_by_user_by_event_name::table_esgs_by_user_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_esgs_by_user_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_esgs_by_user_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(stage_timer);
+  return 0;
+}
+
+int table_esgs_by_user_by_event_name::rnd_next(void)
+{
+  PFS_user *user;
+  PFS_stage_class *stage_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_user();
+       m_pos.next_user())
+  {
+    user= &user_array[m_pos.m_index_1];
+    if (user->m_lock.is_populated())
+    {
+      stage_class= find_stage_class(m_pos.m_index_2);
+      if (stage_class)
+      {
+        make_row(user, stage_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esgs_by_user_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_user *user;
+  PFS_stage_class *stage_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < user_max);
+
+  user= &user_array[m_pos.m_index_1];
+  if (! user->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  stage_class= find_stage_class(m_pos.m_index_2);
+  if (stage_class)
+  {
+    make_row(user, stage_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_esgs_by_user_by_event_name
+::make_row(PFS_user *user, PFS_stage_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  user->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_user.make_row(user))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_stage_visitor visitor(klass);
+  PFS_connection_iterator::visit_user(user, true, true, & visitor);
+
+  if (! user->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+
+int table_esgs_by_user_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+        m_row.m_user.set_field(f);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 2, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 2, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esgs_by_user_by_event_name.h b/storage/perfschema/table_esgs_by_user_by_event_name.h
new file mode 100644
index 00000000000..565e633e386
--- /dev/null
+++ b/storage/perfschema/table_esgs_by_user_by_event_name.h
@@ -0,0 +1,130 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_ESGS_BY_USER_BY_EVENT_NAME_H
+#define TABLE_ESGS_BY_USER_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_esgs_by_user_by_event_name.h
+  Table EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_user.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME.
+*/
+struct row_esgs_by_user_by_event_name
+{
+  /** Column USER. */
+  PFS_user_row m_user;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stage_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME.
+  Index 1 on user (0 based)
+  Index 2 on stage class (1 based)
+*/
+struct pos_esgs_by_user_by_event_name
+: public PFS_double_index
+{
+  pos_esgs_by_user_by_event_name()
+    : PFS_double_index(0, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline bool has_more_user(void)
+  { return (m_index_1 < user_max); }
+
+  inline void next_user(void)
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+
+  inline void next_stage(void)
+  {
+    m_index_2++;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_BY_USER_BY_EVENT_NAME. */
+class table_esgs_by_user_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_esgs_by_user_by_event_name();
+
+public:
+  ~table_esgs_by_user_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_user *user, PFS_stage_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esgs_by_user_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_esgs_by_user_by_event_name m_pos;
+  /** Next position. */
+  pos_esgs_by_user_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_esgs_global_by_event_name.cc b/storage/perfschema/table_esgs_global_by_event_name.cc
new file mode 100644
index 00000000000..2ac22fb1551
--- /dev/null
+++ b/storage/perfschema/table_esgs_global_by_event_name.cc
@@ -0,0 +1,204 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_esgs_global_by_event_name.cc
+  Table EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_esgs_global_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_instr.h"
+#include "pfs_timer.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_esgs_global_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_esgs_global_by_event_name::m_field_def=
+{ 6, field_types };
+
+PFS_engine_table_share
+table_esgs_global_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_stages_summary_global_by_event_name") },
+  &pfs_truncatable_acl,
+  table_esgs_global_by_event_name::create,
+  NULL, /* write_row */
+  table_esgs_global_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_esgs_global_by_event_name::create(void)
+{
+  return new table_esgs_global_by_event_name();
+}
+
+int
+table_esgs_global_by_event_name::delete_all_rows(void)
+{
+  reset_events_stages_by_thread();
+  reset_events_stages_global();
+  return 0;
+}
+
+table_esgs_global_by_event_name::table_esgs_global_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(1), m_next_pos(1)
+{}
+
+void table_esgs_global_by_event_name::reset_position(void)
+{
+  m_pos= 1;
+  m_next_pos= 1;
+}
+
+int table_esgs_global_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(stage_timer);
+  return 0;
+}
+
+int table_esgs_global_by_event_name::rnd_next(void)
+{
+  PFS_stage_class *stage_class;
+
+  if (global_instr_class_stages_array == NULL)
+    return HA_ERR_END_OF_FILE;
+
+  m_pos.set_at(&m_next_pos);
+
+  stage_class= find_stage_class(m_pos.m_index);
+  if (stage_class)
+  {
+    make_row(stage_class);
+    m_next_pos.set_after(&m_pos);
+    return 0;
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esgs_global_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_stage_class *stage_class;
+
+  set_position(pos);
+
+  if (global_instr_class_stages_array == NULL)
+    return HA_ERR_END_OF_FILE;
+
+  stage_class=find_stage_class(m_pos.m_index);
+  if (stage_class)
+  {
+    make_row(stage_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+
+void table_esgs_global_by_event_name
+::make_row(PFS_stage_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_stage_visitor visitor(klass);
+  PFS_connection_iterator::visit_global(true, /* hosts */
+                                        false, /* users */
+                                        true, true, & visitor);
+
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+  m_row_exists= true;
+}
+
+int table_esgs_global_by_event_name
+::read_row_values(TABLE *table, unsigned char *, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 0);
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 1, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 1, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esgs_global_by_event_name.h b/storage/perfschema/table_esgs_global_by_event_name.h
new file mode 100644
index 00000000000..a9c1456e9b1
--- /dev/null
+++ b/storage/perfschema/table_esgs_global_by_event_name.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_ESGS_GLOBAL_BY_EVENT_NAME_H
+#define TABLE_ESGS_GLOBAL_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_esgs_global_by_event_name.h
+  Table EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME.
+*/
+struct row_esgs_global_by_event_name
+{
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stage_stat_row m_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STAGES_SUMMARY_GLOBAL_BY_EVENT_NAME. */
+class table_esgs_global_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_esgs_global_by_event_name();
+
+public:
+  ~table_esgs_global_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_stage_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esgs_global_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_esms_by_account_by_event_name.cc b/storage/perfschema/table_esms_by_account_by_event_name.cc
new file mode 100644
index 00000000000..77f87182945
--- /dev/null
+++ b/storage/perfschema/table_esms_by_account_by_event_name.cc
@@ -0,0 +1,328 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_esms_by_account_by_event_name.cc
+  Table EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_esms_by_account_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_esms_by_account_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("USER") },
+    { C_STRING_WITH_LEN("char(16)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("HOST") },
+    { C_STRING_WITH_LEN("char(60)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_LOCK_TIME") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_WARNINGS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_AFFECTED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_SENT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_EXAMINED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_DISK_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_RANGE_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE_CHECK") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_MERGE_PASSES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_ROWS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_GOOD_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_esms_by_account_by_event_name::m_field_def=
+{ 27, field_types };
+
+PFS_engine_table_share
+table_esms_by_account_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_statements_summary_by_account_by_event_name") },
+  &pfs_truncatable_acl,
+  table_esms_by_account_by_event_name::create,
+  NULL, /* write_row */
+  table_esms_by_account_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_esms_by_account_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_esms_by_account_by_event_name::create(void)
+{
+  return new table_esms_by_account_by_event_name();
+}
+
+int
+table_esms_by_account_by_event_name::delete_all_rows(void)
+{
+  reset_events_statements_by_thread();
+  reset_events_statements_by_account();
+  return 0;
+}
+
+table_esms_by_account_by_event_name::table_esms_by_account_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_esms_by_account_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_esms_by_account_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(statement_timer);
+  return 0;
+}
+
+int table_esms_by_account_by_event_name::rnd_next(void)
+{
+  PFS_account *account;
+  PFS_statement_class *statement_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_account();
+       m_pos.next_account())
+  {
+    account= &account_array[m_pos.m_index_1];
+    if (account->m_lock.is_populated())
+    {
+      statement_class= find_statement_class(m_pos.m_index_2);
+      if (statement_class)
+      {
+        make_row(account, statement_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esms_by_account_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_account *account;
+  PFS_statement_class *statement_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < account_max);
+
+  account= &account_array[m_pos.m_index_1];
+  if (! account->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  statement_class= find_statement_class(m_pos.m_index_2);
+  if (statement_class)
+  {
+    make_row(account, statement_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_esms_by_account_by_event_name
+::make_row(PFS_account *account, PFS_statement_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  account->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_account.make_row(account))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_statement_visitor visitor(klass);
+  PFS_connection_iterator::visit_account(account, true, & visitor);
+
+  if (! account->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+
+int table_esms_by_account_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+      case 1: /* HOST */
+        m_row.m_account.set_field(f->field_index, f);
+        break;
+      case 2: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 3, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 3, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esms_by_account_by_event_name.h b/storage/perfschema/table_esms_by_account_by_event_name.h
new file mode 100644
index 00000000000..d58bf9e5763
--- /dev/null
+++ b/storage/perfschema/table_esms_by_account_by_event_name.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_ESMS_BY_ACCOUNT_BY_EVENT_NAME_H
+#define TABLE_ESMS_BY_ACCOUNT_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_esms_by_account_by_event_name.h
+  Table EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+*/
+struct row_esms_by_account_by_event_name
+{
+  /** Column USER, HOST. */
+  PFS_account_row m_account;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_statement_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+  Index 1 on user@host (0 based)
+  Index 2 on statement class (1 based)
+*/
+struct pos_esms_by_account_by_event_name
+: public PFS_double_index
+{
+  pos_esms_by_account_by_event_name()
+    : PFS_double_index(0, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline bool has_more_account(void)
+  { return (m_index_1 < account_max); }
+
+  inline void next_account(void)
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME. */
+class table_esms_by_account_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_esms_by_account_by_event_name();
+
+public:
+  ~table_esms_by_account_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_account *account, PFS_statement_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esms_by_account_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_esms_by_account_by_event_name m_pos;
+  /** Next position. */
+  pos_esms_by_account_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_esms_by_digest.cc b/storage/perfschema/table_esms_by_digest.cc
new file mode 100644
index 00000000000..dac8d3b01dc
--- /dev/null
+++ b/storage/perfschema/table_esms_by_digest.cc
@@ -0,0 +1,325 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_esms_by_digest.cc
+  Table EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_DIGEST (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_esms_by_digest.h"
+#include "pfs_global.h"
+#include "pfs_instr.h"
+#include "pfs_timer.h"
+#include "pfs_visitor.h"
+#include "table_esms_by_digest.h"
+#include "pfs_digest.h"
+
+THR_LOCK table_esms_by_digest::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("DIGEST") },
+    { C_STRING_WITH_LEN("varchar(32)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("DIGEST_TEXT") },
+    { C_STRING_WITH_LEN("longtext") },
+    { NULL, 0}
+  },
+  { 
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_LOCK_TIME") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_WARNINGS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_AFFECTED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_SENT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_EXAMINED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_DISK_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_RANGE_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE_CHECK") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_MERGE_PASSES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_ROWS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_GOOD_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("FIRST_SEEN") },
+    { C_STRING_WITH_LEN("timestamp") },
+    { NULL, 0}
+  },
+  { 
+    { C_STRING_WITH_LEN("LAST_SEEN") },
+    { C_STRING_WITH_LEN("timestamp") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_esms_by_digest::m_field_def=
+{ 28, field_types };
+
+PFS_engine_table_share
+table_esms_by_digest::m_share=
+{
+  { C_STRING_WITH_LEN("events_statements_summary_by_digest") },
+  &pfs_truncatable_acl,
+  table_esms_by_digest::create,
+  NULL, /* write_row */
+  table_esms_by_digest::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_esms_by_digest::create(void)
+{
+  return new table_esms_by_digest();
+}
+
+int
+table_esms_by_digest::delete_all_rows(void)
+{
+  reset_esms_by_digest();
+  return 0;
+}
+
+table_esms_by_digest::table_esms_by_digest()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+
+void table_esms_by_digest::reset_position(void)
+{
+  m_pos= 0;
+  m_next_pos= 0;
+}
+
+int table_esms_by_digest::rnd_next(void)
+{
+  PFS_statements_digest_stat* digest_stat;
+
+  if (statements_digest_stat_array == NULL)
+    return HA_ERR_END_OF_FILE;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < digest_max;
+       m_pos.next())
+  {
+    digest_stat= &statements_digest_stat_array[m_pos.m_index];
+    if (digest_stat->m_first_seen != 0)
+    {
+      make_row(digest_stat);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esms_by_digest::rnd_pos(const void *pos)
+{
+  PFS_statements_digest_stat* digest_stat;
+
+  if (statements_digest_stat_array == NULL)
+    return HA_ERR_END_OF_FILE;
+
+  set_position(pos);
+  digest_stat= &statements_digest_stat_array[m_pos.m_index];
+
+  if (digest_stat->m_first_seen != 0)
+  {
+    make_row(digest_stat);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+
+void table_esms_by_digest::make_row(PFS_statements_digest_stat* digest_stat)
+{
+  m_row_exists= false;
+  m_row.m_first_seen= digest_stat->m_first_seen;
+  m_row.m_last_seen= digest_stat->m_last_seen;
+  m_row.m_digest.make_row(digest_stat);
+
+  /*
+    Get statements stats.
+  */
+  time_normalizer *normalizer= time_normalizer::get(statement_timer);
+  m_row.m_stat.set(normalizer, & digest_stat->m_stat);
+
+  m_row_exists= true;
+}
+
+int table_esms_by_digest
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* 
+    Set the null bits. It indicates how many fields could be null
+    in the table.
+  */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* DIGEST */
+      case 1: /* DIGEST_TEXT */
+        m_row.m_digest.set_field(f->field_index, f);
+        break;
+      case 26: /* FIRST_SEEN */
+        set_field_timestamp(f, m_row.m_first_seen);
+        break;
+      case 27: /* LAST_SEEN */
+        set_field_timestamp(f, m_row.m_last_seen);
+        break;
+      default: /* 1, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 2, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esms_by_digest.h b/storage/perfschema/table_esms_by_digest.h
new file mode 100644
index 00000000000..2e2e595c056
--- /dev/null
+++ b/storage/perfschema/table_esms_by_digest.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_ESMS_BY_DIGEST_H
+#define TABLE_ESMS_BY_DIGEST_H
+
+/**
+  @file storage/perfschema/table_esms_by_digest.h
+  Table EVENTS_STATEMENTS_SUMMARY_BY_DIGEST (declarations).
+*/
+
+#include "table_helper.h"
+#include "pfs_digest.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_DIGEST.
+*/
+struct row_esms_by_digest
+{
+  /** Columns DIGEST/DIGEST_TEXT. */
+  PFS_digest_row m_digest;
+
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_statement_stat_row m_stat;
+
+  /** Column FIRST_SEEN. */
+  ulonglong m_first_seen;
+  /** Column LAST_SEEN. */
+  ulonglong m_last_seen;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_DIGEST. */
+class table_esms_by_digest : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_esms_by_digest();
+
+public:
+  ~table_esms_by_digest()
+  {}
+
+protected:
+  void make_row(PFS_statements_digest_stat*);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esms_by_digest m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_esms_by_host_by_event_name.cc b/storage/perfschema/table_esms_by_host_by_event_name.cc
new file mode 100644
index 00000000000..57aaf7602c4
--- /dev/null
+++ b/storage/perfschema/table_esms_by_host_by_event_name.cc
@@ -0,0 +1,324 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_esms_by_host_by_event_name.cc
+  Table EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_esms_by_host_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_account.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_esms_by_host_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("HOST") },
+    { C_STRING_WITH_LEN("char(60)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_LOCK_TIME") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_WARNINGS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_AFFECTED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_SENT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_EXAMINED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_DISK_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_RANGE_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE_CHECK") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_MERGE_PASSES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_ROWS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_GOOD_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_esms_by_host_by_event_name::m_field_def=
+{ 26, field_types };
+
+PFS_engine_table_share
+table_esms_by_host_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_statements_summary_by_host_by_event_name") },
+  &pfs_truncatable_acl,
+  table_esms_by_host_by_event_name::create,
+  NULL, /* write_row */
+  table_esms_by_host_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_esms_by_host_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_esms_by_host_by_event_name::create(void)
+{
+  return new table_esms_by_host_by_event_name();
+}
+
+int
+table_esms_by_host_by_event_name::delete_all_rows(void)
+{
+  reset_events_statements_by_thread();
+  reset_events_statements_by_account();
+  reset_events_statements_by_host();
+  return 0;
+}
+
+table_esms_by_host_by_event_name::table_esms_by_host_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_esms_by_host_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_esms_by_host_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(statement_timer);
+  return 0;
+}
+
+int table_esms_by_host_by_event_name::rnd_next(void)
+{
+  PFS_host *host;
+  PFS_statement_class *statement_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_host();
+       m_pos.next_host())
+  {
+    host= &host_array[m_pos.m_index_1];
+    if (host->m_lock.is_populated())
+    {
+      statement_class= find_statement_class(m_pos.m_index_2);
+      if (statement_class)
+      {
+        make_row(host, statement_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esms_by_host_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_host *host;
+  PFS_statement_class *statement_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < host_max);
+
+  host= &host_array[m_pos.m_index_1];
+  if (! host->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  statement_class= find_statement_class(m_pos.m_index_2);
+  if (statement_class)
+  {
+    make_row(host, statement_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_esms_by_host_by_event_name
+::make_row(PFS_host *host, PFS_statement_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  host->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_host.make_row(host))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_statement_visitor visitor(klass);
+  PFS_connection_iterator::visit_host(host, true, true, & visitor);
+
+  if (! host->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+
+int table_esms_by_host_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* HOST */
+        m_row.m_host.set_field(f);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 2, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 2, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esms_by_host_by_event_name.h b/storage/perfschema/table_esms_by_host_by_event_name.h
new file mode 100644
index 00000000000..00c6beee561
--- /dev/null
+++ b/storage/perfschema/table_esms_by_host_by_event_name.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_ESMS_BY_HOST_BY_EVENT_NAME_H
+#define TABLE_ESMS_BY_HOST_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_esms_by_host_by_event_name.h
+  Table EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_host.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME.
+*/
+struct row_esms_by_host_by_event_name
+{
+  /** Column HOST */
+  PFS_host_row m_host;
+  /** Column EVENT_NAME */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_statement_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME.
+  Index 1 on host (0 based)
+  Index 2 on statement class (1 based)
+*/
+struct pos_esms_by_host_by_event_name
+: public PFS_double_index
+{
+  pos_esms_by_host_by_event_name()
+    : PFS_double_index(0, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline bool has_more_host(void)
+  { return (m_index_1 < host_max); }
+
+  inline void next_host(void)
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME. */
+class table_esms_by_host_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_esms_by_host_by_event_name();
+
+public:
+  ~table_esms_by_host_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_host *host, PFS_statement_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esms_by_host_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_esms_by_host_by_event_name m_pos;
+  /** Next position. */
+  pos_esms_by_host_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_esms_by_thread_by_event_name.cc b/storage/perfschema/table_esms_by_thread_by_event_name.cc
new file mode 100644
index 00000000000..5a7faca1b79
--- /dev/null
+++ b/storage/perfschema/table_esms_by_thread_by_event_name.cc
@@ -0,0 +1,325 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_esms_by_thread_by_event_name.cc
+  Table EVENTS_STATEMENTS_SUMMARY_BY_HOST_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_esms_by_thread_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_esms_by_thread_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("THREAD_ID") },
+    { C_STRING_WITH_LEN("int(11)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_LOCK_TIME") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_WARNINGS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_AFFECTED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_SENT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_EXAMINED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_DISK_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_RANGE_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE_CHECK") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_MERGE_PASSES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_ROWS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_GOOD_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_esms_by_thread_by_event_name::m_field_def=
+{ 26, field_types };
+
+PFS_engine_table_share
+table_esms_by_thread_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_statements_summary_by_thread_by_event_name") },
+  &pfs_truncatable_acl,
+  table_esms_by_thread_by_event_name::create,
+  NULL, /* write_row */
+  table_esms_by_thread_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_esms_by_thread_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_esms_by_thread_by_event_name::create(void)
+{
+  return new table_esms_by_thread_by_event_name();
+}
+
+int
+table_esms_by_thread_by_event_name::delete_all_rows(void)
+{
+  reset_events_statements_by_thread();
+  return 0;
+}
+
+table_esms_by_thread_by_event_name::table_esms_by_thread_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_esms_by_thread_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_esms_by_thread_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(statement_timer);
+  return 0;
+}
+
+int table_esms_by_thread_by_event_name::rnd_next(void)
+{
+  PFS_thread *thread;
+  PFS_statement_class *statement_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_thread();
+       m_pos.next_thread())
+  {
+    thread= &thread_array[m_pos.m_index_1];
+
+    /*
+      Important note: the thread scan is the outer loop (index 1),
+      to minimize the number of calls to atomic operations.
+    */
+    if (thread->m_lock.is_populated())
+    {
+      statement_class= find_statement_class(m_pos.m_index_2);
+      if (statement_class)
+      {
+        make_row(thread, statement_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esms_by_thread_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_thread *thread;
+  PFS_statement_class *statement_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
+
+  thread= &thread_array[m_pos.m_index_1];
+  if (! thread->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  statement_class= find_statement_class(m_pos.m_index_2);
+  if (statement_class)
+  {
+    make_row(thread, statement_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_esms_by_thread_by_event_name
+::make_row(PFS_thread *thread, PFS_statement_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  /* Protect this reader against a thread termination */
+  thread->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_thread_internal_id= thread->m_thread_internal_id;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_statement_visitor visitor(klass);
+  PFS_connection_iterator::visit_thread(thread, & visitor);
+
+  if (! thread->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+
+int table_esms_by_thread_by_event_name
+::read_row_values(TABLE *table, unsigned char *, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 0);
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* THREAD_ID */
+        set_field_ulong(f, m_row.m_thread_internal_id);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 2, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 2, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esms_by_thread_by_event_name.h b/storage/perfschema/table_esms_by_thread_by_event_name.h
new file mode 100644
index 00000000000..2f36606a5e1
--- /dev/null
+++ b/storage/perfschema/table_esms_by_thread_by_event_name.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_ESMS_BY_THREAD_BY_EVENT_NAME_H
+#define TABLE_ESMS_BY_THREAD_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_esms_by_thread_by_event_name.h
+  Table EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME.
+*/
+struct row_esms_by_thread_by_event_name
+{
+  /** Column THREAD_ID. */
+  ulong m_thread_internal_id;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_statement_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME.
+  Index 1 on thread (0 based).
+  Index 2 on statement class (1 based).
+*/
+struct pos_esms_by_thread_by_event_name
+: public PFS_double_index, public PFS_instrument_view_constants
+{
+  pos_esms_by_thread_by_event_name()
+    : PFS_double_index(0, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline bool has_more_thread(void)
+  { return (m_index_1 < thread_max); }
+
+  inline void next_thread(void)
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+
+  inline void next_statement(void)
+  {
+    m_index_2++;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_THREAD_BY_EVENT_NAME. */
+class table_esms_by_thread_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_esms_by_thread_by_event_name();
+
+public:
+  ~table_esms_by_thread_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_thread *thread, PFS_statement_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esms_by_thread_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_esms_by_thread_by_event_name m_pos;
+  /** Next position. */
+  pos_esms_by_thread_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_esms_by_user_by_event_name.cc b/storage/perfschema/table_esms_by_user_by_event_name.cc
new file mode 100644
index 00000000000..d65a255e280
--- /dev/null
+++ b/storage/perfschema/table_esms_by_user_by_event_name.cc
@@ -0,0 +1,324 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_esms_by_user_by_event_name.cc
+  Table EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_esms_by_user_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_account.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_esms_by_user_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("USER") },
+    { C_STRING_WITH_LEN("char(16)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_LOCK_TIME") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_WARNINGS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_AFFECTED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_SENT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_EXAMINED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_DISK_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_RANGE_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE_CHECK") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_MERGE_PASSES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_ROWS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_GOOD_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_esms_by_user_by_event_name::m_field_def=
+{ 26, field_types };
+
+PFS_engine_table_share
+table_esms_by_user_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_statements_summary_by_user_by_event_name") },
+  &pfs_truncatable_acl,
+  table_esms_by_user_by_event_name::create,
+  NULL, /* write_row */
+  table_esms_by_user_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_esms_by_user_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_esms_by_user_by_event_name::create(void)
+{
+  return new table_esms_by_user_by_event_name();
+}
+
+int
+table_esms_by_user_by_event_name::delete_all_rows(void)
+{
+  reset_events_statements_by_thread();
+  reset_events_statements_by_account();
+  reset_events_statements_by_user();
+  return 0;
+}
+
+table_esms_by_user_by_event_name::table_esms_by_user_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_esms_by_user_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_esms_by_user_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(statement_timer);
+  return 0;
+}
+
+int table_esms_by_user_by_event_name::rnd_next(void)
+{
+  PFS_user *user;
+  PFS_statement_class *statement_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_user();
+       m_pos.next_user())
+  {
+    user= &user_array[m_pos.m_index_1];
+    if (user->m_lock.is_populated())
+    {
+      statement_class= find_statement_class(m_pos.m_index_2);
+      if (statement_class)
+      {
+        make_row(user, statement_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esms_by_user_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_user *user;
+  PFS_statement_class *statement_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < user_max);
+
+  user= &user_array[m_pos.m_index_1];
+  if (! user->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  statement_class= find_statement_class(m_pos.m_index_2);
+  if (statement_class)
+  {
+    make_row(user, statement_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_esms_by_user_by_event_name
+::make_row(PFS_user *user, PFS_statement_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  user->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_user.make_row(user))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_statement_visitor visitor(klass);
+  PFS_connection_iterator::visit_user(user, true, true, & visitor);
+
+  if (! user->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+
+int table_esms_by_user_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+        m_row.m_user.set_field(f);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 2, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 2, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esms_by_user_by_event_name.h b/storage/perfschema/table_esms_by_user_by_event_name.h
new file mode 100644
index 00000000000..4f52b64b6b8
--- /dev/null
+++ b/storage/perfschema/table_esms_by_user_by_event_name.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_ESMS_BY_USER_BY_EVENT_NAME_H
+#define TABLE_ESMS_BY_USER_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_esms_by_user_by_event_name.h
+  Table EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_user.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME.
+*/
+struct row_esms_by_user_by_event_name
+{
+  /** Column USER. */
+  PFS_user_row m_user;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_statement_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME.
+  Index 1 on user (0 based)
+  Index 2 on statement class (1 based)
+*/
+struct pos_esms_by_user_by_event_name
+: public PFS_double_index
+{
+  pos_esms_by_user_by_event_name()
+    : PFS_double_index(0, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 1;
+  }
+
+  inline bool has_more_user(void)
+  { return (m_index_1 < user_max); }
+
+  inline void next_user(void)
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_USER_BY_EVENT_NAME. */
+class table_esms_by_user_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_esms_by_user_by_event_name();
+
+public:
+  ~table_esms_by_user_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_user *user, PFS_statement_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esms_by_user_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_esms_by_user_by_event_name m_pos;
+  /** Next position. */
+  pos_esms_by_user_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_esms_global_by_event_name.cc b/storage/perfschema/table_esms_global_by_event_name.cc
new file mode 100644
index 00000000000..22c87f09137
--- /dev/null
+++ b/storage/perfschema/table_esms_global_by_event_name.cc
@@ -0,0 +1,299 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_esms_global_by_event_name.cc
+  Table EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_esms_global_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_instr.h"
+#include "pfs_timer.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_esms_global_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_LOCK_TIME") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_WARNINGS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_AFFECTED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_SENT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_ROWS_EXAMINED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_DISK_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CREATED_TMP_TABLES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_FULL_RANGE_JOIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_RANGE_CHECK") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SELECT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_MERGE_PASSES") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_RANGE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_ROWS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_SORT_SCAN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NO_GOOD_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_esms_global_by_event_name::m_field_def=
+{ 25, field_types };
+
+PFS_engine_table_share
+table_esms_global_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_statements_summary_global_by_event_name") },
+  &pfs_truncatable_acl,
+  table_esms_global_by_event_name::create,
+  NULL, /* write_row */
+  table_esms_global_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_esms_global_by_event_name::create(void)
+{
+  return new table_esms_global_by_event_name();
+}
+
+int
+table_esms_global_by_event_name::delete_all_rows(void)
+{
+  reset_events_statements_by_thread();
+  reset_events_statements_global();
+  return 0;
+}
+
+table_esms_global_by_event_name::table_esms_global_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(1), m_next_pos(1)
+{}
+
+void table_esms_global_by_event_name::reset_position(void)
+{
+  m_pos= 1;
+  m_next_pos= 1;
+}
+
+int table_esms_global_by_event_name::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(statement_timer);
+  return 0;
+}
+
+int table_esms_global_by_event_name::rnd_next(void)
+{
+  PFS_statement_class *statement_class;
+
+  if (global_instr_class_statements_array == NULL)
+    return HA_ERR_END_OF_FILE;
+
+  m_pos.set_at(&m_next_pos);
+
+  statement_class= find_statement_class(m_pos.m_index);
+  if (statement_class)
+  {
+    make_row(statement_class);
+    m_next_pos.set_after(&m_pos);
+    return 0;
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_esms_global_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_statement_class *statement_class;
+
+  set_position(pos);
+
+  if (global_instr_class_statements_array == NULL)
+    return HA_ERR_END_OF_FILE;
+
+  statement_class=find_statement_class(m_pos.m_index);
+  if (statement_class)
+  {
+    make_row(statement_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+
+void table_esms_global_by_event_name
+::make_row(PFS_statement_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_statement_visitor visitor(klass);
+  PFS_connection_iterator::visit_global(true, /* hosts */
+                                        false, /* users */
+                                        true, true, & visitor);
+
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+  m_row_exists= true;
+}
+
+int table_esms_global_by_event_name
+::read_row_values(TABLE *table, unsigned char *, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 0);
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 1, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 1, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_esms_global_by_event_name.h b/storage/perfschema/table_esms_global_by_event_name.h
new file mode 100644
index 00000000000..ed07e2b9062
--- /dev/null
+++ b/storage/perfschema/table_esms_global_by_event_name.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_ESMS_GLOBAL_BY_EVENT_NAME_H
+#define TABLE_ESMS_GLOBAL_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_esms_global_by_event_name.h
+  Table EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME.
+*/
+struct row_esms_global_by_event_name
+{
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_statement_stat_row m_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_GLOBAL_BY_EVENT_NAME. */
+class table_esms_global_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_esms_global_by_event_name();
+
+public:
+  ~table_esms_global_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_statement_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_esms_global_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_events_stages.cc b/storage/perfschema/table_events_stages.cc
new file mode 100644
index 00000000000..e438249fbd3
--- /dev/null
+++ b/storage/perfschema/table_events_stages.cc
@@ -0,0 +1,523 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_events_stages.cc
+  Table EVENTS_STAGES_xxx (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "table_events_stages.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_events_stages.h"
+#include "pfs_timer.h"
+
+THR_LOCK table_events_stages_current::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("THREAD_ID") },
+    { C_STRING_WITH_LEN("int(11)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_ID") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("END_EVENT_ID") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SOURCE") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("TIMER_START") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("TIMER_END") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("NESTING_EVENT_ID") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("NESTING_EVENT_TYPE") },
+    { C_STRING_WITH_LEN("enum(\'STATEMENT\',\'STAGE\',\'WAIT\'") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_events_stages_current::m_field_def=
+{10 , field_types };
+
+PFS_engine_table_share
+table_events_stages_current::m_share=
+{
+  { C_STRING_WITH_LEN("events_stages_current") },
+  &pfs_truncatable_acl,
+  &table_events_stages_current::create,
+  NULL, /* write_row */
+  &table_events_stages_current::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+THR_LOCK table_events_stages_history::m_table_lock;
+
+PFS_engine_table_share
+table_events_stages_history::m_share=
+{
+  { C_STRING_WITH_LEN("events_stages_history") },
+  &pfs_truncatable_acl,
+  &table_events_stages_history::create,
+  NULL, /* write_row */
+  &table_events_stages_history::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_events_stages_history), /* ref length */
+  &m_table_lock,
+  &table_events_stages_current::m_field_def,
+  false /* checked */
+};
+
+THR_LOCK table_events_stages_history_long::m_table_lock;
+
+PFS_engine_table_share
+table_events_stages_history_long::m_share=
+{
+  { C_STRING_WITH_LEN("events_stages_history_long") },
+  &pfs_truncatable_acl,
+  &table_events_stages_history_long::create,
+  NULL, /* write_row */
+  &table_events_stages_history_long::delete_all_rows,
+  NULL, /* get_row_count */
+  10000, /* records */
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock,
+  &table_events_stages_current::m_field_def,
+  false /* checked */
+};
+
+table_events_stages_common::table_events_stages_common
+(const PFS_engine_table_share *share, void *pos)
+  : PFS_engine_table(share, pos),
+  m_row_exists(false)
+{}
+
+/**
+  Build a row.
+  @param stage                      the stage the cursor is reading
+*/
+void table_events_stages_common::make_row(PFS_events_stages *stage)
+{
+  const char *base;
+  const char *safe_source_file;
+
+  m_row_exists= false;
+
+  PFS_stage_class *unsafe= (PFS_stage_class*) stage->m_class;
+  PFS_stage_class *klass= sanitize_stage_class(unsafe);
+  if (unlikely(klass == NULL))
+    return;
+
+  m_row.m_thread_internal_id= stage->m_thread_internal_id;
+  m_row.m_event_id= stage->m_event_id;
+  m_row.m_end_event_id= stage->m_end_event_id;
+  m_row.m_nesting_event_id= stage->m_nesting_event_id;
+  m_row.m_nesting_event_type= stage->m_nesting_event_type;
+
+  m_normalizer->to_pico(stage->m_timer_start, stage->m_timer_end,
+                      & m_row.m_timer_start, & m_row.m_timer_end, & m_row.m_timer_wait);
+
+  m_row.m_name= klass->m_name;
+  m_row.m_name_length= klass->m_name_length;
+
+  safe_source_file= stage->m_source_file;
+  if (unlikely(safe_source_file == NULL))
+    return;
+
+  base= base_name(safe_source_file);
+  m_row.m_source_length= my_snprintf(m_row.m_source, sizeof(m_row.m_source),
+                                     "%s:%d", base, stage->m_source_line);
+  if (m_row.m_source_length > sizeof(m_row.m_source))
+    m_row.m_source_length= sizeof(m_row.m_source);
+
+  m_row_exists= true;
+  return;
+}
+
+int table_events_stages_common::read_row_values(TABLE *table,
+                                               unsigned char *buf,
+                                               Field **fields,
+                                               bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* THREAD_ID */
+        set_field_ulong(f, m_row.m_thread_internal_id);
+        break;
+      case 1: /* EVENT_ID */
+        set_field_ulonglong(f, m_row.m_event_id);
+        break;
+      case 2: /* END_EVENT_ID */
+        if (m_row.m_end_event_id > 0)
+          set_field_ulonglong(f, m_row.m_end_event_id - 1);
+        else
+          f->set_null();
+        break;
+      case 3: /* EVENT_NAME */
+        set_field_varchar_utf8(f, m_row.m_name, m_row.m_name_length);
+        break;
+      case 4: /* SOURCE */
+        set_field_varchar_utf8(f, m_row.m_source, m_row.m_source_length);
+        break;
+      case 5: /* TIMER_START */
+        if (m_row.m_timer_start != 0)
+          set_field_ulonglong(f, m_row.m_timer_start);
+        else
+          f->set_null();
+        break;
+      case 6: /* TIMER_END */
+        if (m_row.m_timer_end != 0)
+          set_field_ulonglong(f, m_row.m_timer_end);
+        else
+          f->set_null();
+        break;
+      case 7: /* TIMER_WAIT */
+        if (m_row.m_timer_wait != 0)
+          set_field_ulonglong(f, m_row.m_timer_wait);
+        else
+          f->set_null();
+        break;
+      case 8: /* NESTING_EVENT_ID */
+        if (m_row.m_nesting_event_id != 0)
+          set_field_ulonglong(f, m_row.m_nesting_event_id);
+        else
+          f->set_null();
+        break;
+      case 9: /* NESTING_EVENT_TYPE */
+        if (m_row.m_nesting_event_id != 0)
+          set_field_enum(f, m_row.m_nesting_event_type);
+        else
+          f->set_null();
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+  return 0;
+}
+
+PFS_engine_table* table_events_stages_current::create(void)
+{
+  return new table_events_stages_current();
+}
+
+table_events_stages_current::table_events_stages_current()
+  : table_events_stages_common(&m_share, &m_pos),
+  m_pos(0), m_next_pos(0)
+{}
+
+void table_events_stages_current::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_events_stages_current::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(stage_timer);
+  return 0;
+}
+
+int table_events_stages_current::rnd_next(void)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_stages *stage;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < thread_max;
+       m_pos.next())
+  {
+    pfs_thread= &thread_array[m_pos.m_index];
+
+    if (! pfs_thread->m_lock.is_populated())
+    {
+      /* This thread does not exist */
+      continue;
+    }
+
+    stage= &pfs_thread->m_stage_current;
+
+    make_row(stage);
+    m_next_pos.set_after(&m_pos);
+    return 0;
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_events_stages_current::rnd_pos(const void *pos)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_stages *stage;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index < thread_max);
+  pfs_thread= &thread_array[m_pos.m_index];
+
+  if (! pfs_thread->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  stage= &pfs_thread->m_stage_current;
+  make_row(stage);
+  return 0;
+}
+
+int table_events_stages_current::delete_all_rows(void)
+{
+  reset_events_stages_current();
+  return 0;
+}
+
+PFS_engine_table* table_events_stages_history::create(void)
+{
+  return new table_events_stages_history();
+}
+
+table_events_stages_history::table_events_stages_history()
+  : table_events_stages_common(&m_share, &m_pos),
+  m_pos(), m_next_pos()
+{}
+
+void table_events_stages_history::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_events_stages_history::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(stage_timer);
+  return 0;
+}
+
+int table_events_stages_history::rnd_next(void)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_stages *stage;
+
+  if (events_stages_history_per_thread == 0)
+    return HA_ERR_END_OF_FILE;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index_1 < thread_max;
+       m_pos.next_thread())
+  {
+    pfs_thread= &thread_array[m_pos.m_index_1];
+
+    if (! pfs_thread->m_lock.is_populated())
+    {
+      /* This thread does not exist */
+      continue;
+    }
+
+    if (m_pos.m_index_2 >= events_stages_history_per_thread)
+    {
+      /* This thread does not have more (full) history */
+      continue;
+    }
+
+    if ( ! pfs_thread->m_stages_history_full &&
+        (m_pos.m_index_2 >= pfs_thread->m_stages_history_index))
+    {
+      /* This thread does not have more (not full) history */
+      continue;
+    }
+
+    stage= &pfs_thread->m_stages_history[m_pos.m_index_2];
+
+    if (stage->m_class != NULL)
+    {
+      make_row(stage);
+      /* Next iteration, look for the next history in this thread */
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_events_stages_history::rnd_pos(const void *pos)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_stages *stage;
+
+  DBUG_ASSERT(events_stages_history_per_thread != 0);
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
+  pfs_thread= &thread_array[m_pos.m_index_1];
+
+  if (! pfs_thread->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  DBUG_ASSERT(m_pos.m_index_2 < events_stages_history_per_thread);
+
+  if ( ! pfs_thread->m_stages_history_full &&
+      (m_pos.m_index_2 >= pfs_thread->m_stages_history_index))
+    return HA_ERR_RECORD_DELETED;
+
+  stage= &pfs_thread->m_stages_history[m_pos.m_index_2];
+
+  if (stage->m_class == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(stage);
+  return 0;
+}
+
+int table_events_stages_history::delete_all_rows(void)
+{
+  reset_events_stages_history();
+  return 0;
+}
+
+PFS_engine_table* table_events_stages_history_long::create(void)
+{
+  return new table_events_stages_history_long();
+}
+
+table_events_stages_history_long::table_events_stages_history_long()
+  : table_events_stages_common(&m_share, &m_pos),
+  m_pos(0), m_next_pos(0)
+{}
+
+void table_events_stages_history_long::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_events_stages_history_long::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(stage_timer);
+  return 0;
+}
+
+int table_events_stages_history_long::rnd_next(void)
+{
+  PFS_events_stages *stage;
+  uint limit;
+
+  if (events_stages_history_long_size == 0)
+    return HA_ERR_END_OF_FILE;
+
+  if (events_stages_history_long_full)
+    limit= events_stages_history_long_size;
+  else
+    limit= events_stages_history_long_index % events_stages_history_long_size;
+
+  for (m_pos.set_at(&m_next_pos); m_pos.m_index < limit; m_pos.next())
+  {
+    stage= &events_stages_history_long_array[m_pos.m_index];
+
+    if (stage->m_class != NULL)
+    {
+      make_row(stage);
+      /* Next iteration, look for the next entry */
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_events_stages_history_long::rnd_pos(const void *pos)
+{
+  PFS_events_stages *stage;
+  uint limit;
+
+  if (events_stages_history_long_size == 0)
+    return HA_ERR_RECORD_DELETED;
+
+  set_position(pos);
+
+  if (events_stages_history_long_full)
+    limit= events_stages_history_long_size;
+  else
+    limit= events_stages_history_long_index % events_stages_history_long_size;
+
+  if (m_pos.m_index > limit)
+    return HA_ERR_RECORD_DELETED;
+
+  stage= &events_stages_history_long_array[m_pos.m_index];
+
+  if (stage->m_class == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(stage);
+  return 0;
+}
+
+int table_events_stages_history_long::delete_all_rows(void)
+{
+  reset_events_stages_history_long();
+  return 0;
+}
+
diff --git a/storage/perfschema/table_events_stages.h b/storage/perfschema/table_events_stages.h
new file mode 100644
index 00000000000..6bc712c15a5
--- /dev/null
+++ b/storage/perfschema/table_events_stages.h
@@ -0,0 +1,212 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_EVENTS_STAGES_H
+#define TABLE_EVENTS_STAGES_H
+
+/**
+  @file storage/perfschema/table_events_stages.h
+  Table EVENTS_STAGES_xxx (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_events_stages.h"
+
+struct PFS_thread;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of table_events_stages_common. */
+struct row_events_stages
+{
+  /** Column THREAD_ID. */
+  ulong m_thread_internal_id;
+  /** Column EVENT_ID. */
+  ulonglong m_event_id;
+  /** Column END_EVENT_ID. */
+  ulonglong m_end_event_id;
+  /** Column NESTING_EVENT_ID. */
+  ulonglong m_nesting_event_id;
+  /** Column NESTING_EVENT_TYPE. */
+  enum_event_type m_nesting_event_type;
+  /** Column EVENT_NAME. */
+  const char *m_name;
+  /** Length in bytes of @c m_name. */
+  uint m_name_length;
+  /** Column TIMER_START. */
+  ulonglong m_timer_start;
+  /** Column TIMER_END. */
+  ulonglong m_timer_end;
+  /** Column TIMER_WAIT. */
+  ulonglong m_timer_wait;
+  /** Column SOURCE. */
+  char m_source[COL_SOURCE_SIZE];
+  /** Length in bytes of @c m_source. */
+  uint m_source_length;
+};
+
+/** Position of a cursor on PERFORMANCE_SCHEMA.EVENTS_STAGES_HISTORY. */
+struct pos_events_stages_history : public PFS_double_index
+{
+  pos_events_stages_history()
+    : PFS_double_index(0, 0)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 0;
+  }
+
+  inline void next_thread(void)
+  {
+    m_index_1++;
+    m_index_2= 0;
+  }
+};
+
+/**
+  Adapter, for table sharing the structure of
+  PERFORMANCE_SCHEMA.EVENTS_STAGES_CURRENT.
+*/
+class table_events_stages_common : public PFS_engine_table
+{
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_events_stages_common(const PFS_engine_table_share *share, void *pos);
+
+  ~table_events_stages_common()
+  {}
+
+  void make_row(PFS_events_stages *stage);
+
+  /** Current row. */
+  row_events_stages m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STAGES_CURRENT. */
+class table_events_stages_current : public table_events_stages_common
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  table_events_stages_current();
+
+public:
+  ~table_events_stages_current()
+  {}
+
+private:
+  friend class table_events_stages_history;
+  friend class table_events_stages_history_long;
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /**
+    Fields definition.
+    Also used by table_events_stages_history
+    and table_events_stages_history_long.
+  */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STAGES_HISTORY. */
+class table_events_stages_history : public table_events_stages_common
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  table_events_stages_history();
+
+public:
+  ~table_events_stages_history()
+  {}
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+
+  /** Current position. */
+  pos_events_stages_history m_pos;
+  /** Next position. */
+  pos_events_stages_history m_next_pos;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STAGES_HISTORY_LONG. */
+class table_events_stages_history_long : public table_events_stages_common
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  table_events_stages_history_long();
+
+public:
+  ~table_events_stages_history_long()
+  {}
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_events_statements.cc b/storage/perfschema/table_events_statements.cc
new file mode 100644
index 00000000000..d453b14470f
--- /dev/null
+++ b/storage/perfschema/table_events_statements.cc
@@ -0,0 +1,886 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_events_statements.cc
+  Table EVENTS_STATEMENTS_xxx (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "table_events_statements.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_events_statements.h"
+#include "pfs_timer.h"
+#include "sp_head.h" /* TYPE_ENUM_FUNCTION, ... */
+#include "table_helper.h"
+#include "my_md5.h"
+
+THR_LOCK table_events_statements_current::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("THREAD_ID") },
+    { C_STRING_WITH_LEN("int(11)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_ID") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("END_EVENT_ID") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SOURCE") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("TIMER_START") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("TIMER_END") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("LOCK_TIME") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SQL_TEXT") },
+    { C_STRING_WITH_LEN("longtext") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("DIGEST") },
+    { C_STRING_WITH_LEN("varchar(32)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("DIGEST_TEXT") },
+    { C_STRING_WITH_LEN("longtext") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("CURRENT_SCHEMA") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_TYPE") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_SCHEMA") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_NAME") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_INSTANCE_BEGIN") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MYSQL_ERRNO") },
+    { C_STRING_WITH_LEN("int(11)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("RETURNED_SQLSTATE") },
+    { C_STRING_WITH_LEN("varchar(5)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MESSAGE_TEXT") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("ERRORS") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("WARNINGS") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("ROWS_AFFECTED") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("ROWS_SENT") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("ROWS_EXAMINED") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("CREATED_TMP_DISK_TABLES") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("CREATED_TMP_TABLES") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SELECT_FULL_JOIN") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SELECT_FULL_RANGE_JOIN") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SELECT_RANGE") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SELECT_RANGE_CHECK") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SELECT_SCAN") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SORT_MERGE_PASSES") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SORT_RANGE") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SORT_ROWS") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SORT_SCAN") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("NO_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("NO_GOOD_INDEX_USED") },
+    { C_STRING_WITH_LEN("bigint") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("NESTING_EVENT_ID") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("NESTING_EVENT_TYPE") },
+    { C_STRING_WITH_LEN("enum(\'STATEMENT\',\'STAGE\',\'WAIT\'") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_events_statements_current::m_field_def=
+{40 , field_types };
+
+PFS_engine_table_share
+table_events_statements_current::m_share=
+{
+  { C_STRING_WITH_LEN("events_statements_current") },
+  &pfs_truncatable_acl,
+  &table_events_statements_current::create,
+  NULL, /* write_row */
+  &table_events_statements_current::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+THR_LOCK table_events_statements_history::m_table_lock;
+
+PFS_engine_table_share
+table_events_statements_history::m_share=
+{
+  { C_STRING_WITH_LEN("events_statements_history") },
+  &pfs_truncatable_acl,
+  &table_events_statements_history::create,
+  NULL, /* write_row */
+  &table_events_statements_history::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_events_statements_history), /* ref length */
+  &m_table_lock,
+  &table_events_statements_current::m_field_def,
+  false /* checked */
+};
+
+THR_LOCK table_events_statements_history_long::m_table_lock;
+
+PFS_engine_table_share
+table_events_statements_history_long::m_share=
+{
+  { C_STRING_WITH_LEN("events_statements_history_long") },
+  &pfs_truncatable_acl,
+  &table_events_statements_history_long::create,
+  NULL, /* write_row */
+  &table_events_statements_history_long::delete_all_rows,
+  NULL, /* get_row_count */
+  10000, /* records */
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock,
+  &table_events_statements_current::m_field_def,
+  false /* checked */
+};
+
+table_events_statements_common::table_events_statements_common
+(const PFS_engine_table_share *share, void *pos)
+  : PFS_engine_table(share, pos),
+  m_row_exists(false)
+{}
+
+/**
+  Build a row.
+  @param statement                      the statement the cursor is reading
+*/
+void table_events_statements_common::make_row(PFS_events_statements *statement)
+{
+  const char *base;
+  const char *safe_source_file;
+
+  m_row_exists= false;
+
+  PFS_statement_class *unsafe= (PFS_statement_class*) statement->m_class;
+  PFS_statement_class *klass= sanitize_statement_class(unsafe);
+  if (unlikely(klass == NULL))
+    return;
+
+  m_row.m_thread_internal_id= statement->m_thread_internal_id;
+  m_row.m_event_id= statement->m_event_id;
+  m_row.m_end_event_id= statement->m_end_event_id;
+  m_row.m_nesting_event_id= statement->m_nesting_event_id;
+  m_row.m_nesting_event_type= statement->m_nesting_event_type;
+
+  m_normalizer->to_pico(statement->m_timer_start, statement->m_timer_end,
+                      & m_row.m_timer_start, & m_row.m_timer_end, & m_row.m_timer_wait);
+  m_row.m_lock_time= statement->m_lock_time * MICROSEC_TO_PICOSEC;
+
+  m_row.m_name= klass->m_name;
+  m_row.m_name_length= klass->m_name_length;
+
+  m_row.m_sqltext_length= statement->m_sqltext_length;
+  if (m_row.m_sqltext_length > 0)
+    memcpy(m_row.m_sqltext, statement->m_sqltext, m_row.m_sqltext_length);
+
+  m_row.m_current_schema_name_length= statement->m_current_schema_name_length;
+  if (m_row.m_current_schema_name_length > 0)
+    memcpy(m_row.m_current_schema_name, statement->m_current_schema_name, m_row.m_current_schema_name_length);
+
+  safe_source_file= statement->m_source_file;
+  if (unlikely(safe_source_file == NULL))
+    return;
+
+  base= base_name(safe_source_file);
+  m_row.m_source_length= my_snprintf(m_row.m_source, sizeof(m_row.m_source),
+                                     "%s:%d", base, statement->m_source_line);
+  if (m_row.m_source_length > sizeof(m_row.m_source))
+    m_row.m_source_length= sizeof(m_row.m_source);
+
+  memcpy(m_row.m_message_text, statement->m_message_text, sizeof(m_row.m_message_text));
+  m_row.m_sql_errno= statement->m_sql_errno;
+  memcpy(m_row.m_sqlstate, statement->m_sqlstate, SQLSTATE_LENGTH);
+  m_row.m_error_count= statement->m_error_count;
+  m_row.m_warning_count= statement->m_warning_count;
+  m_row.m_rows_affected= statement->m_rows_affected;
+
+  m_row.m_rows_sent= statement->m_rows_sent;
+  m_row.m_rows_examined= statement->m_rows_examined;
+  m_row.m_created_tmp_disk_tables= statement->m_created_tmp_disk_tables;
+  m_row.m_created_tmp_tables= statement->m_created_tmp_tables;
+  m_row.m_select_full_join= statement->m_select_full_join;
+  m_row.m_select_full_range_join= statement->m_select_full_range_join;
+  m_row.m_select_range= statement->m_select_range;
+  m_row.m_select_range_check= statement->m_select_range_check;
+  m_row.m_select_scan= statement->m_select_scan;
+  m_row.m_sort_merge_passes= statement->m_sort_merge_passes;
+  m_row.m_sort_range= statement->m_sort_range;
+  m_row.m_sort_rows= statement->m_sort_rows;
+  m_row.m_sort_scan= statement->m_sort_scan;
+  m_row.m_no_index_used= statement->m_no_index_used;
+  m_row.m_no_good_index_used= statement->m_no_good_index_used;
+  /* 
+    Filling up statement digest information.
+  */
+  PSI_digest_storage *digest= & statement->m_digest_storage;
+  if (digest->m_byte_count > 0)
+  {
+    PFS_digest_hash md5;
+    compute_md5_hash((char *) md5.m_md5,
+                     (char *) digest->m_token_array,
+                     digest->m_byte_count);
+
+    /* Generate the DIGEST string from the MD5 digest  */
+    MD5_HASH_TO_STRING(md5.m_md5,
+                       m_row.m_digest.m_digest);
+    m_row.m_digest.m_digest_length= MD5_HASH_TO_STRING_LENGTH;
+
+    /* Generate the DIGEST_TEXT string from the token array */
+    get_digest_text(m_row.m_digest.m_digest_text, digest);
+    m_row.m_digest.m_digest_text_length= strlen(m_row.m_digest.m_digest_text);
+  }
+  else
+  {
+    m_row.m_digest.m_digest_length= 0;
+    m_row.m_digest.m_digest_text_length= 0;
+  }
+
+  m_row_exists= true;
+  return;
+}
+
+int table_events_statements_common::read_row_values(TABLE *table,
+                                                    unsigned char *buf,
+                                                    Field **fields,
+                                                    bool read_all)
+{
+  Field *f;
+  uint len;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 3);
+  buf[0]= 0;
+  buf[1]= 0;
+  buf[2]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* THREAD_ID */
+        set_field_ulong(f, m_row.m_thread_internal_id);
+        break;
+      case 1: /* EVENT_ID */
+        set_field_ulonglong(f, m_row.m_event_id);
+        break;
+      case 2: /* END_EVENT_ID */
+        if (m_row.m_end_event_id > 0)
+          set_field_ulonglong(f, m_row.m_end_event_id - 1);
+        else
+          f->set_null();
+        break;
+      case 3: /* EVENT_NAME */
+        set_field_varchar_utf8(f, m_row.m_name, m_row.m_name_length);
+        break;
+      case 4: /* SOURCE */
+        set_field_varchar_utf8(f, m_row.m_source, m_row.m_source_length);
+        break;
+      case 5: /* TIMER_START */
+        if (m_row.m_timer_start != 0)
+          set_field_ulonglong(f, m_row.m_timer_start);
+        else
+          f->set_null();
+        break;
+      case 6: /* TIMER_END */
+        if (m_row.m_timer_end != 0)
+          set_field_ulonglong(f, m_row.m_timer_end);
+        else
+          f->set_null();
+        break;
+      case 7: /* TIMER_WAIT */
+        if (m_row.m_timer_wait != 0)
+          set_field_ulonglong(f, m_row.m_timer_wait);
+        else
+          f->set_null();
+        break;
+      case 8: /* LOCK_TIME */
+        if (m_row.m_lock_time != 0)
+          set_field_ulonglong(f, m_row.m_lock_time);
+        else
+          f->set_null();
+        break;
+      case 9: /* SQL_TEXT */
+        if (m_row.m_sqltext_length)
+          set_field_longtext_utf8(f, m_row.m_sqltext, m_row.m_sqltext_length);
+        else
+          f->set_null();
+        break;
+      case 10: /* DIGEST */
+        if (m_row.m_digest.m_digest_length > 0)
+          set_field_varchar_utf8(f, m_row.m_digest.m_digest,
+                                 m_row.m_digest.m_digest_length);
+        else
+          f->set_null();
+        break;
+      case 11: /* DIGEST_TEXT */
+        if (m_row.m_digest.m_digest_text_length > 0)
+           set_field_longtext_utf8(f, m_row.m_digest.m_digest_text,
+                                   m_row.m_digest.m_digest_text_length);
+        else
+          f->set_null();
+        break;
+      case 12: /* CURRENT_SCHEMA */
+        if (m_row.m_current_schema_name_length)
+          set_field_varchar_utf8(f, m_row.m_current_schema_name, m_row.m_current_schema_name_length);
+        else
+          f->set_null();
+        break;
+      case 13: /* OBJECT_TYPE */
+        f->set_null();
+        break;
+      case 14: /* OBJECT_SCHEMA */
+        f->set_null();
+        break;
+      case 15: /* OBJECT_NAME */
+        f->set_null();
+        break;
+      case 16: /* OBJECT_INSTANCE_BEGIN */
+        f->set_null();
+        break;
+      case 17: /* MYSQL_ERRNO */
+        set_field_ulong(f, m_row.m_sql_errno);
+        break;
+      case 18: /* RETURNED_SQLSTATE */
+        if (m_row.m_sqlstate[0] != 0)
+          set_field_varchar_utf8(f, m_row.m_sqlstate, SQLSTATE_LENGTH);
+        else
+          f->set_null();
+        break;
+      case 19: /* MESSAGE_TEXT */
+        len= strlen(m_row.m_message_text);
+        if (len)
+          set_field_varchar_utf8(f, m_row.m_message_text, len);
+        else
+          f->set_null();
+        break;
+      case 20: /* ERRORS */
+        set_field_ulonglong(f, m_row.m_error_count);
+        break;
+      case 21: /* WARNINGS */
+        set_field_ulonglong(f, m_row.m_warning_count);
+        break;
+      case 22: /* ROWS_AFFECTED */
+        set_field_ulonglong(f, m_row.m_rows_affected);
+        break;
+      case 23: /* ROWS_SENT */
+        set_field_ulonglong(f, m_row.m_rows_sent);
+        break;
+      case 24: /* ROWS_EXAMINED */
+        set_field_ulonglong(f, m_row.m_rows_examined);
+        break;
+      case 25: /* CREATED_TMP_DISK_TABLES */
+        set_field_ulonglong(f, m_row.m_created_tmp_disk_tables);
+        break;
+      case 26: /* CREATED_TMP_TABLES */
+        set_field_ulonglong(f, m_row.m_created_tmp_tables);
+        break;
+      case 27: /* SELECT_FULL_JOIN */
+        set_field_ulonglong(f, m_row.m_select_full_join);
+        break;
+      case 28: /* SELECT_FULL_RANGE_JOIN */
+        set_field_ulonglong(f, m_row.m_select_full_range_join);
+        break;
+      case 29: /* SELECT_RANGE */
+        set_field_ulonglong(f, m_row.m_select_range);
+        break;
+      case 30: /* SELECT_RANGE_CHECK */
+        set_field_ulonglong(f, m_row.m_select_range_check);
+        break;
+      case 31: /* SELECT_SCAN */
+        set_field_ulonglong(f, m_row.m_select_scan);
+        break;
+      case 32: /* SORT_MERGE_PASSES */
+        set_field_ulonglong(f, m_row.m_sort_merge_passes);
+        break;
+      case 33: /* SORT_RANGE */
+        set_field_ulonglong(f, m_row.m_sort_range);
+        break;
+      case 34: /* SORT_ROWS */
+        set_field_ulonglong(f, m_row.m_sort_rows);
+        break;
+      case 35: /* SORT_SCAN */
+        set_field_ulonglong(f, m_row.m_sort_scan);
+        break;
+      case 36: /* NO_INDEX_USED */
+        set_field_ulonglong(f, m_row.m_no_index_used);
+        break;
+      case 37: /* NO_GOOD_INDEX_USED */
+        set_field_ulonglong(f, m_row.m_no_good_index_used);
+        break;
+      case 38: /* NESTING_EVENT_ID */
+        if (m_row.m_nesting_event_id != 0)
+          set_field_ulonglong(f, m_row.m_nesting_event_id);
+        else
+          f->set_null();
+        break;
+      case 39: /* NESTING_EVENT_TYPE */
+        if (m_row.m_nesting_event_id != 0)
+          set_field_enum(f, m_row.m_nesting_event_type);
+        else
+          f->set_null();
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+  return 0;
+}
+
+PFS_engine_table* table_events_statements_current::create(void)
+{
+  return new table_events_statements_current();
+}
+
+table_events_statements_current::table_events_statements_current()
+  : table_events_statements_common(&m_share, &m_pos),
+  m_pos(), m_next_pos()
+{}
+
+void table_events_statements_current::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_events_statements_current::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(statement_timer);
+  return 0;
+}
+
+int table_events_statements_current::rnd_next(void)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_statements *statement;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index_1 < thread_max;
+       m_pos.next_thread())
+  {
+    pfs_thread= &thread_array[m_pos.m_index_1];
+
+    if (! pfs_thread->m_lock.is_populated())
+    {
+      /* This thread does not exist */
+      continue;
+    }
+
+    uint safe_events_statements_count= pfs_thread->m_events_statements_count;
+
+    if (safe_events_statements_count == 0)
+    {
+      /* Display the last top level statement, when completed */
+      if (m_pos.m_index_2 >= 1)
+        continue;
+    }
+    else
+    {
+      /* Display all pending statements, when in progress */
+      if (m_pos.m_index_2 >= safe_events_statements_count)
+        continue;
+    }
+
+    statement= &pfs_thread->m_statement_stack[m_pos.m_index_2];
+
+    make_row(statement);
+    m_next_pos.set_after(&m_pos);
+    return 0;
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_events_statements_current::rnd_pos(const void *pos)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_statements *statement;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
+  pfs_thread= &thread_array[m_pos.m_index_1];
+
+  if (! pfs_thread->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  uint safe_events_statements_count= pfs_thread->m_events_statements_count;
+
+  if (safe_events_statements_count == 0)
+  {
+    /* Display the last top level statement, when completed */
+    if (m_pos.m_index_2 >= 1)
+      return HA_ERR_RECORD_DELETED;
+  }
+  else
+  {
+    /* Display all pending statements, when in progress */
+    if (m_pos.m_index_2 >= safe_events_statements_count)
+      return HA_ERR_RECORD_DELETED;
+  }
+
+  DBUG_ASSERT(m_pos.m_index_2 < statement_stack_max);
+
+  statement= &pfs_thread->m_statement_stack[m_pos.m_index_2];
+
+  if (statement->m_class == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(statement);
+  return 0;
+}
+
+int table_events_statements_current::delete_all_rows(void)
+{
+  reset_events_statements_current();
+  return 0;
+}
+
+PFS_engine_table* table_events_statements_history::create(void)
+{
+  return new table_events_statements_history();
+}
+
+table_events_statements_history::table_events_statements_history()
+  : table_events_statements_common(&m_share, &m_pos),
+  m_pos(), m_next_pos()
+{}
+
+void table_events_statements_history::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_events_statements_history::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(statement_timer);
+  return 0;
+}
+
+int table_events_statements_history::rnd_next(void)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_statements *statement;
+
+  if (events_statements_history_per_thread == 0)
+    return HA_ERR_END_OF_FILE;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index_1 < thread_max;
+       m_pos.next_thread())
+  {
+    pfs_thread= &thread_array[m_pos.m_index_1];
+
+    if (! pfs_thread->m_lock.is_populated())
+    {
+      /* This thread does not exist */
+      continue;
+    }
+
+    if (m_pos.m_index_2 >= events_statements_history_per_thread)
+    {
+      /* This thread does not have more (full) history */
+      continue;
+    }
+
+    if ( ! pfs_thread->m_statements_history_full &&
+        (m_pos.m_index_2 >= pfs_thread->m_statements_history_index))
+    {
+      /* This thread does not have more (not full) history */
+      continue;
+    }
+
+    statement= &pfs_thread->m_statements_history[m_pos.m_index_2];
+
+    if (statement->m_class != NULL)
+    {
+      make_row(statement);
+      /* Next iteration, look for the next history in this thread */
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_events_statements_history::rnd_pos(const void *pos)
+{
+  PFS_thread *pfs_thread;
+  PFS_events_statements *statement;
+
+  DBUG_ASSERT(events_statements_history_per_thread != 0);
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
+  pfs_thread= &thread_array[m_pos.m_index_1];
+
+  if (! pfs_thread->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  DBUG_ASSERT(m_pos.m_index_2 < events_statements_history_per_thread);
+
+  if ( ! pfs_thread->m_statements_history_full &&
+      (m_pos.m_index_2 >= pfs_thread->m_statements_history_index))
+    return HA_ERR_RECORD_DELETED;
+
+  statement= &pfs_thread->m_statements_history[m_pos.m_index_2];
+
+  if (statement->m_class == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(statement);
+  return 0;
+}
+
+int table_events_statements_history::delete_all_rows(void)
+{
+  reset_events_statements_history();
+  return 0;
+}
+
+PFS_engine_table* table_events_statements_history_long::create(void)
+{
+  return new table_events_statements_history_long();
+}
+
+table_events_statements_history_long::table_events_statements_history_long()
+  : table_events_statements_common(&m_share, &m_pos),
+  m_pos(0), m_next_pos(0)
+{}
+
+void table_events_statements_history_long::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_events_statements_history_long::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(statement_timer);
+  return 0;
+}
+
+int table_events_statements_history_long::rnd_next(void)
+{
+  PFS_events_statements *statement;
+  uint limit;
+
+  if (events_statements_history_long_size == 0)
+    return HA_ERR_END_OF_FILE;
+
+  if (events_statements_history_long_full)
+    limit= events_statements_history_long_size;
+  else
+    limit= events_statements_history_long_index % events_statements_history_long_size;
+
+  for (m_pos.set_at(&m_next_pos); m_pos.m_index < limit; m_pos.next())
+  {
+    statement= &events_statements_history_long_array[m_pos.m_index];
+
+    if (statement->m_class != NULL)
+    {
+      make_row(statement);
+      /* Next iteration, look for the next entry */
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_events_statements_history_long::rnd_pos(const void *pos)
+{
+  PFS_events_statements *statement;
+  uint limit;
+
+  if (events_statements_history_long_size == 0)
+    return HA_ERR_RECORD_DELETED;
+
+  set_position(pos);
+
+  if (events_statements_history_long_full)
+    limit= events_statements_history_long_size;
+  else
+    limit= events_statements_history_long_index % events_statements_history_long_size;
+
+  if (m_pos.m_index >= limit)
+    return HA_ERR_RECORD_DELETED;
+
+  statement= &events_statements_history_long_array[m_pos.m_index];
+
+  if (statement->m_class == NULL)
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(statement);
+  return 0;
+}
+
+int table_events_statements_history_long::delete_all_rows(void)
+{
+  reset_events_statements_history_long();
+  return 0;
+}
+
diff --git a/storage/perfschema/table_events_statements.h b/storage/perfschema/table_events_statements.h
new file mode 100644
index 00000000000..acd82de4fcf
--- /dev/null
+++ b/storage/perfschema/table_events_statements.h
@@ -0,0 +1,288 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_EVENTS_STATEMENTS_H
+#define TABLE_EVENTS_STATEMENTS_H
+
+/**
+  @file storage/perfschema/table_events_statements.h
+  Table EVENTS_STATEMENTS_xxx (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_events_statements.h"
+#include "table_helper.h"
+
+struct PFS_thread;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of table_events_statements_common. */
+struct row_events_statements
+{
+  /** Column THREAD_ID. */
+  ulong m_thread_internal_id;
+  /** Column EVENT_ID. */
+  ulonglong m_event_id;
+  /** Column END_EVENT_ID. */
+  ulonglong m_end_event_id;
+  /** Column NESTING_EVENT_ID. */
+  ulonglong m_nesting_event_id;
+  /** Column NESTING_EVENT_TYPE. */
+  enum_event_type m_nesting_event_type;
+  /** Column EVENT_NAME. */
+  const char *m_name;
+  /** Length in bytes of @c m_name. */
+  uint m_name_length;
+  /** Column TIMER_START. */
+  ulonglong m_timer_start;
+  /** Column TIMER_END. */
+  ulonglong m_timer_end;
+  /** Column TIMER_WAIT. */
+  ulonglong m_timer_wait;
+  /** Column LOCK_TIME. */
+  ulonglong m_lock_time;
+  /** Column SOURCE. */
+  char m_source[COL_SOURCE_SIZE];
+  /** Length in bytes of @c m_source. */
+  uint m_source_length;
+  /** Column SQL_TEXT. */
+  char m_sqltext[COL_INFO_SIZE];
+  /** Column DIGEST and DIGEST_TEXT. */
+  PFS_digest_row m_digest;
+  /** Length in bytes of @c m_info. */
+  uint m_sqltext_length;
+  /** Column CURRENT_SCHEMA. */
+  char m_current_schema_name[NAME_LEN];
+  /** Length in bytes of @c m_current_schema_name. */
+  uint m_current_schema_name_length;
+
+  /** Column MESSAGE_TEXT. */
+  char m_message_text[MYSQL_ERRMSG_SIZE+1];
+  /** Column MYSQL_ERRNO. */
+  uint m_sql_errno;
+  /** Column RETURNED_SQLSTATE. */
+  char m_sqlstate[SQLSTATE_LENGTH];
+  /** Column ERRORS. */
+  uint m_error_count;
+  /** Column WARNINGS. */
+  uint m_warning_count;
+  /** Column ROWS_AFFECTED. */
+  ulonglong m_rows_affected;
+  /** Column ROWS_SENT. */
+  ulonglong m_rows_sent;
+  /** Column ROWS_EXAMINED. */
+  ulonglong m_rows_examined;
+  /** Column CREATED_TMP_DISK_TABLES. */
+  ulonglong m_created_tmp_disk_tables;
+  /** Column CREATED_TMP_TABLES. */
+  ulonglong m_created_tmp_tables;
+  /** Column SELECT_FULL_JOIN. */
+  ulonglong m_select_full_join;
+  /** Column SELECT_FULL_RANGE_JOIN. */
+  ulonglong m_select_full_range_join;
+  /** Column SELECT_RANGE. */
+  ulonglong m_select_range;
+  /** Column SELECT_RANGE_CHECK. */
+  ulonglong m_select_range_check;
+  /** Column SELECT_SCAN. */
+  ulonglong m_select_scan;
+  /** Column SORT_MERGE_PASSES. */
+  ulonglong m_sort_merge_passes;
+  /** Column SORT_RANGE. */
+  ulonglong m_sort_range;
+  /** Column SORT_ROWS. */
+  ulonglong m_sort_rows;
+  /** Column SORT_SCAN. */
+  ulonglong m_sort_scan;
+  /** Column NO_INDEX_USED. */
+  ulonglong m_no_index_used;
+  /** Column NO_GOOD_INDEX_USED. */
+  ulonglong m_no_good_index_used;
+};
+
+/** Position of a cursor on PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_CURRENT. */
+struct pos_events_statements_current : public PFS_double_index
+{
+  pos_events_statements_current()
+    : PFS_double_index(0, 0)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 0;
+  }
+
+  inline void next_thread(void)
+  {
+    m_index_1++;
+    m_index_2= 0;
+  }
+};
+
+/** Position of a cursor on PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_HISTORY. */
+struct pos_events_statements_history : public PFS_double_index
+{
+  pos_events_statements_history()
+    : PFS_double_index(0, 0)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 0;
+  }
+
+  inline void next_thread(void)
+  {
+    m_index_1++;
+    m_index_2= 0;
+  }
+};
+
+/**
+  Adapter, for table sharing the structure of
+  PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_CURRENT.
+*/
+class table_events_statements_common : public PFS_engine_table
+{
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_events_statements_common(const PFS_engine_table_share *share, void *pos);
+
+  ~table_events_statements_common()
+  {}
+
+  void make_row(PFS_events_statements *statement);
+
+  /** Current row. */
+  row_events_statements m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_CURRENT. */
+class table_events_statements_current : public table_events_statements_common
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  table_events_statements_current();
+
+public:
+  ~table_events_statements_current()
+  {}
+
+private:
+  friend class table_events_statements_history;
+  friend class table_events_statements_history_long;
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /**
+    Fields definition.
+    Also used by table_events_statements_history
+    and table_events_statements_history_long.
+  */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current position. */
+  pos_events_statements_current m_pos;
+  /** Next position. */
+  pos_events_statements_current m_next_pos;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_HISTORY. */
+class table_events_statements_history : public table_events_statements_common
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  table_events_statements_history();
+
+public:
+  ~table_events_statements_history()
+  {}
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+
+  /** Current position. */
+  pos_events_statements_history m_pos;
+  /** Next position. */
+  pos_events_statements_history m_next_pos;
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_HISTORY_LONG. */
+class table_events_statements_history_long : public table_events_statements_common
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  table_events_statements_history_long();
+
+public:
+  ~table_events_statements_history_long()
+  {}
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_events_waits.cc b/storage/perfschema/table_events_waits.cc
index 8408cc55975..d1c82e81f75 100644
--- a/storage/perfschema/table_events_waits.cc
+++ b/storage/perfschema/table_events_waits.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -21,9 +21,12 @@
 #include "my_global.h"
 #include "my_pthread.h"
 #include "table_events_waits.h"
+#include "pfs_global.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
 #include "pfs_events_waits.h"
+#include "pfs_timer.h"
+#include "m_string.h"
 
 THR_LOCK table_events_waits_current::m_table_lock;
 
@@ -40,6 +43,11 @@ static const TABLE_FIELD_TYPE field_types[]=
     { NULL, 0}
   },
   {
+    { C_STRING_WITH_LEN("END_EVENT_ID") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
     { C_STRING_WITH_LEN("EVENT_NAME") },
     { C_STRING_WITH_LEN("varchar(128)") },
     { NULL, 0}
@@ -80,6 +88,11 @@ static const TABLE_FIELD_TYPE field_types[]=
     { NULL, 0}
   },
   {
+    { C_STRING_WITH_LEN("INDEX_NAME") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
     { C_STRING_WITH_LEN("OBJECT_TYPE") },
     { C_STRING_WITH_LEN("varchar(64)") },
     { NULL, 0}
@@ -95,8 +108,13 @@ static const TABLE_FIELD_TYPE field_types[]=
     { NULL, 0}
   },
   {
+    { C_STRING_WITH_LEN("NESTING_EVENT_TYPE") },
+    { C_STRING_WITH_LEN("enum(\'STATEMENT\',\'STAGE\',\'WAIT\'") },
+    { NULL, 0}
+  },
+  {
     { C_STRING_WITH_LEN("OPERATION") },
-    { C_STRING_WITH_LEN("varchar(16)") },
+    { C_STRING_WITH_LEN("varchar(32)") },
     { NULL, 0}
   },
   {
@@ -113,7 +131,7 @@ static const TABLE_FIELD_TYPE field_types[]=
 
 TABLE_FIELD_DEF
 table_events_waits_current::m_field_def=
-{ 16, field_types };
+{ 19, field_types };
 
 PFS_engine_table_share
 table_events_waits_current::m_share=
@@ -123,6 +141,7 @@ table_events_waits_current::m_share=
   &table_events_waits_current::create,
   NULL, /* write_row */
   &table_events_waits_current::delete_all_rows,
+  NULL, /* get_row_count */
   1000, /* records */
   sizeof(pos_events_waits_current), /* ref length */
   &m_table_lock,
@@ -140,6 +159,7 @@ table_events_waits_history::m_share=
   &table_events_waits_history::create,
   NULL, /* write_row */
   &table_events_waits_history::delete_all_rows,
+  NULL, /* get_row_count */
   1000, /* records */
   sizeof(pos_events_waits_history), /* ref length */
   &m_table_lock,
@@ -157,6 +177,7 @@ table_events_waits_history_long::m_share=
   &table_events_waits_history_long::create,
   NULL, /* write_row */
   &table_events_waits_history_long::delete_all_rows,
+  NULL, /* get_row_count */
   10000, /* records */
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
@@ -176,6 +197,153 @@ void table_events_waits_common::clear_object_columns()
   m_row.m_object_type_length= 0;
   m_row.m_object_schema_length= 0;
   m_row.m_object_name_length= 0;
+  m_row.m_index_name_length= 0;
+  m_row.m_object_instance_addr= 0;
+}
+
+int table_events_waits_common::make_table_object_columns(volatile PFS_events_waits *wait)
+{
+  uint safe_index;
+  PFS_table_share *safe_table_share;
+
+  safe_table_share= sanitize_table_share(wait->m_weak_table_share);
+  if (unlikely(safe_table_share == NULL))
+    return 1;
+
+  if (wait->m_object_type == OBJECT_TYPE_TABLE)
+  {
+    m_row.m_object_type= "TABLE";
+    m_row.m_object_type_length= 5;
+  }
+  else
+  {
+    m_row.m_object_type= "TEMPORARY TABLE";
+    m_row.m_object_type_length= 15;
+  }
+
+  if (safe_table_share->get_version() == wait->m_weak_version)
+  {
+    /* OBJECT SCHEMA */
+    m_row.m_object_schema_length= safe_table_share->m_schema_name_length;
+    if (unlikely((m_row.m_object_schema_length == 0) ||
+                 (m_row.m_object_schema_length > sizeof(m_row.m_object_schema))))
+      return 1;
+    memcpy(m_row.m_object_schema, safe_table_share->m_schema_name, m_row.m_object_schema_length);
+
+    /* OBJECT NAME */
+    m_row.m_object_name_length= safe_table_share->m_table_name_length;
+    if (unlikely((m_row.m_object_name_length == 0) ||
+                 (m_row.m_object_name_length > sizeof(m_row.m_object_name))))
+      return 1;
+    memcpy(m_row.m_object_name, safe_table_share->m_table_name, m_row.m_object_name_length);
+
+    /* INDEX NAME */
+    safe_index= wait->m_index;
+    if (safe_index < MAX_KEY && safe_index < safe_table_share->m_key_count)
+    {
+      PFS_table_key *key= & safe_table_share->m_keys[safe_index];
+      m_row.m_index_name_length= key->m_name_length;
+      if (unlikely((m_row.m_index_name_length == 0) ||
+                   (m_row.m_index_name_length > sizeof(m_row.m_index_name))))
+        return 1;
+      memcpy(m_row.m_index_name, key->m_name, m_row.m_index_name_length);
+    }
+    else
+      m_row.m_index_name_length= 0;
+  }
+  else
+  {
+    m_row.m_object_schema_length= 0;
+    m_row.m_object_name_length= 0;
+    m_row.m_index_name_length= 0;
+  }
+
+  m_row.m_object_instance_addr= (intptr) wait->m_object_instance_addr;
+  return 0;
+}
+
+int table_events_waits_common::make_file_object_columns(volatile PFS_events_waits *wait)
+{
+  PFS_file *safe_file;
+
+  safe_file= sanitize_file(wait->m_weak_file);
+  if (unlikely(safe_file == NULL))
+    return 1;
+
+  m_row.m_object_type= "FILE";
+  m_row.m_object_type_length= 4;
+  m_row.m_object_schema_length= 0;
+  m_row.m_object_instance_addr= (intptr) wait->m_object_instance_addr;
+
+  if (safe_file->get_version() == wait->m_weak_version)
+  {
+    /* OBJECT NAME */
+    m_row.m_object_name_length= safe_file->m_filename_length;
+    if (unlikely((m_row.m_object_name_length == 0) ||
+                 (m_row.m_object_name_length > sizeof(m_row.m_object_name))))
+      return 1;
+    memcpy(m_row.m_object_name, safe_file->m_filename, m_row.m_object_name_length);
+  }
+  else
+  {
+    m_row.m_object_name_length= 0;
+  }
+
+  m_row.m_index_name_length= 0;
+
+  return 0;
+}
+
+int table_events_waits_common::make_socket_object_columns(volatile PFS_events_waits *wait)
+{
+  PFS_socket *safe_socket;
+
+  safe_socket= sanitize_socket(wait->m_weak_socket);
+  if (unlikely(safe_socket == NULL))
+    return 1;
+
+  m_row.m_object_type= "SOCKET";
+  m_row.m_object_type_length= 6;
+  m_row.m_object_schema_length= 0;
+  m_row.m_object_instance_addr= (intptr) wait->m_object_instance_addr;
+
+  if (safe_socket->get_version() == wait->m_weak_version)
+  {
+    /* Convert port number to string, include delimiter in port name length */
+
+    uint port;
+    char port_str[128];
+    char ip_str[INET6_ADDRSTRLEN+1];
+    uint ip_len= 0;
+    port_str[0]= ':';
+
+    /* Get the IP address and port number */
+    ip_len= pfs_get_socket_address(ip_str, sizeof(ip_str), &port,
+                                   &safe_socket->m_sock_addr,
+                                   safe_socket->m_addr_len);
+
+    /* Convert port number to a string (length includes ':') */
+    int port_len= int10_to_str(port, (port_str+1), 10) - port_str + 1;
+
+    /* OBJECT NAME */
+    m_row.m_object_name_length= ip_len + port_len;
+
+    if (unlikely((m_row.m_object_name_length == 0) ||
+                 (m_row.m_object_name_length > sizeof(m_row.m_object_name))))
+      return 1;
+
+    char *name= m_row.m_object_name;
+    memcpy(name, ip_str, ip_len);
+    memcpy(name + ip_len, port_str, port_len);
+  }
+  else
+  {
+    m_row.m_object_name_length= 0;
+  }
+
+  m_row.m_index_name_length= 0;
+
+  return 0;
 }
 
 /**
@@ -194,9 +362,6 @@ void table_events_waits_common::make_row(bool thread_own_wait,
   PFS_instr_class *safe_class;
   const char *base;
   const char *safe_source_file;
-  const char *safe_table_schema_name;
-  const char *safe_table_object_name;
-  const char *safe_file_name;
 
   m_row_exists= false;
   safe_thread= sanitize_thread(pfs_thread);
@@ -224,13 +389,6 @@ void table_events_waits_common::make_row(bool thread_own_wait,
     and sanitizes all the data before returning a row.
   */
 
-  m_row.m_thread_internal_id= safe_thread->m_thread_internal_id;
-  m_row.m_event_id= wait->m_event_id;
-  m_row.m_timer_state= wait->m_timer_state;
-  m_row.m_timer_start= wait->m_timer_start;
-  m_row.m_timer_end= wait->m_timer_end;
-  m_row.m_object_instance_addr= (intptr) wait->m_object_instance_addr;
-
   /*
     PFS_events_waits::m_class needs to be sanitized,
     for race conditions when this code:
@@ -239,6 +397,10 @@ void table_events_waits_common::make_row(bool thread_own_wait,
   */
   switch (wait->m_wait_class)
   {
+  case WAIT_CLASS_IDLE:
+    clear_object_columns();
+    safe_class= sanitize_idle_class(wait->m_class);
+    break;
   case WAIT_CLASS_MUTEX:
     clear_object_columns();
     safe_class= sanitize_mutex_class((PFS_mutex_class*) wait->m_class);
@@ -252,43 +414,38 @@ void table_events_waits_common::make_row(bool thread_own_wait,
     safe_class= sanitize_cond_class((PFS_cond_class*) wait->m_class);
     break;
   case WAIT_CLASS_TABLE:
-    m_row.m_object_type= "TABLE";
-    m_row.m_object_type_length= 5;
-    m_row.m_object_schema_length= wait->m_schema_name_length;
-    safe_table_schema_name= sanitize_table_schema_name(wait->m_schema_name);
-    if (unlikely((m_row.m_object_schema_length == 0) ||
-                 (m_row.m_object_schema_length > sizeof(m_row.m_object_schema)) ||
-                 (safe_table_schema_name == NULL)))
-      return;
-    memcpy(m_row.m_object_schema, safe_table_schema_name, m_row.m_object_schema_length);
-    m_row.m_object_name_length= wait->m_object_name_length;
-    safe_table_object_name= sanitize_table_object_name(wait->m_object_name);
-    if (unlikely((m_row.m_object_name_length == 0) ||
-                 (m_row.m_object_name_length > sizeof(m_row.m_object_name)) ||
-                 (safe_table_object_name == NULL)))
+    if (make_table_object_columns(wait))
       return;
-    memcpy(m_row.m_object_name, safe_table_object_name, m_row.m_object_name_length);
-    safe_class= &global_table_class;
+    safe_class= sanitize_table_class(wait->m_class);
     break;
   case WAIT_CLASS_FILE:
-    m_row.m_object_type= "FILE";
-    m_row.m_object_type_length= 4;
-    m_row.m_object_schema_length= 0;
-    m_row.m_object_name_length= wait->m_object_name_length;
-    safe_file_name= sanitize_file_name(wait->m_object_name);
-    if (unlikely((m_row.m_object_name_length == 0) ||
-                 (m_row.m_object_name_length > sizeof(m_row.m_object_name)) ||
-                 (safe_file_name == NULL)))
+    if (make_file_object_columns(wait))
       return;
-    memcpy(m_row.m_object_name, safe_file_name, m_row.m_object_name_length);
     safe_class= sanitize_file_class((PFS_file_class*) wait->m_class);
     break;
+  case WAIT_CLASS_SOCKET:
+    if (make_socket_object_columns(wait))
+      return;
+    safe_class= sanitize_socket_class((PFS_socket_class*) wait->m_class);
+    break;
   case NO_WAIT_CLASS:
   default:
     return;
   }
+
   if (unlikely(safe_class == NULL))
     return;
+
+  m_row.m_thread_internal_id= safe_thread->m_thread_internal_id;
+  m_row.m_event_id= wait->m_event_id;
+  m_row.m_end_event_id= wait->m_end_event_id;
+  m_row.m_nesting_event_id= wait->m_nesting_event_id;
+  m_row.m_nesting_event_type= wait->m_nesting_event_type;
+
+  get_normalizer(safe_class);
+  m_normalizer->to_pico(wait->m_timer_start, wait->m_timer_end,
+                      & m_row.m_timer_start, & m_row.m_timer_end, & m_row.m_timer_wait);
+
   m_row.m_name= safe_class->m_name;
   m_row.m_name_length= safe_class->m_name_length;
 
@@ -307,7 +464,7 @@ void table_events_waits_common::make_row(bool thread_own_wait,
     m_row.m_source_length= sizeof(m_row.m_source);
   m_row.m_operation= wait->m_operation;
   m_row.m_number_of_bytes= wait->m_number_of_bytes;
-  m_row.m_flags= 0;
+  m_row.m_flags= wait->m_flags;
 
   if (thread_own_wait)
   {
@@ -368,7 +525,46 @@ static const LEX_STRING operation_names_map[]=
   { C_STRING_WITH_LEN("chsize") },
   { C_STRING_WITH_LEN("delete") },
   { C_STRING_WITH_LEN("rename") },
-  { C_STRING_WITH_LEN("sync") }
+  { C_STRING_WITH_LEN("sync") },
+
+  /* Table io operations */
+  { C_STRING_WITH_LEN("fetch") },
+  { C_STRING_WITH_LEN("insert") }, /* write row */
+  { C_STRING_WITH_LEN("update") }, /* update row */
+  { C_STRING_WITH_LEN("delete") }, /* delete row */
+
+  /* Table lock operations */
+  { C_STRING_WITH_LEN("read normal") },
+  { C_STRING_WITH_LEN("read with shared locks") },
+  { C_STRING_WITH_LEN("read high priority") },
+  { C_STRING_WITH_LEN("read no inserts") },
+  { C_STRING_WITH_LEN("write allow write") },
+  { C_STRING_WITH_LEN("write concurrent insert") },
+  { C_STRING_WITH_LEN("write delayed") },
+  { C_STRING_WITH_LEN("write low priority") },
+  { C_STRING_WITH_LEN("write normal") },
+  { C_STRING_WITH_LEN("read external") },
+  { C_STRING_WITH_LEN("write external") },
+
+  /* Socket operations */
+  { C_STRING_WITH_LEN("create") },
+  { C_STRING_WITH_LEN("connect") },
+  { C_STRING_WITH_LEN("bind") },
+  { C_STRING_WITH_LEN("close") },
+  { C_STRING_WITH_LEN("send") },
+  { C_STRING_WITH_LEN("recv") },
+  { C_STRING_WITH_LEN("sendto") },
+  { C_STRING_WITH_LEN("recvfrom") },
+  { C_STRING_WITH_LEN("sendmsg") },
+  { C_STRING_WITH_LEN("recvmsg") },
+  { C_STRING_WITH_LEN("seek") },
+  { C_STRING_WITH_LEN("opt") },
+  { C_STRING_WITH_LEN("stat") },
+  { C_STRING_WITH_LEN("shutdown") },
+  { C_STRING_WITH_LEN("select") },
+
+  /* Idle operations */
+  { C_STRING_WITH_LEN("idle") }
 };
 
 
@@ -411,35 +607,40 @@ int table_events_waits_common::read_row_values(TABLE *table,
       case 1: /* EVENT_ID */
         set_field_ulonglong(f, m_row.m_event_id);
         break;
-      case 2: /* EVENT_NAME */
+      case 2: /* END_EVENT_ID */
+        if (m_row.m_end_event_id > 0)
+          set_field_ulonglong(f, m_row.m_end_event_id - 1);
+        else
+          f->set_null();
+        break;
+      case 3: /* EVENT_NAME */
         set_field_varchar_utf8(f, m_row.m_name, m_row.m_name_length);
         break;
-      case 3: /* SOURCE */
+      case 4: /* SOURCE */
         set_field_varchar_utf8(f, m_row.m_source, m_row.m_source_length);
         break;
-      case 4: /* TIMER_START */
-        if ((m_row.m_timer_state == TIMER_STATE_STARTED) ||
-            (m_row.m_timer_state == TIMER_STATE_TIMED))
+      case 5: /* TIMER_START */
+        if (m_row.m_timer_start != 0)
           set_field_ulonglong(f, m_row.m_timer_start);
         else
           f->set_null();
         break;
-      case 5: /* TIMER_END */
-        if (m_row.m_timer_state == TIMER_STATE_TIMED)
+      case 6: /* TIMER_END */
+        if (m_row.m_timer_end != 0)
           set_field_ulonglong(f, m_row.m_timer_end);
         else
           f->set_null();
         break;
-      case 6: /* TIMER_WAIT */
-        if (m_row.m_timer_state == TIMER_STATE_TIMED)
-          set_field_ulonglong(f, m_row.m_timer_end - m_row.m_timer_start);
+      case 7: /* TIMER_WAIT */
+        if (m_row.m_timer_wait != 0)
+          set_field_ulonglong(f, m_row.m_timer_wait);
         else
           f->set_null();
         break;
-      case 7: /* SPINS */
+      case 8: /* SPINS */
         f->set_null();
         break;
-      case 8: /* OBJECT_SCHEMA */
+      case 9: /* OBJECT_SCHEMA */
         if (m_row.m_object_schema_length > 0)
         {
           set_field_varchar_utf8(f, m_row.m_object_schema,
@@ -448,7 +649,7 @@ int table_events_waits_common::read_row_values(TABLE *table,
         else
           f->set_null();
         break;
-      case 9: /* OBJECT_NAME */
+      case 10: /* OBJECT_NAME */
         if (m_row.m_object_name_length > 0)
         {
           set_field_varchar_utf8(f, m_row.m_object_name,
@@ -457,7 +658,16 @@ int table_events_waits_common::read_row_values(TABLE *table,
         else
           f->set_null();
         break;
-      case 10: /* OBJECT_TYPE */
+      case 11: /* INDEX_NAME */
+        if (m_row.m_index_name_length > 0)
+        {
+          set_field_varchar_utf8(f, m_row.m_index_name,
+                                 m_row.m_index_name_length);
+        }
+        else
+          f->set_null();
+        break;
+      case 12: /* OBJECT_TYPE */
         if (m_row.m_object_type)
         {
           set_field_varchar_utf8(f, m_row.m_object_type,
@@ -466,26 +676,39 @@ int table_events_waits_common::read_row_values(TABLE *table,
         else
           f->set_null();
         break;
-      case 11: /* OBJECT_INSTANCE */
+      case 13: /* OBJECT_INSTANCE */
         set_field_ulonglong(f, m_row.m_object_instance_addr);
         break;
-      case 12: /* NESTING_EVENT_ID */
-        f->set_null();
+      case 14: /* NESTING_EVENT_ID */
+        if (m_row.m_nesting_event_id != 0)
+          set_field_ulonglong(f, m_row.m_nesting_event_id);
+        else
+          f->set_null();
+        break;
+      case 15: /* NESTING_EVENT_TYPE */
+        if (m_row.m_nesting_event_id != 0)
+          set_field_enum(f, m_row.m_nesting_event_type);
+        else
+          f->set_null();
         break;
-      case 13: /* OPERATION */
+      case 16: /* OPERATION */
         operation= &operation_names_map[(int) m_row.m_operation - 1];
         set_field_varchar_utf8(f, operation->str, operation->length);
         break;
-      case 14: /* NUMBER_OF_BYTES */
+      case 17: /* NUMBER_OF_BYTES */
         if ((m_row.m_operation == OPERATION_TYPE_FILEREAD) ||
             (m_row.m_operation == OPERATION_TYPE_FILEWRITE) ||
-            (m_row.m_operation == OPERATION_TYPE_FILECHSIZE))
+            (m_row.m_operation == OPERATION_TYPE_FILECHSIZE) ||
+            (m_row.m_operation == OPERATION_TYPE_SOCKETSEND) ||
+            (m_row.m_operation == OPERATION_TYPE_SOCKETRECV) ||
+            (m_row.m_operation == OPERATION_TYPE_SOCKETSENDTO) ||
+            (m_row.m_operation == OPERATION_TYPE_SOCKETRECVFROM))
           set_field_ulonglong(f, m_row.m_number_of_bytes);
         else
           f->set_null();
         break;
-      case 15: /* FLAGS */
-        set_field_ulong(f, m_row.m_flags);
+      case 18: /* FLAGS */
+        f->set_null();
         break;
       default:
         DBUG_ASSERT(false);
@@ -532,17 +755,31 @@ int table_events_waits_current::rnd_next(void)
       We do not show nested events for now,
       this will be revised with TABLE io
     */
-#define ONLY_SHOW_ONE_WAIT
+// #define ONLY_SHOW_ONE_WAIT
 
 #ifdef ONLY_SHOW_ONE_WAIT
     if (m_pos.m_index_2 >= 1)
       continue;
 #else
-    if (m_pos.m_index_2 >= pfs_thread->m_wait_locker_count)
-      continue;
-#endif
+    /* m_events_waits_stack[0] is a dummy record */
+    PFS_events_waits *top_wait = &pfs_thread->m_events_waits_stack[WAIT_STACK_BOTTOM];
+    wait= &pfs_thread->m_events_waits_stack[m_pos.m_index_2 + WAIT_STACK_BOTTOM];
 
-    wait= &pfs_thread->m_wait_locker_stack[m_pos.m_index_2].m_waits_current;
+    PFS_events_waits *safe_current = pfs_thread->m_events_waits_current;
+
+    if (safe_current == top_wait)
+    {
+      /* Display the last top level wait, when completed */
+      if (m_pos.m_index_2 >= 1)
+        continue;
+    }
+    else
+    {
+      /* Display all pending waits, when in progress */
+      if (wait >= safe_current)
+        continue;
+    }
+#endif
 
     if (wait->m_wait_class == NO_WAIT_CLASS)
     {
@@ -574,14 +811,31 @@ int table_events_waits_current::rnd_pos(const void *pos)
   if (! pfs_thread->m_lock.is_populated())
     return HA_ERR_RECORD_DELETED;
 
-#ifdef ONLY_SHOW_CURRENT_WAITS
-  if (m_pos.m_index_2 >= pfs_thread->m_wait_locker_count)
+#ifdef ONLY_SHOW_ONE_WAIT
+  if (m_pos.m_index_2 >= 1)
     return HA_ERR_RECORD_DELETED;
-#endif
+#else
+  /* m_events_waits_stack[0] is a dummy record */
+  PFS_events_waits *top_wait = &pfs_thread->m_events_waits_stack[WAIT_STACK_BOTTOM];
+  wait= &pfs_thread->m_events_waits_stack[m_pos.m_index_2 + WAIT_STACK_BOTTOM];
 
-  DBUG_ASSERT(m_pos.m_index_2 < LOCKER_STACK_SIZE);
+  PFS_events_waits *safe_current = pfs_thread->m_events_waits_current;
+
+  if (safe_current == top_wait)
+  {
+    /* Display the last top level wait, when completed */
+    if (m_pos.m_index_2 >= 1)
+      return HA_ERR_RECORD_DELETED;
+  }
+  else
+  {
+    /* Display all pending waits, when in progress */
+    if (wait >= safe_current)
+      return HA_ERR_RECORD_DELETED;
+  }
+#endif
 
-  wait= &pfs_thread->m_wait_locker_stack[m_pos.m_index_2].m_waits_current;
+  DBUG_ASSERT(m_pos.m_index_2 < WAIT_STACK_LOGICAL_SIZE);
 
   if (wait->m_wait_class == NO_WAIT_CLASS)
     return HA_ERR_RECORD_DELETED;
@@ -685,12 +939,11 @@ int table_events_waits_history::rnd_pos(const void *pos)
       (m_pos.m_index_2 >= pfs_thread->m_waits_history_index))
     return HA_ERR_RECORD_DELETED;
 
-  if (pfs_thread->m_waits_history[m_pos.m_index_2].m_wait_class
-      == NO_WAIT_CLASS)
-    return HA_ERR_RECORD_DELETED;
-
   wait= &pfs_thread->m_waits_history[m_pos.m_index_2];
 
+  if (wait->m_wait_class == NO_WAIT_CLASS)
+    return HA_ERR_RECORD_DELETED;
+
   make_row(true, pfs_thread, wait);
   return 0;
 }
diff --git a/storage/perfschema/table_events_waits.h b/storage/perfschema/table_events_waits.h
index aa4edb4a368..72065c765ca 100644
--- a/storage/perfschema/table_events_waits.h
+++ b/storage/perfschema/table_events_waits.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -39,18 +39,22 @@ struct row_events_waits
   ulong m_thread_internal_id;
   /** Column EVENT_ID. */
   ulonglong m_event_id;
+  /** Column END_EVENT_ID. */
+  ulonglong m_end_event_id;
+  /** Column NESTING_EVENT_ID. */
+  ulonglong m_nesting_event_id;
+  /** Column NESTING_EVENT_TYPE. */
+  enum_event_type m_nesting_event_type;
   /** Column EVENT_NAME. */
   const char *m_name;
   /** Length in bytes of @c m_name. */
   uint m_name_length;
-  /** Timer state. */
-  enum timer_state m_timer_state;
   /** Column TIMER_START. */
   ulonglong m_timer_start;
-  /** True if TIMER_END is null. */
-  bool m_timer_end_null;
   /** Column TIMER_END. */
   ulonglong m_timer_end;
+  /** Column TIMER_WAIT. */
+  ulonglong m_timer_wait;
   /** Column OBJECT_TYPE. */
   const char *m_object_type;
   /** Length in bytes of @c m_object_type. */
@@ -63,6 +67,10 @@ struct row_events_waits
   char m_object_name[COL_OBJECT_NAME_EXTENDED_SIZE];
   /** Length in bytes of @c m_object_name. */
   uint m_object_name_length;
+  /** Column INDEX_NAME. */
+  char m_index_name[COL_INDEX_NAME_SIZE];
+  /** Length in bytes of @c m_index_name. */
+  uint m_index_name_length;
   /** Column OBJECT_INSTANCE_BEGIN. */
   intptr m_object_instance_addr;
   /** Column SOURCE. */
@@ -135,13 +143,16 @@ protected:
   {}
 
   void clear_object_columns();
+  int make_table_object_columns(volatile PFS_events_waits *wait);
+  int make_file_object_columns(volatile PFS_events_waits *wait);
+  int make_socket_object_columns(volatile PFS_events_waits *wait);
 
   void make_row(bool thread_own_wait, PFS_thread *pfs_thread,
                 volatile PFS_events_waits *wait);
 
   /** Current row. */
   row_events_waits m_row;
-  /** True is the current row exists. */
+  /** True if the current row exists. */
   bool m_row_exists;
 };
 
diff --git a/storage/perfschema/table_events_waits_summary.cc b/storage/perfschema/table_events_waits_summary.cc
index 05f280f8521..2a144a07344 100644
--- a/storage/perfschema/table_events_waits_summary.cc
+++ b/storage/perfschema/table_events_waits_summary.cc
@@ -26,318 +26,6 @@
 #include "table_events_waits_summary.h"
 #include "pfs_global.h"
 
-THR_LOCK table_events_waits_summary_by_thread_by_event_name::m_table_lock;
-
-static const TABLE_FIELD_TYPE ews_by_thread_by_event_name_field_types[]=
-{
-  {
-    { C_STRING_WITH_LEN("THREAD_ID") },
-    { C_STRING_WITH_LEN("int(11)") },
-    { NULL, 0}
-  },
-  {
-    { C_STRING_WITH_LEN("EVENT_NAME") },
-    { C_STRING_WITH_LEN("varchar(128)") },
-    { NULL, 0}
-  },
-  {
-    { C_STRING_WITH_LEN("COUNT_STAR") },
-    { C_STRING_WITH_LEN("bigint(20)") },
-    { NULL, 0}
-  },
-  {
-    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
-    { C_STRING_WITH_LEN("bigint(20)") },
-    { NULL, 0}
-  },
-  {
-    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
-    { C_STRING_WITH_LEN("bigint(20)") },
-    { NULL, 0}
-  },
-  {
-    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
-    { C_STRING_WITH_LEN("bigint(20)") },
-    { NULL, 0}
-  },
-  {
-    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
-    { C_STRING_WITH_LEN("bigint(20)") },
-    { NULL, 0}
-  }
-};
-
-TABLE_FIELD_DEF
-table_events_waits_summary_by_thread_by_event_name::m_field_def=
-{ 7, ews_by_thread_by_event_name_field_types };
-
-PFS_engine_table_share
-table_events_waits_summary_by_thread_by_event_name::m_share=
-{
-  { C_STRING_WITH_LEN("events_waits_summary_by_thread_by_event_name") },
-  &pfs_truncatable_acl,
-  &table_events_waits_summary_by_thread_by_event_name::create,
-  NULL, /* write_row */
-  &table_events_waits_summary_by_thread_by_event_name::delete_all_rows,
-  1000, /* records */
-  sizeof(pos_events_waits_summary_by_thread_by_event_name),
-  &m_table_lock,
-  &m_field_def,
-  false /* checked */
-};
-
-PFS_engine_table*
-table_events_waits_summary_by_thread_by_event_name::create(void)
-{
-  return new table_events_waits_summary_by_thread_by_event_name();
-}
-
-int
-table_events_waits_summary_by_thread_by_event_name::delete_all_rows(void)
-{
-  reset_per_thread_wait_stat();
-  return 0;
-}
-
-table_events_waits_summary_by_thread_by_event_name
-::table_events_waits_summary_by_thread_by_event_name()
-  : PFS_engine_table(&m_share, &m_pos),
-    m_row_exists(false), m_pos(), m_next_pos()
-{}
-
-void table_events_waits_summary_by_thread_by_event_name::reset_position(void)
-{
-  m_pos.reset();
-  m_next_pos.reset();
-}
-
-int table_events_waits_summary_by_thread_by_event_name::rnd_next(void)
-{
-  PFS_thread *thread;
-  PFS_mutex_class *mutex_class;
-  PFS_rwlock_class *rwlock_class;
-  PFS_cond_class *cond_class;
-  PFS_file_class *file_class;
-
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.has_more_thread();
-       m_pos.next_thread())
-  {
-    thread= &thread_array[m_pos.m_index_1];
-    if (thread->m_lock.is_populated())
-    {
-      for ( ; m_pos.has_more_view(); m_pos.next_view())
-      {
-        switch (m_pos.m_index_2) {
-        case pos_events_waits_summary_by_thread_by_event_name::VIEW_MUTEX:
-          mutex_class= find_mutex_class(m_pos.m_index_3);
-          if (mutex_class)
-          {
-            make_mutex_row(thread, mutex_class);
-            m_next_pos.set_after(&m_pos);
-            return 0;
-          }
-          break;
-        case pos_events_waits_summary_by_thread_by_event_name::VIEW_RWLOCK:
-          rwlock_class= find_rwlock_class(m_pos.m_index_3);
-          if (rwlock_class)
-          {
-            make_rwlock_row(thread, rwlock_class);
-            m_next_pos.set_after(&m_pos);
-            return 0;
-          }
-          break;
-        case pos_events_waits_summary_by_thread_by_event_name::VIEW_COND:
-          cond_class= find_cond_class(m_pos.m_index_3);
-          if (cond_class)
-          {
-            make_cond_row(thread, cond_class);
-            m_next_pos.set_after(&m_pos);
-            return 0;
-          }
-          break;
-        case pos_events_waits_summary_by_thread_by_event_name::VIEW_FILE:
-          file_class= find_file_class(m_pos.m_index_3);
-          if (file_class)
-          {
-            make_file_row(thread, file_class);
-            m_next_pos.set_after(&m_pos);
-            return 0;
-          }
-          break;
-        }
-      }
-    }
-  }
-
-  return HA_ERR_END_OF_FILE;
-}
-
-int
-table_events_waits_summary_by_thread_by_event_name::rnd_pos(const void *pos)
-{
-  PFS_thread *thread;
-  PFS_mutex_class *mutex_class;
-  PFS_rwlock_class *rwlock_class;
-  PFS_cond_class *cond_class;
-  PFS_file_class *file_class;
-
-  set_position(pos);
-  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
-
-  thread= &thread_array[m_pos.m_index_1];
-  if (! thread->m_lock.is_populated())
-    return HA_ERR_RECORD_DELETED;
-
-  switch (m_pos.m_index_2) {
-  case pos_events_waits_summary_by_thread_by_event_name::VIEW_MUTEX:
-    mutex_class= find_mutex_class(m_pos.m_index_3);
-    if (mutex_class)
-    {
-      make_mutex_row(thread, mutex_class);
-      return 0;
-    }
-    break;
-  case pos_events_waits_summary_by_thread_by_event_name::VIEW_RWLOCK:
-    rwlock_class= find_rwlock_class(m_pos.m_index_3);
-    if (rwlock_class)
-    {
-      make_rwlock_row(thread, rwlock_class);
-      return 0;
-    }
-    break;
-  case pos_events_waits_summary_by_thread_by_event_name::VIEW_COND:
-    cond_class= find_cond_class(m_pos.m_index_3);
-    if (cond_class)
-    {
-      make_cond_row(thread, cond_class);
-      return 0;
-    }
-    break;
-  case pos_events_waits_summary_by_thread_by_event_name::VIEW_FILE:
-    file_class= find_file_class(m_pos.m_index_3);
-    if (file_class)
-    {
-      make_file_row(thread, file_class);
-      return 0;
-    }
-    break;
-  }
-  return HA_ERR_RECORD_DELETED;
-}
-
-void table_events_waits_summary_by_thread_by_event_name
-::make_instr_row(PFS_thread *thread, PFS_instr_class *klass,
-                 PFS_single_stat_chain *stat)
-{
-  pfs_lock lock;
-
-  m_row_exists= false;
-
-  /* Protect this reader against a thread termination */
-  thread->m_lock.begin_optimistic_lock(&lock);
-
-  m_row.m_thread_internal_id= thread->m_thread_internal_id;
-  m_row.m_name= klass->m_name;
-  m_row.m_name_length= klass->m_name_length;
-
-  m_row.m_count= stat->m_count;
-  m_row.m_sum= stat->m_sum;
-  m_row.m_min= stat->m_min;
-  m_row.m_max= stat->m_max;
-
-  if (m_row.m_count)
-    m_row.m_avg= m_row.m_sum / m_row.m_count;
-  else
-  {
-    m_row.m_min= 0;
-    m_row.m_avg= 0;
-  }
-
-  if (thread->m_lock.end_optimistic_lock(&lock))
-    m_row_exists= true;
-}
-
-void table_events_waits_summary_by_thread_by_event_name
-::make_mutex_row(PFS_thread *thread, PFS_mutex_class *klass)
-{
-  PFS_single_stat_chain *stat;
-  stat= find_per_thread_mutex_class_wait_stat(thread, klass);
-  make_instr_row(thread, klass, stat);
-}
-
-void table_events_waits_summary_by_thread_by_event_name
-::make_rwlock_row(PFS_thread *thread, PFS_rwlock_class *klass)
-{
-  PFS_single_stat_chain *stat;
-  stat= find_per_thread_rwlock_class_wait_stat(thread, klass);
-  make_instr_row(thread, klass, stat);
-}
-
-void table_events_waits_summary_by_thread_by_event_name
-::make_cond_row(PFS_thread *thread, PFS_cond_class *klass)
-{
-  PFS_single_stat_chain *stat;
-  stat= find_per_thread_cond_class_wait_stat(thread, klass);
-  make_instr_row(thread, klass, stat);
-}
-
-void table_events_waits_summary_by_thread_by_event_name
-::make_file_row(PFS_thread *thread, PFS_file_class *klass)
-{
-  PFS_single_stat_chain *stat;
-  stat= find_per_thread_file_class_wait_stat(thread, klass);
-  make_instr_row(thread, klass, stat);
-}
-
-int table_events_waits_summary_by_thread_by_event_name
-::read_row_values(TABLE *table, unsigned char *, Field **fields,
-                  bool read_all)
-{
-  Field *f;
-
-  if (unlikely(! m_row_exists))
-    return HA_ERR_RECORD_DELETED;
-
-  /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 0);
-
-  for (; (f= *fields) ; fields++)
-  {
-    if (read_all || bitmap_is_set(table->read_set, f->field_index))
-    {
-      switch(f->field_index)
-      {
-      case 0: /* THREAD_ID */
-        set_field_ulong(f, m_row.m_thread_internal_id);
-        break;
-      case 1: /* NAME */
-        set_field_varchar_utf8(f, m_row.m_name, m_row.m_name_length);
-        break;
-      case 2: /* COUNT */
-        set_field_ulonglong(f, m_row.m_count);
-        break;
-      case 3: /* SUM */
-        set_field_ulonglong(f, m_row.m_sum);
-        break;
-      case 4: /* MIN */
-        set_field_ulonglong(f, m_row.m_min);
-        break;
-      case 5: /* AVG */
-        set_field_ulonglong(f, m_row.m_avg);
-        break;
-      case 6: /* MAX */
-        set_field_ulonglong(f, m_row.m_max);
-        break;
-      default:
-        DBUG_ASSERT(false);
-      }
-    }
-  }
-
-  return 0;
-}
-
 THR_LOCK table_events_waits_summary_by_instance::m_table_lock;
 
 static const TABLE_FIELD_TYPE ews_by_instance_field_types[]=
@@ -391,6 +79,7 @@ table_events_waits_summary_by_instance::m_share=
   &table_events_waits_summary_by_instance::create,
   NULL, /* write_row */
   &table_events_waits_summary_by_instance::delete_all_rows,
+  NULL, /* get_row_count */
   1000, /* records */
   sizeof(pos_all_instr),
   &m_table_lock,
@@ -416,10 +105,10 @@ table_events_waits_summary_by_instance
 
 void table_events_waits_summary_by_instance
 ::make_instr_row(PFS_instr *pfs, PFS_instr_class *klass,
-                 const void *object_instance_begin)
+                 const void *object_instance_begin,
+                 PFS_single_stat *pfs_stat)
 {
   pfs_lock lock;
-
   m_row_exists= false;
 
   /*
@@ -432,18 +121,8 @@ void table_events_waits_summary_by_instance
   m_row.m_name_length= klass->m_name_length;
   m_row.m_object_instance_addr= (intptr) object_instance_begin;
 
-  m_row.m_count= pfs->m_wait_stat.m_count;
-  m_row.m_sum= pfs->m_wait_stat.m_sum;
-  m_row.m_min= pfs->m_wait_stat.m_min;
-  m_row.m_max= pfs->m_wait_stat.m_max;
-
-  if (m_row.m_count)
-    m_row.m_avg= m_row.m_sum / m_row.m_count;
-  else
-  {
-    m_row.m_min= 0;
-    m_row.m_avg= 0;
-  }
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, pfs_stat);
 
   if (pfs->m_lock.end_optimistic_lock(&lock))
     m_row_exists= true;
@@ -460,7 +139,7 @@ void table_events_waits_summary_by_instance::make_mutex_row(PFS_mutex *pfs)
   if (unlikely(safe_class == NULL))
     return;
 
-  make_instr_row(pfs, safe_class, pfs->m_identity);
+  make_instr_row(pfs, safe_class, pfs->m_identity, &pfs->m_wait_stat);
 }
 
 /**
@@ -474,7 +153,7 @@ void table_events_waits_summary_by_instance::make_rwlock_row(PFS_rwlock *pfs)
   if (unlikely(safe_class == NULL))
     return;
 
-  make_instr_row(pfs, safe_class, pfs->m_identity);
+  make_instr_row(pfs, safe_class, pfs->m_identity, &pfs->m_wait_stat);
 }
 
 /**
@@ -488,7 +167,7 @@ void table_events_waits_summary_by_instance::make_cond_row(PFS_cond *pfs)
   if (unlikely(safe_class == NULL))
     return;
 
-  make_instr_row(pfs, safe_class, pfs->m_identity);
+  make_instr_row(pfs, safe_class, pfs->m_identity, &pfs->m_wait_stat);
 }
 
 /**
@@ -506,7 +185,32 @@ void table_events_waits_summary_by_instance::make_file_row(PFS_file *pfs)
     Files don't have a in memory structure associated to it,
     so we use the address of the PFS_file buffer as object_instance_begin
   */
-  make_instr_row(pfs, safe_class, pfs);
+  make_instr_row(pfs, safe_class, pfs, &pfs->m_wait_stat);
+}
+
+/**
+  Build a row, for socket statistics in a thread.
+  @param pfs              the socket this cursor is reading
+*/
+void table_events_waits_summary_by_instance::make_socket_row(PFS_socket *pfs)
+{
+  PFS_socket_class *safe_class;
+  safe_class= sanitize_socket_class(pfs->m_class);
+  if (unlikely(safe_class == NULL))
+    return;
+
+  /*
+     Consolidate wait times and byte counts for individual operations. This is
+     done by the consumer in order to reduce overhead on the socket instrument.
+  */
+  PFS_byte_stat pfs_stat;
+  pfs->m_socket_stat.m_io_stat.sum(&pfs_stat);
+
+  /*
+    Sockets don't have an associated in-memory structure, so use the address of
+    the PFS_socket buffer as object_instance_begin.
+  */
+  make_instr_row(pfs, safe_class, pfs, &pfs_stat);
 }
 
 int table_events_waits_summary_by_instance
@@ -534,19 +238,19 @@ int table_events_waits_summary_by_instance
         set_field_ulonglong(f, m_row.m_object_instance_addr);
         break;
       case 2: /* COUNT */
-        set_field_ulonglong(f, m_row.m_count);
+        set_field_ulonglong(f, m_row.m_stat.m_count);
         break;
       case 3: /* SUM */
-        set_field_ulonglong(f, m_row.m_sum);
+        set_field_ulonglong(f, m_row.m_stat.m_sum);
         break;
       case 4: /* MIN */
-        set_field_ulonglong(f, m_row.m_min);
+        set_field_ulonglong(f, m_row.m_stat.m_min);
         break;
       case 5: /* AVG */
-        set_field_ulonglong(f, m_row.m_avg);
+        set_field_ulonglong(f, m_row.m_stat.m_avg);
         break;
       case 6: /* MAX */
-        set_field_ulonglong(f, m_row.m_max);
+        set_field_ulonglong(f, m_row.m_stat.m_max);
         break;
       default:
         DBUG_ASSERT(false);
diff --git a/storage/perfschema/table_events_waits_summary.h b/storage/perfschema/table_events_waits_summary.h
index 628bb75553f..7463ace3eb6 100644
--- a/storage/perfschema/table_events_waits_summary.h
+++ b/storage/perfschema/table_events_waits_summary.h
@@ -26,124 +26,13 @@
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
 #include "table_all_instr.h"
+#include "table_helper.h"
 
 /**
   @addtogroup Performance_schema_tables
   @{
 */
 
-/**
-  A row of table
-  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME.
-*/
-struct row_events_waits_summary_by_thread_by_event_name
-{
-  /** Column THREAD_ID. */
-  ulong m_thread_internal_id;
-  /** Column EVENT_NAME. */
-  const char *m_name;
-  /** Length in bytes of @c m_name. */
-  uint m_name_length;
-  /** Column COUNT_STAR. */
-  ulonglong m_count;
-  /** Column SUM_TIMER_WAIT. */
-  ulonglong m_sum;
-  /** Column MIN_TIMER_WAIT. */
-  ulonglong m_min;
-  /** Column AVG_TIMER_WAIT. */
-  ulonglong m_avg;
-  /** Column MAX_TIMER_WAIT. */
-  ulonglong m_max;
-};
-
-/**
-  Position of a cursor on
-  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME.
-*/
-struct pos_events_waits_summary_by_thread_by_event_name
-: public PFS_triple_index, public PFS_instrument_view_constants
-{
-  pos_events_waits_summary_by_thread_by_event_name()
-    : PFS_triple_index(0, VIEW_MUTEX, 1)
-  {}
-
-  inline void reset(void)
-  {
-    m_index_1= 0;
-    m_index_2= VIEW_MUTEX;
-    m_index_3= 1;
-  }
-
-  inline bool has_more_thread(void)
-  { return (m_index_1 < thread_max); }
-
-  inline bool has_more_view(void)
-  { return (m_index_2 <= VIEW_FILE); }
-
-  inline void next_thread(void)
-  {
-    m_index_1++;
-    m_index_2= VIEW_MUTEX;
-    m_index_3= 1;
-  }
-
-  inline void next_view(void)
-  {
-    m_index_2++;
-    m_index_3= 1;
-  }
-};
-
-/** Table PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME. */
-class table_events_waits_summary_by_thread_by_event_name
-  : public PFS_engine_table
-{
-public:
-  /** Table share */
-  static PFS_engine_table_share m_share;
-  static PFS_engine_table* create();
-  static int delete_all_rows();
-
-  virtual int rnd_next();
-  virtual int rnd_pos(const void *pos);
-  virtual void reset_position(void);
-
-protected:
-  virtual int read_row_values(TABLE *table,
-                              unsigned char *buf,
-                              Field **fields,
-                              bool read_all);
-
-  table_events_waits_summary_by_thread_by_event_name();
-
-public:
-  ~table_events_waits_summary_by_thread_by_event_name()
-  {}
-
-protected:
-  void make_instr_row(PFS_thread *thread, PFS_instr_class *klass,
-                      PFS_single_stat_chain *stat);
-  void make_mutex_row(PFS_thread *thread, PFS_mutex_class *klass);
-  void make_rwlock_row(PFS_thread *thread, PFS_rwlock_class *klass);
-  void make_cond_row(PFS_thread *thread, PFS_cond_class *klass);
-  void make_file_row(PFS_thread *thread, PFS_file_class *klass);
-
-private:
-  /** Table share lock. */
-  static THR_LOCK m_table_lock;
-  /** Fields definition. */
-  static TABLE_FIELD_DEF m_field_def;
-
-  /** Current row. */
-  row_events_waits_summary_by_thread_by_event_name m_row;
-  /** True is the current row exists. */
-  bool m_row_exists;
-  /** Current position. */
-  pos_events_waits_summary_by_thread_by_event_name m_pos;
-  /** Next position. */
-  pos_events_waits_summary_by_thread_by_event_name m_next_pos;
-};
-
 /** A row of PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_INSTANCE. */
 struct row_events_waits_summary_by_instance
 {
@@ -153,16 +42,8 @@ struct row_events_waits_summary_by_instance
   uint m_name_length;
   /** Column OBJECT_INSTANCE_BEGIN. */
   intptr m_object_instance_addr;
-  /** Column COUNT_STAR. */
-  ulonglong m_count;
-  /** Column SUM_TIMER_WAIT. */
-  ulonglong m_sum;
-  /** Column MIN_TIMER_WAIT. */
-  ulonglong m_min;
-  /** Column AVG_TIMER_WAIT. */
-  ulonglong m_avg;
-  /** Column MAX_TIMER_WAIT. */
-  ulonglong m_max;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stat_row m_stat;
 };
 
 /** Table PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_INSTANCE. */
@@ -176,11 +57,13 @@ public:
 
 protected:
   void make_instr_row(PFS_instr *pfs, PFS_instr_class *klass,
-                      const void *object_instance_begin);
+                      const void *object_instance_begin,
+                      PFS_single_stat *pfs_stat);
   virtual void make_mutex_row(PFS_mutex *pfs);
   virtual void make_rwlock_row(PFS_rwlock *pfs);
   virtual void make_cond_row(PFS_cond *pfs);
   virtual void make_file_row(PFS_file *pfs);
+  virtual void make_socket_row(PFS_socket *pfs);
 
   virtual int read_row_values(TABLE *table,
                               unsigned char *buf,
@@ -201,7 +84,7 @@ private:
 
   /** Current row. */
   row_events_waits_summary_by_instance m_row;
-  /** True is the current row exists. */
+  /** True if the current row exists. */
   bool m_row_exists;
 };
 
diff --git a/storage/perfschema/table_ews_by_account_by_event_name.cc b/storage/perfschema/table_ews_by_account_by_event_name.cc
new file mode 100644
index 00000000000..992e7c18f17
--- /dev/null
+++ b/storage/perfschema/table_ews_by_account_by_event_name.cc
@@ -0,0 +1,288 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_ews_by_account_by_event_name.cc
+  Table EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_ews_by_account_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_ews_by_account_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("USER") },
+    { C_STRING_WITH_LEN("char(16)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("HOST") },
+    { C_STRING_WITH_LEN("char(60)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_ews_by_account_by_event_name::m_field_def=
+{ 8, field_types };
+
+PFS_engine_table_share
+table_ews_by_account_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_waits_summary_by_account_by_event_name") },
+  &pfs_truncatable_acl,
+  table_ews_by_account_by_event_name::create,
+  NULL, /* write_row */
+  table_ews_by_account_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_ews_by_account_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_ews_by_account_by_event_name::create(void)
+{
+  return new table_ews_by_account_by_event_name();
+}
+
+int
+table_ews_by_account_by_event_name::delete_all_rows(void)
+{
+  reset_events_waits_by_thread();
+  reset_events_waits_by_account();
+  return 0;
+}
+
+table_ews_by_account_by_event_name::table_ews_by_account_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_ews_by_account_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_ews_by_account_by_event_name::rnd_next(void)
+{
+  PFS_account *account;
+  PFS_instr_class *instr_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_account();
+       m_pos.next_account())
+  {
+    account= &account_array[m_pos.m_index_1];
+    if (account->m_lock.is_populated())
+    {
+      for ( ;
+           m_pos.has_more_view();
+           m_pos.next_view())
+      {
+        switch (m_pos.m_index_2)
+        {
+        case pos_ews_by_account_by_event_name::VIEW_MUTEX:
+          instr_class= find_mutex_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_account_by_event_name::VIEW_RWLOCK:
+          instr_class= find_rwlock_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_account_by_event_name::VIEW_COND:
+          instr_class= find_cond_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_account_by_event_name::VIEW_FILE:
+          instr_class= find_file_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_account_by_event_name::VIEW_TABLE:
+          instr_class= find_table_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_account_by_event_name::VIEW_SOCKET:
+          instr_class= find_socket_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_account_by_event_name::VIEW_IDLE:
+          instr_class= find_idle_class(m_pos.m_index_3);
+          break;
+        default:
+          instr_class= NULL;
+          DBUG_ASSERT(false);
+          break;
+        }
+
+        if (instr_class)
+        {
+          make_row(account, instr_class);
+          m_next_pos.set_after(&m_pos);
+          return 0;
+        }
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_ews_by_account_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_account *account;
+  PFS_instr_class *instr_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < account_max);
+
+  account= &account_array[m_pos.m_index_1];
+  if (! account->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  switch (m_pos.m_index_2)
+  {
+  case pos_ews_by_account_by_event_name::VIEW_MUTEX:
+    instr_class= find_mutex_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_account_by_event_name::VIEW_RWLOCK:
+    instr_class= find_rwlock_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_account_by_event_name::VIEW_COND:
+    instr_class= find_cond_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_account_by_event_name::VIEW_FILE:
+    instr_class= find_file_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_account_by_event_name::VIEW_TABLE:
+    instr_class= find_table_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_account_by_event_name::VIEW_SOCKET:
+    instr_class= find_socket_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_account_by_event_name::VIEW_IDLE:
+    instr_class= find_idle_class(m_pos.m_index_3);
+    break;
+  default:
+    instr_class= NULL;
+    DBUG_ASSERT(false);
+  }
+  if (instr_class)
+  {
+    make_row(account, instr_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_ews_by_account_by_event_name
+::make_row(PFS_account *account, PFS_instr_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  account->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_account.make_row(account))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_wait_visitor visitor(klass);
+  PFS_connection_iterator::visit_account(account, true, & visitor);
+
+  if (! account->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+
+int table_ews_by_account_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+      case 1: /* HOST */
+        m_row.m_account.set_field(f->field_index, f);
+        break;
+      case 2: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 3, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 3, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_ews_by_account_by_event_name.h b/storage/perfschema/table_ews_by_account_by_event_name.h
new file mode 100644
index 00000000000..0a0ca83131a
--- /dev/null
+++ b/storage/perfschema/table_ews_by_account_by_event_name.h
@@ -0,0 +1,136 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_EWS_BY_ACCOUNT_BY_EVENT_NAME_H
+#define TABLE_EWS_BY_ACCOUNT_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_ews_by_account_by_event_name.h
+  Table EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+*/
+struct row_ews_by_account_by_event_name
+{
+  /** Column USER, HOST. */
+  PFS_account_row m_account;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME.
+  Index 1 on user@host (0 based)
+  Index 2 on instrument view
+  Index 3 on instrument class (1 based)
+*/
+struct pos_ews_by_account_by_event_name
+: public PFS_triple_index, public PFS_instrument_view_constants
+{
+  pos_ews_by_account_by_event_name()
+    : PFS_triple_index(0, FIRST_VIEW, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= VIEW_MUTEX;
+    m_index_3= 1;
+  }
+
+  inline bool has_more_account(void)
+  { return (m_index_1 < account_max); }
+
+  inline void next_account(void)
+  {
+    m_index_1++;
+    m_index_2= FIRST_VIEW;
+    m_index_3= 1;
+  }
+
+  inline bool has_more_view(void)
+  { return (m_index_2 <= LAST_VIEW); }
+
+  inline void next_view(void)
+  {
+    m_index_2++;
+    m_index_3= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME. */
+class table_ews_by_account_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_ews_by_account_by_event_name();
+
+public:
+  ~table_ews_by_account_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_account *account, PFS_instr_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_ews_by_account_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_ews_by_account_by_event_name m_pos;
+  /** Next position. */
+  pos_ews_by_account_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_ews_by_host_by_event_name.cc b/storage/perfschema/table_ews_by_host_by_event_name.cc
new file mode 100644
index 00000000000..8a62990c8ed
--- /dev/null
+++ b/storage/perfschema/table_ews_by_host_by_event_name.cc
@@ -0,0 +1,285 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_ews_by_host_by_event_name.cc
+  Table EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_ews_by_host_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_account.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_ews_by_host_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("HOST") },
+    { C_STRING_WITH_LEN("char(60)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_ews_by_host_by_event_name::m_field_def=
+{ 7, field_types };
+
+PFS_engine_table_share
+table_ews_by_host_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_waits_summary_by_host_by_event_name") },
+  &pfs_truncatable_acl,
+  table_ews_by_host_by_event_name::create,
+  NULL, /* write_row */
+  table_ews_by_host_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_ews_by_host_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_ews_by_host_by_event_name::create(void)
+{
+  return new table_ews_by_host_by_event_name();
+}
+
+int
+table_ews_by_host_by_event_name::delete_all_rows(void)
+{
+  reset_events_waits_by_thread();
+  reset_events_waits_by_account();
+  reset_events_waits_by_host();
+  return 0;
+}
+
+table_ews_by_host_by_event_name::table_ews_by_host_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_ews_by_host_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_ews_by_host_by_event_name::rnd_next(void)
+{
+  PFS_host *host;
+  PFS_instr_class *instr_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_host();
+       m_pos.next_host())
+  {
+    host= &host_array[m_pos.m_index_1];
+    if (host->m_lock.is_populated())
+    {
+      for ( ;
+           m_pos.has_more_view();
+           m_pos.next_view())
+      {
+        switch (m_pos.m_index_2)
+        {
+        case pos_ews_by_host_by_event_name::VIEW_MUTEX:
+          instr_class= find_mutex_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_host_by_event_name::VIEW_RWLOCK:
+          instr_class= find_rwlock_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_host_by_event_name::VIEW_COND:
+          instr_class= find_cond_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_host_by_event_name::VIEW_FILE:
+          instr_class= find_file_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_host_by_event_name::VIEW_TABLE:
+          instr_class= find_table_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_host_by_event_name::VIEW_SOCKET:
+          instr_class= find_socket_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_host_by_event_name::VIEW_IDLE:
+          instr_class= find_idle_class(m_pos.m_index_3);
+          break;
+        default:
+          instr_class= NULL;
+          DBUG_ASSERT(false);
+          break;
+        }
+
+        if (instr_class)
+        {
+          make_row(host, instr_class);
+          m_next_pos.set_after(&m_pos);
+          return 0;
+        }
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_ews_by_host_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_host *host;
+  PFS_instr_class *instr_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < host_max);
+
+  host= &host_array[m_pos.m_index_1];
+  if (! host->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  switch (m_pos.m_index_2)
+  {
+  case pos_ews_by_host_by_event_name::VIEW_MUTEX:
+    instr_class= find_mutex_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_host_by_event_name::VIEW_RWLOCK:
+    instr_class= find_rwlock_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_host_by_event_name::VIEW_COND:
+    instr_class= find_cond_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_host_by_event_name::VIEW_FILE:
+    instr_class= find_file_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_host_by_event_name::VIEW_TABLE:
+    instr_class= find_table_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_host_by_event_name::VIEW_SOCKET:
+    instr_class= find_socket_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_host_by_event_name::VIEW_IDLE:
+    instr_class= find_idle_class(m_pos.m_index_3);
+    break;
+  default:
+    instr_class= NULL;
+    DBUG_ASSERT(false);
+    break;
+  }
+  if (instr_class)
+  {
+    make_row(host, instr_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_ews_by_host_by_event_name
+::make_row(PFS_host *host, PFS_instr_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  host->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_host.make_row(host))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_wait_visitor visitor(klass);
+  PFS_connection_iterator::visit_host(host, true, true, & visitor);
+
+  if (! host->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, &visitor.m_stat);
+}
+
+int table_ews_by_host_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* HOST */
+        m_row.m_host.set_field(f);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 2, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 2, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_ews_by_host_by_event_name.h b/storage/perfschema/table_ews_by_host_by_event_name.h
new file mode 100644
index 00000000000..28b8d0250c2
--- /dev/null
+++ b/storage/perfschema/table_ews_by_host_by_event_name.h
@@ -0,0 +1,136 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_EWS_BY_HOST_BY_EVENT_NAME_H
+#define TABLE_EWS_BY_HOST_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_ews_by_host_by_event_name.h
+  Table EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_host.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME.
+*/
+struct row_ews_by_host_by_event_name
+{
+  /** Column HOST. */
+  PFS_host_row m_host;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME.
+  Index 1 on host (0 based)
+  Index 2 on instrument view
+  Index 3 on instrument class (1 based)
+*/
+struct pos_ews_by_host_by_event_name
+: public PFS_triple_index, public PFS_instrument_view_constants
+{
+  pos_ews_by_host_by_event_name()
+    : PFS_triple_index(0, FIRST_VIEW, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= FIRST_VIEW;
+    m_index_3= 1;
+  }
+
+  inline bool has_more_host(void)
+  { return (m_index_1 < host_max); }
+
+  inline void next_host(void)
+  {
+    m_index_1++;
+    m_index_2= FIRST_VIEW;
+    m_index_3= 1;
+  }
+
+  inline bool has_more_view(void)
+  { return (m_index_2 <= LAST_VIEW); }
+
+  inline void next_view(void)
+  {
+    m_index_2++;
+    m_index_3= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME. */
+class table_ews_by_host_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_ews_by_host_by_event_name();
+
+public:
+  ~table_ews_by_host_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_host *host, PFS_instr_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_ews_by_host_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_ews_by_host_by_event_name m_pos;
+  /** Next position. */
+  pos_ews_by_host_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_ews_by_thread_by_event_name.cc b/storage/perfschema/table_ews_by_thread_by_event_name.cc
new file mode 100644
index 00000000000..25e3cf395c4
--- /dev/null
+++ b/storage/perfschema/table_ews_by_thread_by_event_name.cc
@@ -0,0 +1,299 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_ews_by_thread_by_event_name.cc
+  Table EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_ews_by_thread_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_ews_by_thread_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("THREAD_ID") },
+    { C_STRING_WITH_LEN("int(11)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_ews_by_thread_by_event_name::m_field_def=
+{ 7, field_types };
+
+PFS_engine_table_share
+table_ews_by_thread_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_waits_summary_by_thread_by_event_name") },
+  &pfs_truncatable_acl,
+  table_ews_by_thread_by_event_name::create,
+  NULL, /* write_row */
+  table_ews_by_thread_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_ews_by_thread_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_ews_by_thread_by_event_name::create(void)
+{
+  return new table_ews_by_thread_by_event_name();
+}
+
+int
+table_ews_by_thread_by_event_name::delete_all_rows(void)
+{
+  reset_events_waits_by_thread();
+  return 0;
+}
+
+table_ews_by_thread_by_event_name::table_ews_by_thread_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_ews_by_thread_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_ews_by_thread_by_event_name::rnd_next(void)
+{
+  PFS_thread *thread;
+  PFS_instr_class *instr_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_thread();
+       m_pos.next_thread())
+  {
+    thread= &thread_array[m_pos.m_index_1];
+
+    /*
+      Important note: the thread scan is the outer loop (index 1),
+      to minimize the number of calls to atomic operations.
+    */
+    if (thread->m_lock.is_populated())
+    {
+      for ( ;
+           m_pos.has_more_view();
+           m_pos.next_view())
+      {
+        switch (m_pos.m_index_2)
+        {
+        case pos_ews_by_thread_by_event_name::VIEW_MUTEX:
+          instr_class= find_mutex_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_thread_by_event_name::VIEW_RWLOCK:
+          instr_class= find_rwlock_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_thread_by_event_name::VIEW_COND:
+          instr_class= find_cond_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_thread_by_event_name::VIEW_FILE:
+          instr_class= find_file_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_thread_by_event_name::VIEW_TABLE:
+          instr_class= find_table_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_thread_by_event_name::VIEW_SOCKET:
+          instr_class= find_socket_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_thread_by_event_name::VIEW_IDLE:
+          instr_class= find_idle_class(m_pos.m_index_3);
+          break;
+        default:
+          DBUG_ASSERT(false);
+          instr_class= NULL;
+          break;
+        }
+
+        if (instr_class != NULL)
+        {
+          make_row(thread, instr_class);
+          m_next_pos.set_after(&m_pos);
+          return 0;
+        }
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_ews_by_thread_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_thread *thread;
+  PFS_instr_class *instr_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < thread_max);
+
+  thread= &thread_array[m_pos.m_index_1];
+  if (! thread->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  switch (m_pos.m_index_2)
+  {
+  case pos_ews_by_thread_by_event_name::VIEW_MUTEX:
+    instr_class= find_mutex_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_thread_by_event_name::VIEW_RWLOCK:
+    instr_class= find_rwlock_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_thread_by_event_name::VIEW_COND:
+    instr_class= find_cond_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_thread_by_event_name::VIEW_FILE:
+    instr_class= find_file_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_thread_by_event_name::VIEW_TABLE:
+    instr_class= find_table_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_thread_by_event_name::VIEW_SOCKET:
+    instr_class= find_socket_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_thread_by_event_name::VIEW_IDLE:
+    instr_class= find_idle_class(m_pos.m_index_3);
+    break;
+  default:
+    DBUG_ASSERT(false);
+    instr_class= NULL;
+  }
+
+  if (instr_class)
+  {
+    make_row(thread, instr_class);
+    return 0;
+  }
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_ews_by_thread_by_event_name
+::make_row(PFS_thread *thread, PFS_instr_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  /* Protect this reader against a thread termination */
+  thread->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_thread_internal_id= thread->m_thread_internal_id;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_wait_visitor visitor(klass);
+  PFS_connection_iterator::visit_thread(thread, &visitor);
+
+  /*
+     If the aggregation for this class is deferred, then we must pull the
+     current wait stats from the instances associated with this thread.
+  */  
+  if (klass->is_deferred())
+  {
+    /* Visit instances owned by this thread. Do not visit the class. */
+    PFS_instance_wait_visitor inst_visitor;
+    PFS_instance_iterator::visit_instances(klass, &inst_visitor,
+                                           thread, false);
+    /* Combine the deferred stats and global stats */
+    visitor.m_stat.aggregate(&inst_visitor.m_stat);
+  }
+
+  if (! thread->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+
+int table_ews_by_thread_by_event_name
+::read_row_values(TABLE *table, unsigned char *, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 0);
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* THREAD_ID */
+        set_field_ulong(f, m_row.m_thread_internal_id);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 2, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 2, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_ews_by_thread_by_event_name.h b/storage/perfschema/table_ews_by_thread_by_event_name.h
new file mode 100644
index 00000000000..b0710bb8a57
--- /dev/null
+++ b/storage/perfschema/table_ews_by_thread_by_event_name.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_EWS_BY_THREAD_BY_EVENT_NAME_H
+#define TABLE_EWS_BY_THREAD_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_ews_by_thread_by_event_name.h
+  Table EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME.
+*/
+struct row_ews_by_thread_by_event_name
+{
+  /** Column THREAD_ID. */
+  ulong m_thread_internal_id;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME.
+  Index 1 on thread (0 based)
+  Index 2 on instrument view
+  Index 3 on instrument class (1 based)
+*/
+struct pos_ews_by_thread_by_event_name
+: public PFS_triple_index, public PFS_instrument_view_constants
+{
+  pos_ews_by_thread_by_event_name()
+    : PFS_triple_index(0, FIRST_VIEW, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= FIRST_VIEW;
+    m_index_3= 1;
+  }
+
+  inline bool has_more_thread(void)
+  { return (m_index_1 < thread_max); }
+
+  inline void next_thread(void)
+  {
+    m_index_1++;
+    m_index_2= FIRST_VIEW;
+    m_index_3= 1;
+  }
+
+  inline bool has_more_view(void)
+  { return (m_index_2 <= LAST_VIEW); }
+
+  inline void next_view(void)
+  {
+    m_index_2++;
+    m_index_3= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME. */
+class table_ews_by_thread_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_ews_by_thread_by_event_name();
+
+public:
+  ~table_ews_by_thread_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_thread *thread, PFS_instr_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_ews_by_thread_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_ews_by_thread_by_event_name m_pos;
+  /** Next position. */
+  pos_ews_by_thread_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_ews_by_user_by_event_name.cc b/storage/perfschema/table_ews_by_user_by_event_name.cc
new file mode 100644
index 00000000000..8a169019e87
--- /dev/null
+++ b/storage/perfschema/table_ews_by_user_by_event_name.cc
@@ -0,0 +1,285 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_ews_by_user_by_event_name.cc
+  Table EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_ews_by_user_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_account.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_ews_by_user_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("USER") },
+    { C_STRING_WITH_LEN("char(16)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_ews_by_user_by_event_name::m_field_def=
+{ 7, field_types };
+
+PFS_engine_table_share
+table_ews_by_user_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("events_waits_summary_by_user_by_event_name") },
+  &pfs_truncatable_acl,
+  table_ews_by_user_by_event_name::create,
+  NULL, /* write_row */
+  table_ews_by_user_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_ews_by_user_by_event_name),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_ews_by_user_by_event_name::create(void)
+{
+  return new table_ews_by_user_by_event_name();
+}
+
+int
+table_ews_by_user_by_event_name::delete_all_rows(void)
+{
+  reset_events_waits_by_thread();
+  reset_events_waits_by_account();
+  reset_events_waits_by_user();
+  return 0;
+}
+
+table_ews_by_user_by_event_name::table_ews_by_user_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_ews_by_user_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_ews_by_user_by_event_name::rnd_next(void)
+{
+  PFS_user *user;
+  PFS_instr_class *instr_class;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_user();
+       m_pos.next_user())
+  {
+    user= &user_array[m_pos.m_index_1];
+    if (user->m_lock.is_populated())
+    {
+      for ( ;
+           m_pos.has_more_view();
+           m_pos.next_view())
+      {
+        switch (m_pos.m_index_2)
+        {
+        case pos_ews_by_user_by_event_name::VIEW_MUTEX:
+          instr_class= find_mutex_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_user_by_event_name::VIEW_RWLOCK:
+          instr_class= find_rwlock_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_user_by_event_name::VIEW_COND:
+          instr_class= find_cond_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_user_by_event_name::VIEW_FILE:
+          instr_class= find_file_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_user_by_event_name::VIEW_TABLE:
+          instr_class= find_table_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_user_by_event_name::VIEW_SOCKET:
+          instr_class= find_socket_class(m_pos.m_index_3);
+          break;
+        case pos_ews_by_user_by_event_name::VIEW_IDLE:
+          instr_class= find_idle_class(m_pos.m_index_3);
+          break;
+        default:
+          instr_class= NULL;
+          DBUG_ASSERT(false);
+          break;
+        }
+
+        if (instr_class)
+        {
+          make_row(user, instr_class);
+          m_next_pos.set_after(&m_pos);
+          return 0;
+        }
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_ews_by_user_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_user *user;
+  PFS_instr_class *instr_class;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index_1 < user_max);
+
+  user= &user_array[m_pos.m_index_1];
+  if (! user->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  switch (m_pos.m_index_2)
+  {
+  case pos_ews_by_user_by_event_name::VIEW_MUTEX:
+    instr_class= find_mutex_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_user_by_event_name::VIEW_RWLOCK:
+    instr_class= find_rwlock_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_user_by_event_name::VIEW_COND:
+    instr_class= find_cond_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_user_by_event_name::VIEW_FILE:
+    instr_class= find_file_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_user_by_event_name::VIEW_TABLE:
+    instr_class= find_table_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_user_by_event_name::VIEW_SOCKET:
+    instr_class= find_socket_class(m_pos.m_index_3);
+    break;
+  case pos_ews_by_user_by_event_name::VIEW_IDLE:
+    instr_class= find_idle_class(m_pos.m_index_3);
+    break;
+  default:
+    instr_class= NULL;
+    DBUG_ASSERT(false);
+    break;
+  }
+  if (instr_class)
+  {
+    make_row(user, instr_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_ews_by_user_by_event_name
+::make_row(PFS_user *user, PFS_instr_class *klass)
+{
+  pfs_lock lock;
+  m_row_exists= false;
+
+  user->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_user.make_row(user))
+    return;
+
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_wait_visitor visitor(klass);
+  PFS_connection_iterator::visit_user(user, true, true, & visitor);
+
+  if (! user->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, &visitor.m_stat);
+}
+
+int table_ews_by_user_by_event_name
+::read_row_values(TABLE *table, unsigned char *buf, Field **fields,
+                  bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+        m_row.m_user.set_field(f);
+        break;
+      case 1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      default: /* 2, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 2, f);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_ews_by_user_by_event_name.h b/storage/perfschema/table_ews_by_user_by_event_name.h
new file mode 100644
index 00000000000..88b78a1ed7a
--- /dev/null
+++ b/storage/perfschema/table_ews_by_user_by_event_name.h
@@ -0,0 +1,136 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_EWS_BY_USER_BY_EVENT_NAME_H
+#define TABLE_EWS_BY_USER_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_ews_by_user_by_event_name.h
+  Table EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_user.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME.
+*/
+struct row_ews_by_user_by_event_name
+{
+  /** Column USER. */
+  PFS_user_row m_user;
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME.
+  Index 1 on user (0 based)
+  Index 2 on instrument view
+  Index 3 on instrument class (1 based)
+*/
+struct pos_ews_by_user_by_event_name
+: public PFS_triple_index, public PFS_instrument_view_constants
+{
+  pos_ews_by_user_by_event_name()
+    : PFS_triple_index(0, FIRST_VIEW, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= FIRST_VIEW;
+    m_index_3= 1;
+  }
+
+  inline bool has_more_user(void)
+  { return (m_index_1 < user_max); }
+
+  inline void next_user(void)
+  {
+    m_index_1++;
+    m_index_2= FIRST_VIEW;
+    m_index_3= 1;
+  }
+
+  inline bool has_more_view(void)
+  { return (m_index_2 <= LAST_VIEW); }
+
+  inline void next_view(void)
+  {
+    m_index_2++;
+    m_index_3= 1;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME. */
+class table_ews_by_user_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_ews_by_user_by_event_name();
+
+public:
+  ~table_ews_by_user_by_event_name()
+  {}
+
+protected:
+  void make_row(PFS_user *user, PFS_instr_class *klass);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_ews_by_user_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_ews_by_user_by_event_name m_pos;
+  /** Next position. */
+  pos_ews_by_user_by_event_name m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_ews_global_by_event_name.cc b/storage/perfschema/table_ews_global_by_event_name.cc
index 3177584231d..c71a1ed479e 100644
--- a/storage/perfschema/table_ews_global_by_event_name.cc
+++ b/storage/perfschema/table_ews_global_by_event_name.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -10,8 +10,8 @@
   GNU General Public License for more details.
 
   You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software Foundation,
-  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
 
 /**
   @file storage/perfschema/table_ews_global_by_event_name.cc
@@ -25,6 +25,9 @@
 #include "pfs_column_values.h"
 #include "table_ews_global_by_event_name.h"
 #include "pfs_global.h"
+#include "pfs_instr.h"
+#include "pfs_timer.h"
+#include "pfs_visitor.h"
 
 THR_LOCK table_ews_global_by_event_name::m_table_lock;
 
@@ -71,50 +74,320 @@ table_ews_global_by_event_name::m_share=
 {
   { C_STRING_WITH_LEN("events_waits_summary_global_by_event_name") },
   &pfs_truncatable_acl,
-  &table_ews_global_by_event_name::create,
+  table_ews_global_by_event_name::create,
   NULL, /* write_row */
-  &table_ews_global_by_event_name::delete_all_rows,
+  table_ews_global_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
   1000, /* records */
-  sizeof(pos_all_instr_class),
+  sizeof(pos_ews_global_by_event_name),
   &m_table_lock,
   &m_field_def,
   false /* checked */
 };
 
-PFS_engine_table* table_ews_global_by_event_name::create(void)
+PFS_engine_table*
+table_ews_global_by_event_name::create(void)
 {
   return new table_ews_global_by_event_name();
 }
 
-int table_ews_global_by_event_name::delete_all_rows(void)
+int
+table_ews_global_by_event_name::delete_all_rows(void)
 {
-  reset_instrument_class_waits();
+  reset_events_waits_by_instance();
+  reset_table_waits_by_table_handle();
+  reset_table_waits_by_table();
+  reset_events_waits_global();
   return 0;
 }
 
-table_ews_global_by_event_name
-::table_ews_global_by_event_name()
-  : table_all_instr_class(&m_share)
+table_ews_global_by_event_name::table_ews_global_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
 {}
 
-void table_ews_global_by_event_name
-::make_instr_row(PFS_instr_class *klass)
+void table_ews_global_by_event_name::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_ews_global_by_event_name::rnd_next(void)
+{
+  PFS_mutex_class *mutex_class;
+  PFS_rwlock_class *rwlock_class;
+  PFS_cond_class *cond_class;
+  PFS_file_class *file_class;
+  PFS_socket_class *socket_class;
+  PFS_instr_class *instr_class;
+
+  if (global_instr_class_waits_array == NULL)
+    return HA_ERR_END_OF_FILE;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_view();
+       m_pos.next_view())
+  {
+    switch (m_pos.m_index_1)
+    {
+    case pos_ews_global_by_event_name::VIEW_MUTEX:
+      mutex_class= find_mutex_class(m_pos.m_index_2);
+      if (mutex_class)
+      {
+        make_mutex_row(mutex_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+      break;
+    case pos_ews_global_by_event_name::VIEW_RWLOCK:
+      rwlock_class= find_rwlock_class(m_pos.m_index_2);
+      if (rwlock_class)
+      {
+        make_rwlock_row(rwlock_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+      break;
+    case pos_ews_global_by_event_name::VIEW_COND:
+      cond_class= find_cond_class(m_pos.m_index_2);
+      if (cond_class)
+      {
+        make_cond_row(cond_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+      break;
+    case pos_ews_global_by_event_name::VIEW_FILE:
+      file_class= find_file_class(m_pos.m_index_2);
+      if (file_class)
+      {
+        make_file_row(file_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+      break;
+    case pos_ews_global_by_event_name::VIEW_TABLE:
+      if (m_pos.m_index_2 == 1)
+      {
+        make_table_io_row(&global_table_io_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+      if (m_pos.m_index_2 == 2)
+      {
+        make_table_lock_row(&global_table_lock_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+      break;
+    case pos_ews_global_by_event_name::VIEW_SOCKET:
+      socket_class= find_socket_class(m_pos.m_index_2);
+      if (socket_class)
+      {
+        make_socket_row(socket_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+      break;
+    case pos_ews_global_by_event_name::VIEW_IDLE:
+      instr_class= find_idle_class(m_pos.m_index_2);
+      if (instr_class)
+      {
+        make_idle_row(instr_class);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+      break;
+    default:
+      break;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_ews_global_by_event_name::rnd_pos(const void *pos)
 {
-  m_row.m_name= klass->m_name;
-  m_row.m_name_length= klass->m_name_length;
+  PFS_mutex_class *mutex_class;
+  PFS_rwlock_class *rwlock_class;
+  PFS_cond_class *cond_class;
+  PFS_file_class *file_class;
+  PFS_socket_class *socket_class;
+  PFS_instr_class *instr_class;
 
-  m_row.m_count= klass->m_wait_stat.m_count;
-  m_row.m_sum= klass->m_wait_stat.m_sum;
-  m_row.m_min= klass->m_wait_stat.m_min;
-  m_row.m_max= klass->m_wait_stat.m_max;
+  set_position(pos);
 
-  if (m_row.m_count)
-    m_row.m_avg= m_row.m_sum / m_row.m_count;
-  else
+  if (global_instr_class_waits_array == NULL)
+    return HA_ERR_END_OF_FILE;
+
+  switch (m_pos.m_index_1)
   {
-    m_row.m_min= 0;
-    m_row.m_avg= 0;
+  case pos_ews_global_by_event_name::VIEW_MUTEX:
+    mutex_class= find_mutex_class(m_pos.m_index_2);
+    if (mutex_class)
+    {
+      make_mutex_row(mutex_class);
+      return 0;
+    }
+    break;
+  case pos_ews_global_by_event_name::VIEW_RWLOCK:
+    rwlock_class= find_rwlock_class(m_pos.m_index_2);
+    if (rwlock_class)
+    {
+      make_rwlock_row(rwlock_class);
+      return 0;
+    }
+    break;
+  case pos_ews_global_by_event_name::VIEW_COND:
+    cond_class= find_cond_class(m_pos.m_index_2);
+    if (cond_class)
+    {
+      make_cond_row(cond_class);
+      return 0;
+    }
+    break;
+  case pos_ews_global_by_event_name::VIEW_FILE:
+    file_class= find_file_class(m_pos.m_index_2);
+    if (file_class)
+    {
+      make_file_row(file_class);
+      return 0;
+    }
+    break;
+  case pos_ews_global_by_event_name::VIEW_TABLE:
+    DBUG_ASSERT(m_pos.m_index_2 >= 1);
+    DBUG_ASSERT(m_pos.m_index_2 <= 2);
+    if (m_pos.m_index_2 == 1)
+      make_table_io_row(&global_table_io_class);
+    else
+      make_table_lock_row(&global_table_lock_class);
+    break;
+  case pos_ews_global_by_event_name::VIEW_SOCKET:
+    socket_class= find_socket_class(m_pos.m_index_2);
+    if (socket_class)
+    {
+      make_socket_row(socket_class);
+      return 0;
+    }
+    break;
+  case pos_ews_global_by_event_name::VIEW_IDLE:
+    instr_class= find_idle_class(m_pos.m_index_2);
+    if (instr_class)
+    {
+      make_idle_row(instr_class);
+      return 0;
+    }
+    break;
   }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_ews_global_by_event_name
+::make_mutex_row(PFS_mutex_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_instance_wait_visitor visitor;
+  PFS_instance_iterator::visit_mutex_instances(klass, & visitor);
+
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+  m_row_exists= true;
+}
+
+void table_ews_global_by_event_name
+::make_rwlock_row(PFS_rwlock_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_instance_wait_visitor visitor;
+  PFS_instance_iterator::visit_rwlock_instances(klass, & visitor);
+
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+  m_row_exists= true;
+}
+
+void table_ews_global_by_event_name
+::make_cond_row(PFS_cond_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_instance_wait_visitor visitor;
+  PFS_instance_iterator::visit_cond_instances(klass, & visitor);
+
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+  m_row_exists= true;
+}
+
+void table_ews_global_by_event_name
+::make_file_row(PFS_file_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_instance_wait_visitor visitor;
+  PFS_instance_iterator::visit_file_instances(klass, & visitor);
+
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+  m_row_exists= true;
+}
+
+void table_ews_global_by_event_name
+::make_table_io_row(PFS_instr_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_table_io_wait_visitor visitor;
+  PFS_object_iterator::visit_all_tables(& visitor);
+
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+  m_row_exists= true;
+}
+
+void table_ews_global_by_event_name
+::make_table_lock_row(PFS_instr_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_table_lock_wait_visitor visitor;
+  PFS_object_iterator::visit_all_tables(& visitor);
+  
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+  m_row_exists= true;
+}
+
+void table_ews_global_by_event_name
+::make_socket_row(PFS_socket_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_instance_wait_visitor visitor;
+  PFS_instance_iterator::visit_socket_instances(klass, &visitor);
+
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, &visitor.m_stat);
+  m_row_exists= true;
+}
+
+void table_ews_global_by_event_name
+::make_idle_row(PFS_instr_class *klass)
+{
+  m_row.m_event_name.make_row(klass);
+
+  PFS_connection_wait_visitor visitor(klass);
+  PFS_connection_iterator::visit_global(false, /* hosts */
+                                        false, /* users */
+                                        false, /* accts */
+                                        true,  /* threads */ &visitor);
+  get_normalizer(klass);
+  m_row.m_stat.set(m_normalizer, &visitor.m_stat);
+  m_row_exists= true;
 }
 
 int table_ews_global_by_event_name
@@ -123,40 +396,24 @@ int table_ews_global_by_event_name
 {
   Field *f;
 
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
   /* Set the null bits */
   DBUG_ASSERT(table->s->null_bytes == 0);
 
-  /*
-    The row always exist,
-    the instrument classes are static and never disappear.
-  */
-
   for (; (f= *fields) ; fields++)
   {
     if (read_all || bitmap_is_set(table->read_set, f->field_index))
     {
       switch(f->field_index)
       {
-      case 0: /* NAME */
-        set_field_varchar_utf8(f, m_row.m_name, m_row.m_name_length);
-        break;
-      case 1: /* COUNT */
-        set_field_ulonglong(f, m_row.m_count);
-        break;
-      case 2: /* SUM */
-        set_field_ulonglong(f, m_row.m_sum);
-        break;
-      case 3: /* MIN */
-        set_field_ulonglong(f, m_row.m_min);
-        break;
-      case 4: /* AVG */
-        set_field_ulonglong(f, m_row.m_avg);
+      case 0: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
         break;
-      case 5: /* MAX */
-        set_field_ulonglong(f, m_row.m_max);
+      default: /* 1, ... COUNT/SUM/MIN/AVG/MAX */
+        m_row.m_stat.set_field(f->field_index - 1, f);
         break;
-      default:
-        DBUG_ASSERT(false);
       }
     }
   }
diff --git a/storage/perfschema/table_ews_global_by_event_name.h b/storage/perfschema/table_ews_global_by_event_name.h
index 7e66448e96c..a118e536b6a 100644
--- a/storage/perfschema/table_ews_global_by_event_name.h
+++ b/storage/perfschema/table_ews_global_by_event_name.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -10,11 +10,11 @@
   GNU General Public License for more details.
 
   You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software Foundation,
-  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
 
-#ifndef TABLE_EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME_H
-#define TABLE_EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME_H
+#ifndef TABLE_EWS_GLOBAL_BY_EVENT_NAME_H
+#define TABLE_EWS_GLOBAL_BY_EVENT_NAME_H
 
 /**
   @file storage/perfschema/table_ews_global_by_event_name.h
@@ -25,34 +25,56 @@
 #include "pfs_engine_table.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
-#include "table_all_instr.h"
+#include "table_helper.h"
 
 /**
   @addtogroup Performance_schema_tables
   @{
 */
 
-/** A row of PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME. */
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME.
+*/
 struct row_ews_global_by_event_name
 {
   /** Column EVENT_NAME. */
-  const char *m_name;
-  /** Length in bytes of @c m_name. */
-  uint m_name_length;
-  /** Column COUNT_STAR. */
-  ulonglong m_count;
-  /** Column SUM_TIMER_WAIT. */
-  ulonglong m_sum;
-  /** Column MIN_TIMER_WAIT. */
-  ulonglong m_min;
-  /** Column AVG_TIMER_WAIT. */
-  ulonglong m_avg;
-  /** Column MAX_TIMER_WAIT. */
-  ulonglong m_max;
+  PFS_event_name_row m_event_name;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME.
+  Index 1 on instrument view
+  Index 2 on instrument class (1 based)
+*/
+struct pos_ews_global_by_event_name
+: public PFS_double_index, public PFS_instrument_view_constants
+{
+  pos_ews_global_by_event_name()
+    : PFS_double_index(FIRST_VIEW, 1)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= FIRST_VIEW;
+    m_index_2= 1;
+  }
+
+  inline bool has_more_view(void)
+  { return (m_index_1 <= LAST_VIEW); }
+
+  inline void next_view(void)
+  {
+    m_index_1++;
+    m_index_2= 1;
+  }
 };
 
 /** Table PERFORMANCE_SCHEMA.EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME. */
-class table_ews_global_by_event_name : public table_all_instr_class
+class table_ews_global_by_event_name : public PFS_engine_table
 {
 public:
   /** Table share */
@@ -60,9 +82,11 @@ public:
   static PFS_engine_table* create();
   static int delete_all_rows();
 
-protected:
-  virtual void make_instr_row(PFS_instr_class *klass);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
 
+protected:
   virtual int read_row_values(TABLE *table,
                               unsigned char *buf,
                               Field **fields,
@@ -74,6 +98,16 @@ public:
   ~table_ews_global_by_event_name()
   {}
 
+protected:
+  void make_mutex_row(PFS_mutex_class *klass);
+  void make_rwlock_row(PFS_rwlock_class *klass);
+  void make_cond_row(PFS_cond_class *klass);
+  void make_file_row(PFS_file_class *klass);
+  void make_table_io_row(PFS_instr_class *klass);
+  void make_table_lock_row(PFS_instr_class *klass);
+  void make_socket_row(PFS_socket_class *klass);
+  void make_idle_row(PFS_instr_class *klass);
+
 private:
   /** Table share lock. */
   static THR_LOCK m_table_lock;
@@ -82,6 +116,12 @@ private:
 
   /** Current row. */
   row_ews_global_by_event_name m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_ews_global_by_event_name m_pos;
+  /** Next position. */
+  pos_ews_global_by_event_name m_next_pos;
 };
 
 /** @} */
diff --git a/storage/perfschema/table_file_instances.cc b/storage/perfschema/table_file_instances.cc
index 9ae732a0e1c..a3d2fc454bb 100644
--- a/storage/perfschema/table_file_instances.cc
+++ b/storage/perfschema/table_file_instances.cc
@@ -59,6 +59,7 @@ table_file_instances::m_share=
   &table_file_instances::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
+  NULL, /* get_row_count */
   1000, /* records */
   sizeof(PFS_simple_index),
   &m_table_lock,
diff --git a/storage/perfschema/table_file_instances.h b/storage/perfschema/table_file_instances.h
index 7365000b21f..f7ec16715f3 100644
--- a/storage/perfschema/table_file_instances.h
+++ b/storage/perfschema/table_file_instances.h
@@ -78,7 +78,7 @@ private:
 
   /** Current row. */
   row_file_instances m_row;
-  /** True is the current row exists. */
+  /** True if the current row exists. */
   bool m_row_exists;
   /** Current position. */
   PFS_simple_index m_pos;
diff --git a/storage/perfschema/table_file_summary_by_event_name.cc b/storage/perfschema/table_file_summary_by_event_name.cc
new file mode 100644
index 00000000000..7e72f4d4158
--- /dev/null
+++ b/storage/perfschema/table_file_summary_by_event_name.cc
@@ -0,0 +1,352 @@
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_file_summary.cc
+  Table FILE_SUMMARY_BY_EVENT_NAME(implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_file_summary_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_file_summary_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Read */
+  {
+    { C_STRING_WITH_LEN("COUNT_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NUMBER_OF_BYTES_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Write */
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NUMBER_OF_BYTES_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Misc */
+  {
+    { C_STRING_WITH_LEN("COUNT_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_file_summary_by_event_name::m_field_def=
+{ 23, field_types };
+
+PFS_engine_table_share
+table_file_summary_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("file_summary_by_event_name") },
+  &pfs_truncatable_acl,
+  &table_file_summary_by_event_name::create,
+  NULL, /* write_row */
+  table_file_summary_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table* table_file_summary_by_event_name::create(void)
+{
+  return new table_file_summary_by_event_name();
+}
+
+int table_file_summary_by_event_name::delete_all_rows(void)
+{
+  reset_file_instance_io();
+  reset_file_class_io();
+  return 0;
+}
+
+table_file_summary_by_event_name::table_file_summary_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+  m_pos(1), m_next_pos(1)
+{}
+
+void table_file_summary_by_event_name::reset_position(void)
+{
+  m_pos.m_index= 1;
+  m_next_pos.m_index= 1;
+}
+
+int table_file_summary_by_event_name::rnd_next(void)
+{
+  PFS_file_class *file_class;
+
+  m_pos.set_at(&m_next_pos);
+
+  file_class= find_file_class(m_pos.m_index);
+  if (file_class)
+  {
+    make_row(file_class);
+    m_next_pos.set_after(&m_pos);
+    return 0;
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_file_summary_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_file_class *file_class;
+
+  set_position(pos);
+
+  file_class= find_file_class(m_pos.m_index);
+  if (file_class)
+  {
+    make_row(file_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+/**
+  Build a row.
+  @param klass            the file class the cursor is reading
+*/
+void table_file_summary_by_event_name::make_row(PFS_file_class *file_class)
+{
+  m_row.m_event_name.make_row(file_class);
+
+  PFS_instance_file_io_stat_visitor visitor;
+  PFS_instance_iterator::visit_file_instances(file_class, &visitor);
+
+  time_normalizer *normalizer= time_normalizer::get(wait_timer);
+  
+  /* Collect timer and byte count stats */
+  m_row.m_io_stat.set(normalizer, &visitor.m_file_io_stat);
+  m_row_exists= true;
+
+}
+
+int table_file_summary_by_event_name::read_row_values(TABLE *table,
+                                                      unsigned char *,
+                                                      Field **fields,
+                                                      bool read_all)
+{
+  Field *f;
+
+  if (unlikely(!m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 0);
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case  0: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      case  1: /* COUNT_STAR */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_count);
+        break;
+      case  2: /* SUM_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_sum);
+        break;
+      case  3: /* MIN_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_min);
+        break;
+      case  4: /* AVG_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_avg);
+        break;
+      case  5: /* MAX_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_max);
+        break;
+
+      case  6: /* COUNT_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_count);
+        break;
+      case  7: /* SUM_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_sum);
+        break;
+      case  8: /* MIN_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_min);
+        break;
+      case  9: /* AVG_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_avg);
+        break;
+      case 10: /* MAX_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_max);
+        break;
+      case 11: /* SUM_NUMBER_OF_BYTES_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_bytes);
+        break;
+
+      case 12: /* COUNT_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_count);
+        break;
+      case 13: /* SUM_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_sum);
+        break;
+      case 14: /* MIN_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_min);
+        break;
+      case 15: /* AVG_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_avg);
+        break;
+      case 16: /* MAX_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_max);
+        break;
+      case 17: /* SUM_NUMBER_OF_BYTES_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_bytes);
+        break;
+
+      case 18: /* COUNT_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_count);
+        break;
+      case 19: /* SUM_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_sum);
+        break;
+      case 20: /* MIN_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_min);
+        break;
+      case 21: /* AVG_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_avg);
+        break;
+      case 22: /* MAX_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_max);
+        break;
+
+      default:
+        DBUG_ASSERT(false);
+        break;
+      }
+    } // if
+  } // for
+
+  return 0;
+}
diff --git a/storage/perfschema/table_file_summary_by_event_name.h b/storage/perfschema/table_file_summary_by_event_name.h
new file mode 100644
index 00000000000..8a51dffad65
--- /dev/null
+++ b/storage/perfschema/table_file_summary_by_event_name.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_FILE_SUMMARY_H
+#define TABLE_FILE_SUMMARY_H
+
+/**
+  @file storage/perfschema/table_file_summary_by_event_name.h
+  Table FILE_SUMMARY_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of PERFORMANCE_SCHEMA.FILE_SUMMARY_BY_EVENT_NAME. */
+struct row_file_summary_by_event_name
+{
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER and NUMBER_OF_BYTES
+      for READ, WRITE and MISC operation types.
+  */
+  PFS_file_io_stat_row m_io_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.FILE_SUMMARY_BY_EVENT_NAME. */
+class table_file_summary_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+private:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_file_summary_by_event_name();
+
+public:
+  ~table_file_summary_by_event_name()
+  {}
+
+private:
+  void make_row(PFS_file_class *klass);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_file_summary_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_file_summary_by_instance.cc b/storage/perfschema/table_file_summary_by_instance.cc
new file mode 100644
index 00000000000..40478647f5b
--- /dev/null
+++ b/storage/perfschema/table_file_summary_by_instance.cc
@@ -0,0 +1,381 @@
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_file_summary.cc
+  Table FILE_SUMMARY_BY_INSTANCE (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_file_summary_by_instance.h"
+#include "pfs_global.h"
+
+THR_LOCK table_file_summary_by_instance::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("FILE_NAME") },
+    { C_STRING_WITH_LEN("varchar(512)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_INSTANCE_BEGIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Read */
+  {
+    { C_STRING_WITH_LEN("COUNT_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NUMBER_OF_BYTES_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Write */
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NUMBER_OF_BYTES_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Misc */
+  {
+    { C_STRING_WITH_LEN("COUNT_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_file_summary_by_instance::m_field_def=
+{ 25, field_types };
+
+PFS_engine_table_share
+table_file_summary_by_instance::m_share=
+{
+  { C_STRING_WITH_LEN("file_summary_by_instance") },
+  &pfs_truncatable_acl,
+  &table_file_summary_by_instance::create,
+  NULL, /* write_row */
+  table_file_summary_by_instance::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table* table_file_summary_by_instance::create(void)
+{
+  return new table_file_summary_by_instance();
+}
+
+int table_file_summary_by_instance::delete_all_rows(void)
+{
+  reset_file_instance_io();
+  return 0;
+}
+
+table_file_summary_by_instance::table_file_summary_by_instance()
+  : PFS_engine_table(&m_share, &m_pos),
+  m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+
+void table_file_summary_by_instance::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_file_summary_by_instance::rnd_next(void)
+{
+  PFS_file *pfs;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < file_max;
+       m_pos.next())
+  {
+    pfs= &file_array[m_pos.m_index];
+    if (pfs->m_lock.is_populated())
+    {
+      make_row(pfs);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_file_summary_by_instance::rnd_pos(const void *pos)
+{
+  PFS_file *pfs;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index < file_max);
+  pfs= &file_array[m_pos.m_index];
+
+  if (! pfs->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(pfs);
+  return 0;
+}
+
+/**
+  Build a row.
+  @param pfs              the file the cursor is reading
+*/
+void table_file_summary_by_instance::make_row(PFS_file *pfs)
+{
+  pfs_lock lock;
+  PFS_file_class *safe_class;
+
+  m_row_exists= false;
+
+  /* Protect this reader against a file delete */
+  pfs->m_lock.begin_optimistic_lock(&lock);
+
+  safe_class= sanitize_file_class(pfs->m_class);
+  if (unlikely(safe_class == NULL))
+    return;
+
+  m_row.m_filename= pfs->m_filename;
+  m_row.m_filename_length= pfs->m_filename_length;
+  m_row.m_event_name.make_row(safe_class);
+  m_row.m_identity= pfs->m_identity;
+
+  time_normalizer *normalizer= time_normalizer::get(wait_timer);
+
+  /* Collect timer and byte count stats */
+  m_row.m_io_stat.set(normalizer, &pfs->m_file_stat.m_io_stat);
+
+  if (pfs->m_lock.end_optimistic_lock(&lock))
+    m_row_exists= true;
+}
+
+int table_file_summary_by_instance::read_row_values(TABLE *table,
+                                                          unsigned char *,
+                                                          Field **fields,
+                                                          bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 0);
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case  0: /* FILE_NAME */
+        set_field_varchar_utf8(f, m_row.m_filename, m_row.m_filename_length);
+        break;
+      case  1: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      case  2: /* OBJECT_INSTANCE */
+        set_field_ulonglong(f, (ulonglong)m_row.m_identity);
+        break;
+
+      case  3:/* COUNT_STAR */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_count);
+        break;
+      case  4:/* SUM_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_sum);
+        break;
+      case  5: /* MIN_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_min);
+        break;
+      case  6: /* AVG_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_avg);
+        break;
+      case  7: /* MAX_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_max);
+        break;
+
+      case  8: /* COUNT_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_count);
+        break;
+      case  9: /* SUM_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_sum);
+        break;
+      case 10: /* MIN_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_min);
+        break;
+      case 11: /* AVG_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_avg);
+        break;
+      case 12: /* MAX_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_max);
+        break;
+      case 13: /* SUM_NUMBER_OF_BYTES_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_bytes);
+        break;
+
+      case 14: /* COUNT_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_count);
+        break;
+      case 15: /* SUM_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_sum);
+        break;
+      case 16: /* MIN_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_min);
+        break;
+      case 17: /* AVG_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_avg);
+        break;
+      case 18: /* MAX_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_max);
+        break;
+      case 19: /* SUM_NUMBER_OF_BYTES_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_bytes);
+        break;
+
+      case 20: /* COUNT_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_count);
+        break;
+      case 21: /* SUM_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_sum);
+        break;
+      case 22: /* MIN_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_min);
+        break;
+      case 23: /* AVG_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_avg);
+        break;
+      case 24: /* MAX_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_max);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_file_summary_by_instance.h b/storage/perfschema/table_file_summary_by_instance.h
new file mode 100644
index 00000000000..d9f406966db
--- /dev/null
+++ b/storage/perfschema/table_file_summary_by_instance.h
@@ -0,0 +1,99 @@
+/* Copyright (c) 2008, 2011 Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_FILE_SUMMARY_BY_INSTANCE_H
+#define TABLE_FILE_SUMMARY_BY_INSTANCE_H
+
+/**
+  @file storage/perfschema/table_file_summary_by_instance.h
+  Table FILE_SUMMARY_BY_INSTANCE (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of PERFORMANCE_SCHEMA.FILE_SUMMARY_BY_INSTANCE. */
+struct row_file_summary_by_instance
+{
+  /** Column FILE_NAME. */
+  const char *m_filename;
+  /** Length in bytes of @c m_filename. */
+  uint m_filename_length;
+
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+
+  /** Column OBJECT_INSTANCE_BEGIN */
+  const void *m_identity;
+  /**
+    Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER and NUMBER_OF_BYTES for READ,
+    WRITE and MISC operation types.
+  */
+  PFS_file_io_stat_row m_io_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.FILE_SUMMARY_BY_INSTANCE. */
+class table_file_summary_by_instance : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+private:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_file_summary_by_instance();
+
+public:
+  ~table_file_summary_by_instance()
+  {}
+
+private:
+  void make_row(PFS_file *pfs);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_file_summary_by_instance m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_helper.cc b/storage/perfschema/table_helper.cc
new file mode 100644
index 00000000000..d3954179539
--- /dev/null
+++ b/storage/perfschema/table_helper.cc
@@ -0,0 +1,340 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_helper.cc
+  Performance schema table helpers (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_engine_table.h"
+#include "table_helper.h"
+#include "pfs_host.h"
+#include "pfs_user.h"
+#include "pfs_account.h"
+
+int PFS_host_row::make_row(PFS_host *pfs)
+{
+  m_hostname_length= pfs->m_hostname_length;
+  if (m_hostname_length > sizeof(m_hostname))
+    return 1;
+  if (m_hostname_length > 0)
+    memcpy(m_hostname, pfs->m_hostname, sizeof(m_hostname));
+  return 0;
+}
+
+void PFS_host_row::set_field(Field *f)
+{
+  if (m_hostname_length > 0)
+    PFS_engine_table::set_field_char_utf8(f, m_hostname, m_hostname_length);
+  else
+    f->set_null();
+}
+
+int PFS_user_row::make_row(PFS_user *pfs)
+{
+  m_username_length= pfs->m_username_length;
+  if (m_username_length > sizeof(m_username))
+    return 1;
+  if (m_username_length > 0)
+    memcpy(m_username, pfs->m_username, sizeof(m_username));
+  return 0;
+}
+
+void PFS_user_row::set_field(Field *f)
+{
+  if (m_username_length > 0)
+    PFS_engine_table::set_field_char_utf8(f, m_username, m_username_length);
+  else
+    f->set_null();
+}
+
+int PFS_account_row::make_row(PFS_account *pfs)
+{
+  m_username_length= pfs->m_username_length;
+  if (m_username_length > sizeof(m_username))
+    return 1;
+  if (m_username_length > 0)
+    memcpy(m_username, pfs->m_username, sizeof(m_username));
+
+  m_hostname_length= pfs->m_hostname_length;
+  if (m_hostname_length > sizeof(m_hostname))
+    return 1;
+  if (m_hostname_length > 0)
+    memcpy(m_hostname, pfs->m_hostname, sizeof(m_hostname));
+
+  return 0;
+}
+
+void PFS_account_row::set_field(uint index, Field *f)
+{
+  switch (index)
+  {
+    case 0: /* USER */
+      if (m_username_length > 0)
+        PFS_engine_table::set_field_char_utf8(f, m_username, m_username_length);
+      else
+        f->set_null();
+      break;
+    case 1: /* HOST */
+      if (m_hostname_length > 0)
+        PFS_engine_table::set_field_char_utf8(f, m_hostname, m_hostname_length);
+      else
+        f->set_null();
+      break;
+    default:
+      DBUG_ASSERT(false);
+      break;
+  }
+}
+
+int PFS_digest_row::make_row(PFS_statements_digest_stat* pfs)
+{
+  /*
+    "0" value for byte_count indicates special entry i.e. aggregated
+    stats at index 0 of statements_digest_stat_array. So do not calculate
+    digest/digest_text as it should always be "NULL".
+  */
+  if (pfs->m_digest_storage.m_byte_count != 0)
+  {
+    /*
+      Calculate digest from MD5 HASH collected to be shown as
+      DIGEST in this row.
+    */
+    MD5_HASH_TO_STRING(pfs->m_digest_hash.m_md5, m_digest);
+    m_digest_length= MD5_HASH_TO_STRING_LENGTH;
+
+    /* 
+      Caclulate digest_text information from the token array collected
+      to be shown as DIGEST_TEXT column.
+    */ 
+    get_digest_text(m_digest_text, &pfs->m_digest_storage);
+    m_digest_text_length= strlen(m_digest_text);
+  }
+  else
+  {
+    m_digest_length= 0;
+    m_digest_text_length= 0;
+  }
+  
+  return 0;
+}
+
+void PFS_digest_row::set_field(uint index, Field *f)
+{
+  switch (index)
+  {
+    case 0: /* DIGEST */
+      if (m_digest_length > 0)
+        PFS_engine_table::set_field_varchar_utf8(f, m_digest,
+                                                 m_digest_length);
+      else
+        f->set_null();
+      break;
+    case 1: /* DIGEST_TEXT */
+      if (m_digest_text_length > 0)
+        PFS_engine_table::set_field_longtext_utf8(f, m_digest_text,
+                                                  m_digest_text_length);
+      else
+        f->set_null();
+      break;
+    default:
+      DBUG_ASSERT(false);
+      break;
+  }
+}
+
+int PFS_object_row::make_row(PFS_table_share *pfs)
+{
+  m_object_type= pfs->get_object_type();
+
+  m_schema_name_length= pfs->m_schema_name_length;
+  if (m_schema_name_length > sizeof(m_schema_name))
+    return 1;
+  if (m_schema_name_length > 0)
+    memcpy(m_schema_name, pfs->m_schema_name, sizeof(m_schema_name));
+
+  m_object_name_length= pfs->m_table_name_length;
+  if (m_object_name_length > sizeof(m_object_name))
+    return 1;
+  if (m_object_name_length > 0)
+    memcpy(m_object_name, pfs->m_table_name, sizeof(m_object_name));
+
+  return 0;
+}
+
+void PFS_object_row::set_field(uint index, Field *f)
+{
+  switch(index)
+  {
+    case 0: /* OBJECT_TYPE */
+      set_field_object_type(f, m_object_type);
+      break;
+    case 1: /* SCHEMA_NAME */
+      PFS_engine_table::set_field_varchar_utf8(f, m_schema_name, m_schema_name_length);
+      break;
+    case 2: /* OBJECT_NAME */
+      PFS_engine_table::set_field_varchar_utf8(f, m_object_name, m_object_name_length);
+      break;
+    default:
+      DBUG_ASSERT(false);
+  }
+}
+
+int PFS_index_row::make_row(PFS_table_share *pfs, uint table_index)
+{
+  if (m_object_row.make_row(pfs))
+    return 1;
+
+  if (table_index < MAX_KEY)
+  {
+    PFS_table_key *key= &pfs->m_keys[table_index];
+    m_index_name_length= key->m_name_length;
+    if (m_index_name_length > sizeof(m_index_name))
+      return 1;
+    memcpy(m_index_name, key->m_name, sizeof(m_index_name));
+  }
+  else
+    m_index_name_length= 0;
+
+  return 0;
+}
+
+void PFS_index_row::set_field(uint index, Field *f)
+{
+  switch(index)
+  {
+    case 0: /* OBJECT_TYPE */
+    case 1: /* SCHEMA_NAME */
+    case 2: /* OBJECT_NAME */
+      m_object_row.set_field(index, f);
+      break;
+    case 3: /* INDEX_NAME */
+      if (m_index_name_length > 0)
+        PFS_engine_table::set_field_varchar_utf8(f, m_index_name, m_index_name_length);
+      else
+        f->set_null();
+      break;
+    default:
+      DBUG_ASSERT(false);
+  }
+}
+
+void PFS_statement_stat_row::set_field(uint index, Field *f)
+{
+  switch (index)
+  {
+    case 0: /* COUNT_STAR */
+    case 1: /* SUM_TIMER_WAIT */
+    case 2: /* MIN_TIMER_WAIT */
+    case 3: /* AVG_TIMER_WAIT */
+    case 4: /* MAX_TIMER_WAIT */
+      m_timer1_row.set_field(index, f);
+      break;
+    case 5: /* SUM_LOCK_TIME */
+      PFS_engine_table::set_field_ulonglong(f, m_lock_time);
+      break;
+    case 6: /* SUM_ERRORS */
+      PFS_engine_table::set_field_ulonglong(f, m_error_count);
+      break;
+    case 7: /* SUM_WARNINGS */
+      PFS_engine_table::set_field_ulonglong(f, m_warning_count);
+      break;
+    case 8: /* SUM_ROWS_AFFECTED */
+      PFS_engine_table::set_field_ulonglong(f, m_rows_affected);
+      break;
+    case 9: /* SUM_ROWS_SENT */
+      PFS_engine_table::set_field_ulonglong(f, m_rows_sent);
+      break;
+    case 10: /* SUM_ROWS_EXAMINED */
+      PFS_engine_table::set_field_ulonglong(f, m_rows_examined);
+      break;
+    case 11: /* SUM_CREATED_TMP_DISK_TABLES */
+      PFS_engine_table::set_field_ulonglong(f, m_created_tmp_disk_tables);
+      break;
+    case 12: /* SUM_CREATED_TMP_TABLES */
+      PFS_engine_table::set_field_ulonglong(f, m_created_tmp_tables);
+      break;
+    case 13: /* SUM_SELECT_FULL_JOIN */
+      PFS_engine_table::set_field_ulonglong(f, m_select_full_join);
+      break;
+    case 14: /* SUM_SELECT_FULL_RANGE_JOIN */
+      PFS_engine_table::set_field_ulonglong(f, m_select_full_range_join);
+      break;
+    case 15: /* SUM_SELECT_RANGE */
+      PFS_engine_table::set_field_ulonglong(f, m_select_range);
+      break;
+    case 16: /* SUM_SELECT_RANGE_CHECK */
+      PFS_engine_table::set_field_ulonglong(f, m_select_range_check);
+      break;
+    case 17: /* SUM_SELECT_SCAN */
+      PFS_engine_table::set_field_ulonglong(f, m_select_scan);
+      break;
+    case 18: /* SUM_SORT_MERGE_PASSES */
+      PFS_engine_table::set_field_ulonglong(f, m_sort_merge_passes);
+      break;
+    case 19: /* SUM_SORT_RANGE */
+      PFS_engine_table::set_field_ulonglong(f, m_sort_range);
+      break;
+    case 20: /* SUM_SORT_ROWS */
+      PFS_engine_table::set_field_ulonglong(f, m_sort_rows);
+      break;
+    case 21: /* SUM_SORT_SCAN */
+      PFS_engine_table::set_field_ulonglong(f, m_sort_scan);
+      break;
+    case 22: /* SUM_NO_INDEX_USED */
+      PFS_engine_table::set_field_ulonglong(f, m_no_index_used);
+      break;
+    case 23: /* SUM_NO_GOOD_INDEX_USED */
+      PFS_engine_table::set_field_ulonglong(f, m_no_good_index_used);
+      break;
+    default:
+      DBUG_ASSERT(false);
+      break;
+  }
+}
+
+void PFS_connection_stat_row::set_field(uint index, Field *f)
+{
+  switch (index)
+  {
+    case 0: /* CURRENT_CONNECTIONS */
+      PFS_engine_table::set_field_ulonglong(f, m_current_connections);
+      break;
+    case 1: /* TOTAL_CONNECTIONS */
+      PFS_engine_table::set_field_ulonglong(f, m_total_connections);
+      break;
+    default:
+      DBUG_ASSERT(false);
+      break;
+  }
+}
+
+void set_field_object_type(Field *f, enum_object_type object_type)
+{
+  switch (object_type)
+  {
+  case OBJECT_TYPE_TABLE:
+    PFS_engine_table::set_field_varchar_utf8(f, "TABLE", 5);
+    break;
+  case OBJECT_TYPE_TEMPORARY_TABLE:
+    PFS_engine_table::set_field_varchar_utf8(f, "TEMPORARY TABLE", 15);
+    break;
+  default:
+    DBUG_ASSERT(false);
+  }
+}
+
diff --git a/storage/perfschema/table_helper.h b/storage/perfschema/table_helper.h
new file mode 100644
index 00000000000..798ff16f4e5
--- /dev/null
+++ b/storage/perfschema/table_helper.h
@@ -0,0 +1,518 @@
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef PFS_TABLE_HELPER_H
+#define PFS_TABLE_HELPER_H
+
+#include "pfs_column_types.h"
+#include "pfs_stat.h"
+#include "pfs_timer.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_digest.h"
+
+/*
+  Write MD5 hash value in a string to be used 
+  as DIGEST for the statement.
+*/
+#define MD5_HASH_TO_STRING(_hash, _str)                    \
+  sprintf(_str, "%02x%02x%02x%02x%02x%02x%02x%02x"         \
+                "%02x%02x%02x%02x%02x%02x%02x%02x",        \
+          _hash[0], _hash[1], _hash[2], _hash[3],          \
+          _hash[4], _hash[5], _hash[6], _hash[7],          \
+          _hash[8], _hash[9], _hash[10], _hash[11],        \
+          _hash[12], _hash[13], _hash[14], _hash[15])
+
+#define MD5_HASH_TO_STRING_LENGTH 32
+
+struct PFS_host;
+struct PFS_user;
+struct PFS_account;
+
+/**
+  @file storage/perfschema/table_helper.h
+  Performance schema table helpers (declarations).
+*/
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** Namespace, internal views used within table setup_instruments. */
+struct PFS_instrument_view_constants
+{
+  static const uint FIRST_VIEW= 1;
+  static const uint VIEW_MUTEX= 1;
+  static const uint VIEW_RWLOCK= 2;
+  static const uint VIEW_COND= 3;
+  static const uint VIEW_FILE= 4;
+  static const uint VIEW_TABLE= 5;
+  static const uint VIEW_SOCKET= 6;
+  static const uint VIEW_IDLE= 7;
+  static const uint LAST_VIEW= 7;
+};
+
+/** Namespace, internal views used within object summaries. */
+struct PFS_object_view_constants
+{
+  static const uint FIRST_VIEW= 1;
+  static const uint VIEW_TABLE= 1;
+  static const uint LAST_VIEW= 1;
+
+  /* Future use */
+  static const uint VIEW_EVENT= 2;
+  static const uint VIEW_PROCEDURE= 3;
+  static const uint VIEW_FUNCTION= 4;
+};
+
+/** Row fragment for column HOST. */
+struct PFS_host_row
+{
+  /** Column HOST. */
+  char m_hostname[HOSTNAME_LENGTH];
+  /** Length in bytes of @c m_hostname. */
+  uint m_hostname_length;
+
+  /** Build a row from a memory buffer. */
+  int make_row(PFS_host *pfs);
+  /** Set a table field from the row. */
+  void set_field(Field *f);
+};
+
+/** Row fragment for column USER. */
+struct PFS_user_row
+{
+  /** Column USER. */
+  char m_username[USERNAME_LENGTH];
+  /** Length in bytes of @c m_username. */
+  uint m_username_length;
+
+  /** Build a row from a memory buffer. */
+  int make_row(PFS_user *pfs);
+  /** Set a table field from the row. */
+  void set_field(Field *f);
+};
+
+/** Row fragment for columns USER, HOST. */
+struct PFS_account_row
+{
+  /** Column USER. */
+  char m_username[USERNAME_LENGTH];
+  /** Length in bytes of @c m_username. */
+  uint m_username_length;
+  /** Column HOST. */
+  char m_hostname[HOSTNAME_LENGTH];
+  /** Length in bytes of @c m_hostname. */
+  uint m_hostname_length;
+
+  /** Build a row from a memory buffer. */
+  int make_row(PFS_account *pfs);
+  /** Set a table field from the row. */
+  void set_field(uint index, Field *f);
+};
+
+/** Row fragment for columns DIGEST, DIGEST_TEXT. */
+struct PFS_digest_row
+{
+  /** Column DIGEST. */
+  char m_digest[COL_DIGEST_SIZE];
+  /** Length in bytes of @c m_digest. */
+  uint m_digest_length;
+  /** Column DIGEST_TEXT. */
+  char m_digest_text[COL_DIGEST_TEXT_SIZE];
+  /** Length in bytes of @c m_digest_text. */
+  uint m_digest_text_length;
+
+  /** Build a row from a memory buffer. */
+  int make_row(PFS_statements_digest_stat*);
+  /** Set a table field from the row. */
+  void set_field(uint index, Field *f);
+};
+
+/** Row fragment for column EVENT_NAME. */
+struct PFS_event_name_row
+{
+  /** Column EVENT_NAME. */
+  const char *m_name;
+  /** Length in bytes of @c m_name. */
+  uint m_name_length;
+
+  /** Build a row from a memory buffer. */
+  inline void make_row(PFS_instr_class *pfs)
+  {
+    m_name= pfs->m_name;
+    m_name_length= pfs->m_name_length;
+  }
+
+  /** Set a table field from the row. */
+  inline void set_field(Field *f)
+  {
+    PFS_engine_table::set_field_varchar_utf8(f, m_name, m_name_length);
+  }
+};
+
+/** Row fragment for columns OBJECT_TYPE, SCHEMA_NAME, OBJECT_NAME. */
+struct PFS_object_row
+{
+  /** Column OBJECT_TYPE. */
+  enum_object_type m_object_type;
+  /** Column SCHEMA_NAME. */
+  char m_schema_name[NAME_LEN];
+  /** Length in bytes of @c m_schema_name. */
+  uint m_schema_name_length;
+  /** Column OBJECT_NAME. */
+  char m_object_name[NAME_LEN];
+  /** Length in bytes of @c m_object_name. */
+  uint m_object_name_length;
+
+  /** Build a row from a memory buffer. */
+  int make_row(PFS_table_share *pfs);
+  /** Set a table field from the row. */
+  void set_field(uint index, Field *f);
+};
+
+/** Row fragment for columns OBJECT_TYPE, SCHEMA_NAME, OBJECT_NAME, INDEX_NAME. */
+struct PFS_index_row
+{
+  PFS_object_row m_object_row;
+  /** Column INDEX_NAME. */
+  char m_index_name[NAME_LEN];
+  /** Length in bytes of @c m_index_name. */
+  uint m_index_name_length;
+
+  /** Build a row from a memory buffer. */
+  int make_row(PFS_table_share *pfs, uint table_index);
+  /** Set a table field from the row. */
+  void set_field(uint index, Field *f);
+};
+
+/** Row fragment for single statistics columns (COUNT, SUM, MIN, AVG, MAX) */
+struct PFS_stat_row
+{
+  /** Column COUNT_STAR. */
+  ulonglong m_count;
+  /** Column SUM_TIMER_WAIT. */
+  ulonglong m_sum;
+  /** Column MIN_TIMER_WAIT. */
+  ulonglong m_min;
+  /** Column AVG_TIMER_WAIT. */
+  ulonglong m_avg;
+  /** Column MAX_TIMER_WAIT. */
+  ulonglong m_max;
+
+  /** Build a row with timer fields from a memory buffer. */
+  inline void set(time_normalizer *normalizer, const PFS_single_stat *stat)
+  {
+    m_count= stat->m_count;
+
+    if (m_count)
+    {
+      m_sum= normalizer->wait_to_pico(stat->m_sum);
+      m_min= normalizer->wait_to_pico(stat->m_min);
+      m_max= normalizer->wait_to_pico(stat->m_max);
+      m_avg= normalizer->wait_to_pico(stat->m_sum / m_count);
+    }
+    else
+    {
+      m_sum= 0;
+      m_min= 0;
+      m_avg= 0;
+      m_max= 0;
+    }
+  }
+
+  /** Set a table field from the row. */
+  void set_field(uint index, Field *f)
+  {
+    switch (index)
+    {
+      case 0: /* COUNT */
+        PFS_engine_table::set_field_ulonglong(f, m_count);
+        break;
+      case 1: /* SUM */
+        PFS_engine_table::set_field_ulonglong(f, m_sum);
+        break;
+      case 2: /* MIN */
+        PFS_engine_table::set_field_ulonglong(f, m_min);
+        break;
+      case 3: /* AVG */
+        PFS_engine_table::set_field_ulonglong(f, m_avg);
+        break;
+      case 4: /* MAX */
+        PFS_engine_table::set_field_ulonglong(f, m_max);
+        break;
+      default:
+        DBUG_ASSERT(false);
+    }
+  }
+};
+
+/** Row fragment for timer and byte count stats. Corresponds to PFS_byte_stat */
+struct PFS_byte_stat_row
+{
+  PFS_stat_row m_waits;
+  ulonglong    m_bytes;
+
+  /** Build a row with timer and byte count fields from a memory buffer. */
+  inline void set(time_normalizer *normalizer, const PFS_byte_stat *stat)
+  {
+    m_waits.set(normalizer, stat);
+    m_bytes= stat->m_bytes;
+  }
+};
+
+/** Row fragment for table io statistics columns. */
+struct PFS_table_io_stat_row
+{
+  PFS_stat_row m_all;
+  PFS_stat_row m_all_read;
+  PFS_stat_row m_all_write;
+  PFS_stat_row m_fetch;
+  PFS_stat_row m_insert;
+  PFS_stat_row m_update;
+  PFS_stat_row m_delete;
+
+  /** Build a row from a memory buffer. */
+  inline void set(time_normalizer *normalizer, const PFS_table_io_stat *stat)
+  {
+    PFS_single_stat all_read;
+    PFS_single_stat all_write;
+    PFS_single_stat all;
+
+    m_fetch.set(normalizer, & stat->m_fetch);
+
+    all_read.aggregate(& stat->m_fetch);
+
+    m_insert.set(normalizer, & stat->m_insert);
+    m_update.set(normalizer, & stat->m_update);
+    m_delete.set(normalizer, & stat->m_delete);
+
+    all_write.aggregate(& stat->m_insert);
+    all_write.aggregate(& stat->m_update);
+    all_write.aggregate(& stat->m_delete);
+
+    all.aggregate(& all_read);
+    all.aggregate(& all_write);
+
+    m_all_read.set(normalizer, & all_read);
+    m_all_write.set(normalizer, & all_write);
+    m_all.set(normalizer, & all);
+  }
+};
+
+/** Row fragment for table lock statistics columns. */
+struct PFS_table_lock_stat_row
+{
+  PFS_stat_row m_all;
+  PFS_stat_row m_all_read;
+  PFS_stat_row m_all_write;
+  PFS_stat_row m_read_normal;
+  PFS_stat_row m_read_with_shared_locks;
+  PFS_stat_row m_read_high_priority;
+  PFS_stat_row m_read_no_insert;
+  PFS_stat_row m_read_external;
+  PFS_stat_row m_write_allow_write;
+  PFS_stat_row m_write_concurrent_insert;
+  PFS_stat_row m_write_delayed;
+  PFS_stat_row m_write_low_priority;
+  PFS_stat_row m_write_normal;
+  PFS_stat_row m_write_external;
+
+  /** Build a row from a memory buffer. */
+  inline void set(time_normalizer *normalizer, const PFS_table_lock_stat *stat)
+  {
+    PFS_single_stat all_read;
+    PFS_single_stat all_write;
+    PFS_single_stat all;
+
+    m_read_normal.set(normalizer, & stat->m_stat[PFS_TL_READ]);
+    m_read_with_shared_locks.set(normalizer, & stat->m_stat[PFS_TL_READ_WITH_SHARED_LOCKS]);
+    m_read_high_priority.set(normalizer, & stat->m_stat[PFS_TL_READ_HIGH_PRIORITY]);
+    m_read_no_insert.set(normalizer, & stat->m_stat[PFS_TL_READ_NO_INSERT]);
+    m_read_external.set(normalizer, & stat->m_stat[PFS_TL_READ_EXTERNAL]);
+
+    all_read.aggregate(& stat->m_stat[PFS_TL_READ]);
+    all_read.aggregate(& stat->m_stat[PFS_TL_READ_WITH_SHARED_LOCKS]);
+    all_read.aggregate(& stat->m_stat[PFS_TL_READ_HIGH_PRIORITY]);
+    all_read.aggregate(& stat->m_stat[PFS_TL_READ_NO_INSERT]);
+    all_read.aggregate(& stat->m_stat[PFS_TL_READ_EXTERNAL]);
+
+    m_write_allow_write.set(normalizer, & stat->m_stat[PFS_TL_WRITE_ALLOW_WRITE]);
+    m_write_concurrent_insert.set(normalizer, & stat->m_stat[PFS_TL_WRITE_CONCURRENT_INSERT]);
+    m_write_delayed.set(normalizer, & stat->m_stat[PFS_TL_WRITE_DELAYED]);
+    m_write_low_priority.set(normalizer, & stat->m_stat[PFS_TL_WRITE_LOW_PRIORITY]);
+    m_write_normal.set(normalizer, & stat->m_stat[PFS_TL_WRITE]);
+    m_write_external.set(normalizer, & stat->m_stat[PFS_TL_WRITE_EXTERNAL]);
+
+    all_write.aggregate(& stat->m_stat[PFS_TL_WRITE_ALLOW_WRITE]);
+    all_write.aggregate(& stat->m_stat[PFS_TL_WRITE_CONCURRENT_INSERT]);
+    all_write.aggregate(& stat->m_stat[PFS_TL_WRITE_DELAYED]);
+    all_write.aggregate(& stat->m_stat[PFS_TL_WRITE_LOW_PRIORITY]);
+    all_write.aggregate(& stat->m_stat[PFS_TL_WRITE]);
+    all_write.aggregate(& stat->m_stat[PFS_TL_WRITE_EXTERNAL]);
+
+    all.aggregate(& all_read);
+    all.aggregate(& all_write);
+
+    m_all_read.set(normalizer, & all_read);
+    m_all_write.set(normalizer, & all_write);
+    m_all.set(normalizer, & all);
+  }
+};
+
+/** Row fragment for stage statistics columns. */
+struct PFS_stage_stat_row
+{
+  PFS_stat_row m_timer1_row;
+
+  /** Build a row from a memory buffer. */
+  inline void set(time_normalizer *normalizer, const PFS_stage_stat *stat)
+  {
+    m_timer1_row.set(normalizer, & stat->m_timer1_stat);
+  }
+
+  /** Set a table field from the row. */
+  void set_field(uint index, Field *f)
+  {
+     m_timer1_row.set_field(index, f);
+  }
+};
+
+/** Row fragment for statement statistics columns. */
+struct PFS_statement_stat_row
+{
+  PFS_stat_row m_timer1_row;
+  ulonglong m_error_count;
+  ulonglong m_warning_count;
+  ulonglong m_rows_affected;
+  ulonglong m_lock_time;
+  ulonglong m_rows_sent;
+  ulonglong m_rows_examined;
+  ulonglong m_created_tmp_disk_tables;
+  ulonglong m_created_tmp_tables;
+  ulonglong m_select_full_join;
+  ulonglong m_select_full_range_join;
+  ulonglong m_select_range;
+  ulonglong m_select_range_check;
+  ulonglong m_select_scan;
+  ulonglong m_sort_merge_passes;
+  ulonglong m_sort_range;
+  ulonglong m_sort_rows;
+  ulonglong m_sort_scan;
+  ulonglong m_no_index_used;
+  ulonglong m_no_good_index_used;
+
+  /** Build a row from a memory buffer. */
+  inline void set(time_normalizer *normalizer, const PFS_statement_stat *stat)
+  {
+    m_timer1_row.set(normalizer, & stat->m_timer1_stat);
+
+    m_error_count= stat->m_error_count;
+    m_warning_count= stat->m_warning_count;
+    m_lock_time= stat->m_lock_time * MICROSEC_TO_PICOSEC;
+    m_rows_affected= stat->m_rows_affected;
+    m_rows_sent= stat->m_rows_sent;
+    m_rows_examined= stat->m_rows_examined;
+    m_created_tmp_disk_tables= stat->m_created_tmp_disk_tables;
+    m_created_tmp_tables= stat->m_created_tmp_tables;
+    m_select_full_join= stat->m_select_full_join;
+    m_select_full_range_join= stat->m_select_full_range_join;
+    m_select_range= stat->m_select_range;
+    m_select_range_check= stat->m_select_range_check;
+    m_select_scan= stat->m_select_scan;
+    m_sort_merge_passes= stat->m_sort_range;
+    m_sort_range= stat->m_sort_range;
+    m_sort_rows= stat->m_sort_rows;
+    m_sort_scan= stat->m_sort_scan;
+    m_no_index_used= stat->m_no_index_used;
+    m_no_good_index_used= stat->m_no_good_index_used;
+  }
+
+  /** Set a table field from the row. */
+  void set_field(uint index, Field *f);
+};
+
+struct PFS_connection_stat_row
+{
+  ulonglong m_current_connections;
+  ulonglong m_total_connections;
+
+  inline void set(const PFS_connection_stat *stat)
+  {
+    m_current_connections= stat->m_current_connections;
+    m_total_connections= stat->m_total_connections;
+  }
+
+  /** Set a table field from the row. */
+  void set_field(uint index, Field *f);
+};
+
+void set_field_object_type(Field *f, enum_object_type object_type);
+
+/** Row fragment for socket io statistics columns. */
+struct PFS_socket_io_stat_row
+{
+  PFS_byte_stat_row m_read;
+  PFS_byte_stat_row m_write;
+  PFS_byte_stat_row m_misc;
+  PFS_byte_stat_row m_all;
+  
+  inline void set(time_normalizer *normalizer, const PFS_socket_io_stat *stat)
+  {
+    PFS_byte_stat all;
+
+    m_read.set(normalizer, &stat->m_read);
+    m_write.set(normalizer, &stat->m_write);
+    m_misc.set(normalizer, &stat->m_misc);
+    
+    /* Combine stats for all operations */
+    all.aggregate(&stat->m_read);
+    all.aggregate(&stat->m_write);
+    all.aggregate(&stat->m_misc);
+
+    m_all.set(normalizer, &all);
+  }
+};
+
+/** Row fragment for file io statistics columns. */
+struct PFS_file_io_stat_row
+{
+  PFS_byte_stat_row m_read;
+  PFS_byte_stat_row m_write;
+  PFS_byte_stat_row m_misc;
+  PFS_byte_stat_row m_all;
+  
+  inline void set(time_normalizer *normalizer, const PFS_file_io_stat *stat)
+  {
+    PFS_byte_stat all;
+
+    m_read.set(normalizer, &stat->m_read);
+    m_write.set(normalizer, &stat->m_write);
+    m_misc.set(normalizer, &stat->m_misc);
+    
+    /* Combine stats for all operations */
+    all.aggregate(&stat->m_read);
+    all.aggregate(&stat->m_write);
+    all.aggregate(&stat->m_misc);
+
+    m_all.set(normalizer, &all);
+  }
+};
+
+/** @} */
+
+#endif
+
diff --git a/storage/perfschema/table_host_cache.cc b/storage/perfschema/table_host_cache.cc
new file mode 100644
index 00000000000..d243204ddcd
--- /dev/null
+++ b/storage/perfschema/table_host_cache.cc
@@ -0,0 +1,484 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_host_cache.cc
+  Table HOST_CACHE (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "table_host_cache.h"
+#include "hostname.h"
+
+#ifdef NOT_YET_PORTED
+
+THR_LOCK table_host_cache::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("IP") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("HOST") },
+    { C_STRING_WITH_LEN("varchar(255)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("HOST_VALIDATED") },
+    { C_STRING_WITH_LEN("enum(\'YES\',\'NO\')") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_CONNECT_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_HOST_BLOCKED_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_NAMEINFO_TRANSIENT_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_NAMEINFO_PERMANENT_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_FORMAT_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_ADDRINFO_TRANSIENT_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_ADDRINFO_PERMANENT_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_FCRDNS_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_HOST_ACL_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_NO_AUTH_PLUGIN_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_AUTH_PLUGIN_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_HANDSHAKE_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_PROXY_USER_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_PROXY_USER_ACL_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_AUTHENTICATION_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_SSL_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_MAX_USER_CONNECTIONS_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_MAX_USER_CONNECTIONS_PER_HOUR_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_DEFAULT_DATABASE_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_INIT_CONNECT_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_LOCAL_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_UNKNOWN_ERRORS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("FIRST_SEEN") },
+    { C_STRING_WITH_LEN("timestamp") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("LAST_SEEN") },
+    { C_STRING_WITH_LEN("timestamp") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("FIRST_ERROR_SEEN") },
+    { C_STRING_WITH_LEN("timestamp") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("LAST_ERROR_SEEN") },
+    { C_STRING_WITH_LEN("timestamp") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_host_cache::m_field_def=
+{ 29, field_types };
+
+PFS_engine_table_share
+table_host_cache::m_share=
+{
+  { C_STRING_WITH_LEN("host_cache") },
+  &pfs_truncatable_acl,
+  &table_host_cache::create,
+  NULL, /* write_row */
+  table_host_cache::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table* table_host_cache::create(void)
+{
+  table_host_cache *t= new table_host_cache();
+  if (t != NULL)
+  {
+    THD *thd= current_thd;
+    DBUG_ASSERT(thd != NULL);
+    t->materialize(thd);
+  }
+  return t;
+}
+
+int
+table_host_cache::delete_all_rows(void)
+{
+  /*
+    TRUNCATE TABLE performance_schema.host_cache
+    is an alternate syntax for
+    FLUSH HOSTS
+  */
+  hostname_cache_refresh();
+  return 0;
+}
+
+table_host_cache::table_host_cache()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_all_rows(NULL), m_row_count(0),
+    m_row(NULL), m_pos(0), m_next_pos(0)
+{}
+
+void table_host_cache::materialize(THD *thd)
+{
+  Host_entry *current;
+  Host_entry *first;
+  uint size;
+  uint index;
+  row_host_cache *rows;
+  row_host_cache *row;
+
+  DBUG_ASSERT(m_all_rows == NULL);
+  DBUG_ASSERT(m_row_count == 0);
+
+  hostname_cache_lock();
+
+  size= hostname_cache_size();
+  if (size == 0)
+  {
+    /* Normal case, the cache is empty. */
+    goto end;
+  }
+
+  rows= (row_host_cache*) thd->alloc(size * sizeof(row_host_cache));
+  if (rows == NULL)
+  {
+    /* Out of memory, this thread will error out. */
+    goto end;
+  }
+
+  index= 0;
+  row= rows;
+
+  first= hostname_cache_first();
+  current= first;
+
+  while ((current != NULL) && (index < size))
+  {
+    make_row(current, row);
+    index++;
+    row++;
+    current= current->next();
+    /* Host cache is a circular linked list. */
+    if (current == first)
+      break;
+  }
+
+  m_all_rows= rows;
+  m_row_count= index;
+
+end:
+  hostname_cache_unlock();
+}
+
+void table_host_cache::make_row(Host_entry *entry, row_host_cache *row)
+{
+  row->m_ip_length= strlen(entry->ip_key);
+  strcpy(row->m_ip, entry->ip_key);
+  row->m_hostname_length= entry->m_hostname_length;
+  if (row->m_hostname_length > 0)
+    strncpy(row->m_hostname, entry->m_hostname, row->m_hostname_length);
+  row->m_host_validated= entry->m_host_validated;
+  row->m_sum_connect_errors= entry->m_errors.m_connect;
+  row->m_count_host_blocked_errors= entry->m_errors.m_host_blocked;
+  row->m_count_nameinfo_transient_errors= entry->m_errors.m_nameinfo_transient;
+  row->m_count_nameinfo_permanent_errors= entry->m_errors.m_nameinfo_permanent;
+  row->m_count_format_errors= entry->m_errors.m_format;
+  row->m_count_addrinfo_transient_errors= entry->m_errors.m_addrinfo_transient;
+  row->m_count_addrinfo_permanent_errors= entry->m_errors.m_addrinfo_permanent;
+  row->m_count_fcrdns_errors= entry->m_errors.m_FCrDNS;
+  row->m_count_host_acl_errors= entry->m_errors.m_host_acl;
+  row->m_count_no_auth_plugin_errors= entry->m_errors.m_no_auth_plugin;
+  row->m_count_auth_plugin_errors= entry->m_errors.m_auth_plugin;
+  row->m_count_handshake_errors= entry->m_errors.m_handshake;
+  row->m_count_proxy_user_errors= entry->m_errors.m_proxy_user;
+  row->m_count_proxy_user_acl_errors= entry->m_errors.m_proxy_user_acl;
+  row->m_count_authentication_errors= entry->m_errors.m_authentication;
+  row->m_count_ssl_errors= entry->m_errors.m_ssl;
+  row->m_count_max_user_connection_errors= entry->m_errors.m_max_user_connection;
+  row->m_count_max_user_connection_per_hour_errors= entry->m_errors.m_max_user_connection_per_hour;
+  row->m_count_default_database_errors= entry->m_errors.m_default_database;
+  row->m_count_init_connect_errors= entry->m_errors.m_init_connect;
+  row->m_count_local_errors= entry->m_errors.m_local;
+
+  /*
+    Reserved for future use, to help with backward compatibility.
+    When new errors are added in entry->m_errors.m_xxx,
+    report them in this column (GA releases),
+    until the table HOST_CACHE structure can be extended (next development version).
+  */
+  row->m_count_unknown_errors= 0;
+
+  row->m_first_seen= entry->m_first_seen;
+  row->m_last_seen= entry->m_last_seen;
+  row->m_first_error_seen= entry->m_first_error_seen;
+  row->m_last_error_seen= entry->m_last_error_seen;
+}
+
+void table_host_cache::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_host_cache::rnd_next(void)
+{
+  int result;
+
+  m_pos.set_at(&m_next_pos);
+
+  if (m_pos.m_index < m_row_count)
+  {
+    m_row= &m_all_rows[m_pos.m_index];
+    m_next_pos.set_after(&m_pos);
+    result= 0;
+  }
+  else
+  {
+    m_row= NULL;
+    result= HA_ERR_END_OF_FILE;
+  }
+
+  return result;
+}
+
+int table_host_cache::rnd_pos(const void *pos)
+{
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index < m_row_count);
+  m_row= &m_all_rows[m_pos.m_index];
+  return 0;
+}
+
+int table_host_cache::read_row_values(TABLE *table,
+                                      unsigned char *buf,
+                                      Field **fields,
+                                      bool read_all)
+{
+  Field *f;
+
+  DBUG_ASSERT(m_row);
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* IP */
+        set_field_varchar_utf8(f, m_row->m_ip, m_row->m_ip_length);
+        break;
+      case 1: /* HOST */
+        if (m_row->m_hostname_length > 0)
+          set_field_varchar_utf8(f, m_row->m_hostname, m_row->m_hostname_length);
+        else
+          f->set_null();
+        break;
+      case 2: /* HOST_VALIDATED */
+        set_field_enum(f, m_row->m_host_validated ? ENUM_YES : ENUM_NO);
+        break;
+      case 3: /* SUM_CONNECT_ERRORS */
+        set_field_ulonglong(f, m_row->m_sum_connect_errors);
+        break;
+      case 4: /* COUNT_HOST_BLOCKED_ERRORS. */
+        set_field_ulonglong(f, m_row->m_count_host_blocked_errors);
+        break;
+      case 5: /* COUNT_NAMEINFO_TRANSIENT_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_nameinfo_transient_errors);
+        break;
+      case 6: /* COUNT_NAMEINFO_PERSISTENT_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_nameinfo_permanent_errors);
+        break;
+      case 7: /* COUNT_FORMAT_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_format_errors);
+        break;
+      case 8: /* COUNT_ADDRINFO_TRANSIENT_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_addrinfo_transient_errors);
+        break;
+      case 9: /* COUNT_ADDRINFO_PERSISTENT_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_addrinfo_permanent_errors);
+        break;
+      case 10: /* COUNT_FCRDNS_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_fcrdns_errors);
+        break;
+      case 11: /* COUNT_HOST_ACL_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_host_acl_errors);
+        break;
+      case 12: /* COUNT_NO_AUTH_PLUGIN_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_no_auth_plugin_errors);
+        break;
+      case 13: /* COUNT_AUTH_PLUGIN_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_auth_plugin_errors);
+        break;
+      case 14: /* COUNT_HANDSHAKE_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_handshake_errors);
+        break;
+      case 15: /* COUNT_PROXY_USER_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_proxy_user_errors);
+        break;
+      case 16: /* COUNT_PROXY_USER_ACL_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_proxy_user_acl_errors);
+        break;
+      case 17: /* COUNT_AUTHENTICATION_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_authentication_errors);
+        break;
+      case 18: /* COUNT_SSL_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_ssl_errors);
+        break;
+      case 19: /* COUNT_MAX_USER_CONNECTION_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_max_user_connection_errors);
+        break;
+      case 20: /* COUNT_MAX_USER_CONNECTION_PER_HOUR_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_max_user_connection_per_hour_errors);
+        break;
+      case 21: /* COUNT_DEFAULT_DATABASE_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_default_database_errors);
+        break;
+      case 22: /* COUNT_INIT_CONNECT_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_init_connect_errors);
+        break;
+      case 23: /* COUNT_LOCAL_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_local_errors);
+        break;
+      case 24: /* COUNT_UNKNOWN_ERRORS */
+        set_field_ulonglong(f, m_row->m_count_unknown_errors);
+        break;
+      case 25: /* FIRST_SEEN */
+        set_field_timestamp(f, m_row->m_first_seen);
+        break;
+      case 26: /* LAST_SEEN */
+        set_field_timestamp(f, m_row->m_last_seen);
+        break;
+      case 27: /* FIRST_ERROR_SEEN */
+        if (m_row->m_first_error_seen != 0)
+          set_field_timestamp(f, m_row->m_first_error_seen);
+        else
+          f->set_null();
+        break;
+      case 28: /* LAST_ERROR_SEEN */
+        if (m_row->m_last_error_seen != 0)
+          set_field_timestamp(f, m_row->m_last_error_seen);
+        else
+          f->set_null();
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
+#endif /* NOT_YET_PORTED */
diff --git a/storage/perfschema/table_host_cache.h b/storage/perfschema/table_host_cache.h
new file mode 100644
index 00000000000..543da1274e9
--- /dev/null
+++ b/storage/perfschema/table_host_cache.h
@@ -0,0 +1,144 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_HOST_CACHE_H
+#define TABLE_HOST_CACHE_H
+
+/**
+  @file storage/perfschema/table_host_cache.h
+  Table HOST_CACHE (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+
+class Host_entry;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of PERFORMANCE_SCHEMA.HOST_CACHE. */
+struct row_host_cache
+{
+  /** Column IP. */
+  char m_ip[64];
+  uint m_ip_length;
+  /** Column HOST. */
+  char m_hostname[255];
+  uint m_hostname_length;
+  /** Column HOST_VALIDATED. */
+  bool m_host_validated;
+  /** Column SUM_CONNECT_ERRORS. */
+  ulonglong m_sum_connect_errors;
+  /** Column COUNT_HOST_BLOCKED_ERRORS. */
+  ulonglong m_count_host_blocked_errors;
+  /** Column COUNT_NAMEINFO_TRANSIENT_ERRORS. */
+  ulonglong m_count_nameinfo_transient_errors;
+  /** Column COUNT_NAMEINFO_PERMANENT_ERRORS. */
+  ulonglong m_count_nameinfo_permanent_errors;
+  /** Column COUNT_FORMAT_ERRORS. */
+  ulonglong m_count_format_errors;
+  /** Column COUNT_ADDRINFO_TRANSIENT_ERRORS. */
+  ulonglong m_count_addrinfo_transient_errors;
+  /** Column COUNT_ADDRINFO_PERMANENT_ERRORS. */
+  ulonglong m_count_addrinfo_permanent_errors;
+  /** Column COUNT_FCRDNS_ERRORS. */
+  ulonglong m_count_fcrdns_errors;
+  /** Column COUNT_HOST_ACL_ERRORS. */
+  ulonglong m_count_host_acl_errors;
+  /** Column COUNT_NO_AUTH_PLUGIN_ERRORS. */
+  ulonglong m_count_no_auth_plugin_errors;
+  /** Column COUNT_AUTH_PLUGIN_ERRORS. */
+  ulonglong m_count_auth_plugin_errors;
+  /** Column COUNT_HANDSHAKE_ERRORS. */
+  ulonglong m_count_handshake_errors;
+  /** Column COUNT_PROXY_USER_ERRORS. */
+  ulonglong m_count_proxy_user_errors;
+  /** Column COUNT_PROXY_USER_ACL_ERRORS. */
+  ulonglong m_count_proxy_user_acl_errors;
+  /** Column COUNT_AUTHENTICATION_ERRORS. */
+  ulonglong m_count_authentication_errors;
+  /** Column COUNT_SSL_ERRORS. */
+  ulonglong m_count_ssl_errors;
+  /** Column COUNT_MAX_USER_CONNECTION_ERRORS. */
+  ulonglong m_count_max_user_connection_errors;
+  /** Column COUNT_MAX_USER_CONNECTION_PER_HOUR_ERRORS. */
+  ulonglong m_count_max_user_connection_per_hour_errors;
+  /** Column COUNT_DEFAULT_DATABASE_ERRORS. */
+  ulonglong m_count_default_database_errors;
+  /** Column COUNT_INIT_CONNECT_ERRORS. */
+  ulonglong m_count_init_connect_errors;
+  /** Column COUNT_LOCAL_ERRORS. */
+  ulonglong m_count_local_errors;
+  /** Column COUNT_UNKNOWN_ERRORS. */
+  ulonglong m_count_unknown_errors;
+  /** Column FIRST_SEEN. */
+  ulonglong m_first_seen;
+  /** Column LAST_SEEN. */
+  ulonglong m_last_seen;
+  /** Column FIRST_ERROR_SEEN. */
+  ulonglong m_first_error_seen;
+  /** Column LAST_ERROR_SEEN. */
+  ulonglong m_last_error_seen;
+};
+
+/** Table PERFORMANCE_SCHEMA.HOST_CACHE. */
+class table_host_cache : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_host_cache();
+
+public:
+  ~table_host_cache()
+  {}
+
+private:
+  void materialize(THD *thd);
+  static void make_row(Host_entry *entry, row_host_cache *row);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  row_host_cache *m_all_rows;
+  uint m_row_count;
+  /** Current row. */
+  row_host_cache *m_row;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_hosts.cc b/storage/perfschema/table_hosts.cc
new file mode 100644
index 00000000000..8e919ec0724
--- /dev/null
+++ b/storage/perfschema/table_hosts.cc
@@ -0,0 +1,147 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "table_hosts.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "pfs_host.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_hosts::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("HOST") },
+    { C_STRING_WITH_LEN("char(60)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("CURRENT_CONNECTIONS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("TOTAL_CONNECTIONS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_hosts::m_field_def=
+{ 3, field_types };
+
+PFS_engine_table_share
+table_hosts::m_share=
+{
+  { C_STRING_WITH_LEN("hosts") },
+  &pfs_truncatable_acl,
+  &table_hosts::create,
+  NULL, /* write_row */
+  table_hosts::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table* table_hosts::create()
+{
+  return new table_hosts();
+}
+
+int
+table_hosts::delete_all_rows(void)
+{
+  reset_events_waits_by_thread();
+  reset_events_waits_by_account();
+  reset_events_waits_by_host();
+  reset_events_stages_by_thread();
+  reset_events_stages_by_account();
+  reset_events_stages_by_host();
+  reset_events_statements_by_thread();
+  reset_events_statements_by_account();
+  reset_events_statements_by_host();
+  purge_all_account();
+  purge_all_host();
+  return 0;
+}
+
+table_hosts::table_hosts()
+  : cursor_by_host(& m_share),
+  m_row_exists(false)
+{}
+
+void table_hosts::make_row(PFS_host *pfs)
+{
+  pfs_lock lock;
+
+  m_row_exists= false;
+  pfs->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_host.make_row(pfs))
+    return;
+
+  PFS_connection_stat_visitor visitor;
+  PFS_connection_iterator::visit_host(pfs, true, true, & visitor);
+
+  if (! pfs->m_lock.end_optimistic_lock(& lock))
+    return;
+
+  m_row.m_connection_stat.set(& visitor.m_stat);
+  m_row_exists= true;
+}
+
+int table_hosts::read_row_values(TABLE *table,
+                                 unsigned char *buf,
+                                 Field **fields,
+                                 bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* HOST */
+        m_row.m_host.set_field(f);
+        break;
+      case 1: /* CURRENT_CONNECTIONS */
+      case 2: /* TOTAL_CONNECTIONS */
+        m_row.m_connection_stat.set_field(f->field_index - 1, f);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+  return 0;
+}
+
diff --git a/storage/perfschema/table_hosts.h b/storage/perfschema/table_hosts.h
new file mode 100644
index 00000000000..6fdbf1bb0d9
--- /dev/null
+++ b/storage/perfschema/table_hosts.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_HOSTS_H
+#define TABLE_HOSTS_H
+
+#include "pfs_column_types.h"
+#include "cursor_by_host.h"
+#include "table_helper.h"
+
+struct PFS_host;
+
+/**
+  \addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of PERFORMANCE_SCHEMA.HOSTS.
+*/
+struct row_hosts
+{
+  /** Column HOST. */
+  PFS_host_row m_host;
+  /** Columns CURRENT_CONNECTIONS, TOTAL_CONNECTIONS. */
+  PFS_connection_stat_row m_connection_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.THREADS. */
+class table_hosts : public cursor_by_host
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  /** Table builder */
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+
+protected:
+  table_hosts();
+
+public:
+  ~table_hosts()
+  {}
+
+private:
+  virtual void make_row(PFS_host *pfs);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_hosts m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_os_global_by_type.cc b/storage/perfschema/table_os_global_by_type.cc
new file mode 100644
index 00000000000..82d176cd5b2
--- /dev/null
+++ b/storage/perfschema/table_os_global_by_type.cc
@@ -0,0 +1,270 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_os_global_by_type.cc
+  Table OBJECTS_SUMMARY_GLOBAL_BY_TYPE (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_os_global_by_type.h"
+#include "pfs_global.h"
+
+THR_LOCK table_os_global_by_type::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("OBJECT_TYPE") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_SCHEMA") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_NAME") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_os_global_by_type::m_field_def=
+{ 8, field_types };
+
+PFS_engine_table_share
+table_os_global_by_type::m_share=
+{
+  { C_STRING_WITH_LEN("objects_summary_global_by_type") },
+  &pfs_truncatable_acl,
+  table_os_global_by_type::create,
+  NULL, /* write_row */
+  table_os_global_by_type::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_os_global_by_type),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_os_global_by_type::create(void)
+{
+  return new table_os_global_by_type();
+}
+
+int
+table_os_global_by_type::delete_all_rows(void)
+{
+  reset_table_waits_by_table_handle();
+  reset_table_waits_by_table();
+  return 0;
+}
+
+table_os_global_by_type::table_os_global_by_type()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_os_global_by_type::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_os_global_by_type::rnd_next(void)
+{
+  PFS_table_share *table_share;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_view();
+       m_pos.next_view())
+  {
+    switch (m_pos.m_index_1) {
+    case pos_os_global_by_type::VIEW_TABLE:
+      for ( ; m_pos.m_index_2 < table_share_max; m_pos.m_index_2++)
+      {
+        table_share= &table_share_array[m_pos.m_index_2];
+        if (table_share->m_lock.is_populated())
+        {
+          make_row(table_share);
+          m_next_pos.set_after(&m_pos);
+          return 0;
+        }
+      }
+      break;
+    default:
+      break;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_os_global_by_type::rnd_pos(const void *pos)
+{
+  PFS_table_share *table_share;
+
+  set_position(pos);
+
+  switch (m_pos.m_index_1) {
+  case pos_os_global_by_type::VIEW_TABLE:
+    DBUG_ASSERT(m_pos.m_index_2 < table_share_max);
+    table_share= &table_share_array[m_pos.m_index_2];
+    if (table_share->m_lock.is_populated())
+    {
+      make_row(table_share);
+      return 0;
+    }
+    break;
+  default:
+    break;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_os_global_by_type::make_row(PFS_table_share *share)
+{
+  pfs_lock lock;
+  PFS_single_stat cumulated_stat;
+
+  m_row_exists= false;
+
+  share->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_object_type= share->get_object_type();
+  memcpy(m_row.m_schema_name, share->m_schema_name, share->m_schema_name_length);
+  m_row.m_schema_name_length= share->m_schema_name_length;
+  memcpy(m_row.m_object_name, share->m_table_name, share->m_table_name_length);
+  m_row.m_object_name_length= share->m_table_name_length;
+  share->m_table_stat.sum(& cumulated_stat);
+
+  if (! share->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+
+  if (share->get_refcount() > 0)
+  {
+    /* For all the table handles still opened ... */
+    PFS_table *table= table_array;
+    PFS_table *table_last= table_array + table_max;
+    for ( ; table < table_last ; table++)
+    {
+      if ((table->m_share == share) && (table->m_lock.is_populated()))
+      {
+        /*
+          If the opened table handle is for this table share,
+          aggregate the table handle statistics.
+        */
+        table->m_table_stat.sum(& cumulated_stat);
+      }
+    }
+  }
+
+  time_normalizer *normalizer= time_normalizer::get(wait_timer);
+  m_row.m_stat.set(normalizer, &cumulated_stat);
+}
+
+int table_os_global_by_type::read_row_values(TABLE *table,
+                                             unsigned char *buf,
+                                             Field **fields,
+                                             bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* OBJECT_TYPE */
+        set_field_object_type(f, m_row.m_object_type);
+        break;
+      case 1: /* SCHEMA_NAME */
+        set_field_varchar_utf8(f, m_row.m_schema_name,
+                               m_row.m_schema_name_length);
+        break;
+      case 2: /* OBJECT_NAME */
+        set_field_varchar_utf8(f, m_row.m_object_name,
+                               m_row.m_object_name_length);
+        break;
+      case 3: /* COUNT */
+        set_field_ulonglong(f, m_row.m_stat.m_count);
+        break;
+      case 4: /* SUM */
+        set_field_ulonglong(f, m_row.m_stat.m_sum);
+        break;
+      case 5: /* MIN */
+        set_field_ulonglong(f, m_row.m_stat.m_min);
+        break;
+      case 6: /* AVG */
+        set_field_ulonglong(f, m_row.m_stat.m_avg);
+        break;
+      case 7: /* MAX */
+        set_field_ulonglong(f, m_row.m_stat.m_max);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_os_global_by_type.h b/storage/perfschema/table_os_global_by_type.h
new file mode 100644
index 00000000000..585bf6bbca5
--- /dev/null
+++ b/storage/perfschema/table_os_global_by_type.h
@@ -0,0 +1,130 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_OBJECTS_SUMMARY_GLOBAL_BY_TYPE_H
+#define TABLE_OBJECTS_SUMMARY_GLOBAL_BY_TYPE_H
+
+/**
+  @file storage/perfschema/table_os_global_by_type.h
+  Table OBJECTS_SUMMARY_GLOBAL_BY_TYPE (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.OBJECTS_SUMMARY_GLOBAL_BY_TYPE.
+*/
+struct row_os_global_by_type
+{
+  /** Column OBJECT_TYPE. */
+  enum_object_type m_object_type;
+  /** Column SCHEMA_NAME. */
+  char m_schema_name[NAME_LEN];
+  /** Length in bytes of @c m_schema_name. */
+  uint m_schema_name_length;
+  /** Column OBJECT_NAME. */
+  char m_object_name[NAME_LEN];
+  /** Length in bytes of @c m_object_name. */
+  uint m_object_name_length;
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER_WAIT. */
+  PFS_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.OBJECTS_SUMMARY_GLOBAL_BY_TYPE.
+  Index 1 on object type
+  Index 2 on object instance (0 based)
+*/
+struct pos_os_global_by_type : public PFS_double_index,
+                               public PFS_object_view_constants
+{
+  pos_os_global_by_type()
+    : PFS_double_index(FIRST_VIEW, 0)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= FIRST_VIEW;
+    m_index_2= 0;
+  }
+
+  inline bool has_more_view(void)
+  { return (m_index_1 <= LAST_VIEW); }
+
+  inline void next_view(void)
+  {
+    m_index_1++;
+    m_index_2= 0;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.OBJECTS_SUMMARY_GLOBAL_BY_TYPE. */
+class table_os_global_by_type : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_os_global_by_type();
+
+public:
+  ~table_os_global_by_type()
+  {}
+
+protected:
+  void make_row(PFS_table_share *table_share);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_os_global_by_type m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_os_global_by_type m_pos;
+  /** Next position. */
+  pos_os_global_by_type m_next_pos;
+};
+
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_performance_timers.cc b/storage/perfschema/table_performance_timers.cc
index acd379bc57b..a891d2f04cd 100644
--- a/storage/perfschema/table_performance_timers.cc
+++ b/storage/perfschema/table_performance_timers.cc
@@ -63,6 +63,7 @@ table_performance_timers::m_share=
   &table_performance_timers::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
+  NULL, /* get_row_count */
   COUNT_TIMER_NAME, /* records */
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
diff --git a/storage/perfschema/table_setup_actors.cc b/storage/perfschema/table_setup_actors.cc
new file mode 100644
index 00000000000..15d3d9d22a8
--- /dev/null
+++ b/storage/perfschema/table_setup_actors.cc
@@ -0,0 +1,302 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_setup_actors.cc
+  Table SETUP_ACTORS (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_setup_actor.h"
+#include "table_setup_actors.h"
+#include "pfs_global.h"
+
+THR_LOCK table_setup_actors::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("HOST") },
+    { C_STRING_WITH_LEN("char(60)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("USER") },
+    { C_STRING_WITH_LEN("char(16)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("ROLE") },
+    { C_STRING_WITH_LEN("char(16)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_setup_actors::m_field_def=
+{ 3, field_types };
+
+PFS_engine_table_share
+table_setup_actors::m_share=
+{
+  { C_STRING_WITH_LEN("setup_actors") },
+  &pfs_editable_acl,
+  table_setup_actors::create,
+  table_setup_actors::write_row,
+  table_setup_actors::delete_all_rows,
+  table_setup_actors::get_row_count,
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table* table_setup_actors::create()
+{
+  return new table_setup_actors();
+}
+
+int table_setup_actors::write_row(TABLE *table, unsigned char *buf,
+                                  Field **fields)
+{
+  Field *f;
+  String user_data("%", 1, &my_charset_utf8_bin);
+  String host_data("%", 1, &my_charset_utf8_bin);
+  String role_data("%", 1, &my_charset_utf8_bin);
+  String *user= &user_data;
+  String *host= &host_data;
+  String *role= &role_data;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (bitmap_is_set(table->write_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* HOST */
+        host= get_field_char_utf8(f, &host_data);
+        break;
+      case 1: /* USER */
+        user= get_field_char_utf8(f, &user_data);
+        break;
+      case 2: /* ROLE */
+        role= get_field_char_utf8(f, &role_data);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  return insert_setup_actor(user, host, role);
+}
+
+int table_setup_actors::delete_all_rows(void)
+{
+  return reset_setup_actor();
+}
+
+ha_rows table_setup_actors::get_row_count(void)
+{
+  return setup_actor_count();
+}
+
+table_setup_actors::table_setup_actors()
+  : PFS_engine_table(&m_share, &m_pos),
+  m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+
+void table_setup_actors::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_setup_actors::rnd_next()
+{
+  PFS_setup_actor *pfs;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < setup_actor_max;
+       m_pos.next())
+  {
+    pfs= &setup_actor_array[m_pos.m_index];
+    if (pfs->m_lock.is_populated())
+    {
+      make_row(pfs);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_setup_actors::rnd_pos(const void *pos)
+{
+  PFS_setup_actor *pfs;
+
+  set_position(pos);
+
+  DBUG_ASSERT(m_pos.m_index < setup_actor_max);
+  pfs= &setup_actor_array[m_pos.m_index];
+  if (pfs->m_lock.is_populated())
+  {
+    make_row(pfs);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_setup_actors::make_row(PFS_setup_actor *pfs)
+{
+  pfs_lock lock;
+
+  m_row_exists= false;
+
+  pfs->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_hostname_length= pfs->m_hostname_length;
+  if (unlikely((m_row.m_hostname_length == 0) ||
+               (m_row.m_hostname_length > sizeof(m_row.m_hostname))))
+    return;
+  memcpy(m_row.m_hostname, pfs->m_hostname, m_row.m_hostname_length);
+
+  m_row.m_username_length= pfs->m_username_length;
+  if (unlikely((m_row.m_username_length == 0) ||
+               (m_row.m_username_length > sizeof(m_row.m_username))))
+    return;
+  memcpy(m_row.m_username, pfs->m_username, m_row.m_username_length);
+
+  m_row.m_rolename_length= pfs->m_rolename_length;
+  if (unlikely((m_row.m_rolename_length == 0) ||
+               (m_row.m_rolename_length > sizeof(m_row.m_rolename))))
+    return;
+  memcpy(m_row.m_rolename, pfs->m_rolename, m_row.m_rolename_length);
+
+  if (pfs->m_lock.end_optimistic_lock(&lock))
+    m_row_exists= true;
+}
+
+int table_setup_actors::read_row_values(TABLE *table,
+                                        unsigned char *buf,
+                                        Field **fields,
+                                        bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* HOST */
+        set_field_char_utf8(f, m_row.m_hostname, m_row.m_hostname_length);
+        break;
+      case 1: /* USER */
+        set_field_char_utf8(f, m_row.m_username, m_row.m_username_length);
+        break;
+      case 2: /* ROLE */
+        set_field_char_utf8(f, m_row.m_rolename, m_row.m_rolename_length);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
+int table_setup_actors::update_row_values(TABLE *table,
+                                          const unsigned char *old_buf,
+                                          unsigned char *new_buf,
+                                          Field **fields)
+{
+  Field *f;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (bitmap_is_set(table->write_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* HOST */
+      case 1: /* USER */
+      case 2: /* ROLE */
+        return HA_ERR_WRONG_COMMAND;
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
+int table_setup_actors::delete_row_values(TABLE *table,
+                                          const unsigned char *buf,
+                                          Field **fields)
+{
+  Field *f;
+  String user_data("", 0, &my_charset_utf8_bin);
+  String host_data("", 0, &my_charset_utf8_bin);
+  String role_data("", 0, &my_charset_utf8_bin);
+  String *user= NULL;
+  String *host= NULL;
+  String *role= NULL;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* HOST */
+        host= get_field_char_utf8(f, &host_data);
+        break;
+      case 1: /* USER */
+        user= get_field_char_utf8(f, &user_data);
+        break;
+      case 2: /* ROLE */
+        role= get_field_char_utf8(f, &role_data);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  DBUG_ASSERT(user != NULL);
+  DBUG_ASSERT(host != NULL);
+  DBUG_ASSERT(role != NULL);
+
+  return delete_setup_actor(user, host, role);
+}
+
diff --git a/storage/perfschema/table_setup_actors.h b/storage/perfschema/table_setup_actors.h
new file mode 100644
index 00000000000..be3ab1bdf0d
--- /dev/null
+++ b/storage/perfschema/table_setup_actors.h
@@ -0,0 +1,106 @@
+/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_SETUP_ACTORS_H
+#define TABLE_SETUP_ACTORS_H
+
+/**
+  @file storage/perfschema/table_setup_actors.h
+  Table SETUP_ACTORS (declarations).
+*/
+
+#include "pfs_engine_table.h"
+
+struct PFS_setup_actor;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of PERFORMANCE_SCHEMA.SETUP_ACTORS. */
+struct row_setup_actors
+{
+  /** Column HOST. */
+  char m_hostname[HOSTNAME_LENGTH];
+  /** Length in bytes of @c m_hostname. */
+  uint m_hostname_length;
+  /** Column USER. */
+  char m_username[USERNAME_LENGTH];
+  /** Length in bytes of @c m_username. */
+  uint m_username_length;
+  /** Column ROLE. */
+  char m_rolename[16];
+  /** Length in bytes of @c m_rolename. */
+  uint m_rolename_length;
+};
+
+/** Table PERFORMANCE_SCHEMA.SETUP_ACTORS. */
+class table_setup_actors : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  /** Table builder. */
+  static PFS_engine_table* create();
+  static int write_row(TABLE *table, unsigned char *buf, Field **fields);
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  virtual int update_row_values(TABLE *table,
+                                const unsigned char *old_buf,
+                                unsigned char *new_buf,
+                                Field **fields);
+
+  virtual int delete_row_values(TABLE *table,
+                                const unsigned char *buf,
+                                Field **fields);
+
+  table_setup_actors();
+
+public:
+  ~table_setup_actors()
+  {}
+
+private:
+  void make_row(PFS_setup_actor *actor);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_setup_actors m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_setup_consumers.cc b/storage/perfschema/table_setup_consumers.cc
index 601e0483b14..7b5441b684a 100644
--- a/storage/perfschema/table_setup_consumers.cc
+++ b/storage/perfschema/table_setup_consumers.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -23,41 +23,70 @@
 #include "table_setup_consumers.h"
 #include "pfs_instr.h"
 #include "pfs_events_waits.h"
+#include "pfs_digest.h"
 
-#define COUNT_SETUP_CONSUMERS 8
+#define COUNT_SETUP_CONSUMERS 12
 static row_setup_consumers all_setup_consumers_data[COUNT_SETUP_CONSUMERS]=
 {
   {
-    { C_STRING_WITH_LEN("events_waits_current") },
-    &flag_events_waits_current
+    { C_STRING_WITH_LEN("events_stages_current") },
+    &flag_events_stages_current,
+    false
   },
   {
-    { C_STRING_WITH_LEN("events_waits_history") },
-    &flag_events_waits_history
+    { C_STRING_WITH_LEN("events_stages_history") },
+    &flag_events_stages_history,
+    false
   },
   {
-    { C_STRING_WITH_LEN("events_waits_history_long") },
-    &flag_events_waits_history_long
+    { C_STRING_WITH_LEN("events_stages_history_long") },
+    &flag_events_stages_history_long,
+    false
+  },
+  {
+    { C_STRING_WITH_LEN("events_statements_current") },
+    &flag_events_statements_current,
+    false
   },
   {
-    { C_STRING_WITH_LEN("events_waits_summary_by_thread_by_event_name") },
-    &flag_events_waits_summary_by_thread_by_event_name
+    { C_STRING_WITH_LEN("events_statements_history") },
+    &flag_events_statements_history,
+    false
   },
   {
-    { C_STRING_WITH_LEN("events_waits_summary_by_event_name") },
-    &flag_events_waits_summary_by_event_name
+    { C_STRING_WITH_LEN("events_statements_history_long") },
+    &flag_events_statements_history_long,
+    false
   },
   {
-    { C_STRING_WITH_LEN("events_waits_summary_by_instance") },
-    &flag_events_waits_summary_by_instance
+    { C_STRING_WITH_LEN("events_waits_current") },
+    &flag_events_waits_current,
+    false
   },
   {
-    { C_STRING_WITH_LEN("file_summary_by_event_name") },
-    &flag_file_summary_by_event_name
+    { C_STRING_WITH_LEN("events_waits_history") },
+    &flag_events_waits_history,
+    false
   },
   {
-    { C_STRING_WITH_LEN("file_summary_by_instance") },
-    &flag_file_summary_by_instance
+    { C_STRING_WITH_LEN("events_waits_history_long") },
+    &flag_events_waits_history_long,
+    false
+  },
+  {
+    { C_STRING_WITH_LEN("global_instrumentation") },
+    &flag_global_instrumentation,
+    true
+  },
+  {
+    { C_STRING_WITH_LEN("thread_instrumentation") },
+    &flag_thread_instrumentation,
+    false
+  },
+  {
+    { C_STRING_WITH_LEN("statements_digest") },
+    &flag_statements_digest,
+    false
   }
 };
 
@@ -89,6 +118,7 @@ table_setup_consumers::m_share=
   &table_setup_consumers::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
+  NULL, /* get_row_count */
   COUNT_SETUP_CONSUMERS, /* records */
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
@@ -205,6 +235,9 @@ int table_setup_consumers::update_row_values(TABLE *table,
     }
   }
 
+  if (m_row->m_refresh)
+    update_instruments_derived_flags();
+
   return 0;
 }
 
diff --git a/storage/perfschema/table_setup_consumers.h b/storage/perfschema/table_setup_consumers.h
index 3ef85f6914b..bc7e9d553bb 100644
--- a/storage/perfschema/table_setup_consumers.h
+++ b/storage/perfschema/table_setup_consumers.h
@@ -1,5 +1,4 @@
-/* Copyright (c) 2008 MySQL AB, 2010 Sun Microsystems, Inc.
-   Use is subject to license terms.
+/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -37,6 +36,8 @@ struct row_setup_consumers
   LEX_STRING m_name;
   /** Column ENABLED. */
   bool *m_enabled_ptr;
+  /** Hidden column, refresh. */
+  bool m_refresh;
 };
 
 /** Table PERFORMANCE_SCHEMA.SETUP_CONSUMERS. */
diff --git a/storage/perfschema/table_setup_instruments.cc b/storage/perfschema/table_setup_instruments.cc
index 480c0dbc13f..31e2adb0f62 100644
--- a/storage/perfschema/table_setup_instruments.cc
+++ b/storage/perfschema/table_setup_instruments.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -21,10 +21,12 @@
 #include "my_global.h"
 #include "my_pthread.h"
 #include "pfs_instr_class.h"
+#include "pfs_instr.h"
 #include "pfs_column_types.h"
 #include "pfs_column_values.h"
 #include "table_setup_instruments.h"
 #include "pfs_global.h"
+#include "pfs_setup_object.h"
 
 THR_LOCK table_setup_instruments::m_table_lock;
 
@@ -59,6 +61,7 @@ table_setup_instruments::m_share=
   &table_setup_instruments::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
+  NULL, /* get_row_count */
   1000, /* records */
   sizeof(pos_setup_instruments),
   &m_table_lock,
@@ -84,56 +87,55 @@ void table_setup_instruments::reset_position(void)
 
 int table_setup_instruments::rnd_next(void)
 {
-  PFS_mutex_class *mutex_class;
-  PFS_rwlock_class *rwlock_class;
-  PFS_cond_class *cond_class;
-  PFS_file_class *file_class;
+  PFS_instr_class *instr_class= NULL;
+
+  /* Do not advertise hard coded instruments when disabled. */
+  if (! pfs_initialized)
+    return HA_ERR_END_OF_FILE;
 
   for (m_pos.set_at(&m_next_pos);
        m_pos.has_more_view();
        m_pos.next_view())
   {
-    switch (m_pos.m_index_1) {
+    switch (m_pos.m_index_1)
+    {
     case pos_setup_instruments::VIEW_MUTEX:
-      mutex_class= find_mutex_class(m_pos.m_index_2);
-      if (mutex_class)
-      {
-        make_row(mutex_class);
-        m_next_pos.set_after(&m_pos);
-        return 0;
-      }
+      instr_class= find_mutex_class(m_pos.m_index_2);
       break;
     case pos_setup_instruments::VIEW_RWLOCK:
-      rwlock_class= find_rwlock_class(m_pos.m_index_2);
-      if (rwlock_class)
-      {
-        make_row(rwlock_class);
-        m_next_pos.set_after(&m_pos);
-        return 0;
-      }
+      instr_class= find_rwlock_class(m_pos.m_index_2);
       break;
     case pos_setup_instruments::VIEW_COND:
-      cond_class= find_cond_class(m_pos.m_index_2);
-      if (cond_class)
-      {
-        make_row(cond_class);
-        m_next_pos.set_after(&m_pos);
-        return 0;
-      }
+      instr_class= find_cond_class(m_pos.m_index_2);
       break;
     case pos_setup_instruments::VIEW_THREAD:
-      /* Reserved for WL#4674, PERFORMANCE_SCHEMA Setup For Actors. */
+      /* Not used yet  */
       break;
     case pos_setup_instruments::VIEW_FILE:
-      file_class= find_file_class(m_pos.m_index_2);
-      if (file_class)
-      {
-        make_row(file_class);
-        m_next_pos.set_after(&m_pos);
-        return 0;
-      }
+      instr_class= find_file_class(m_pos.m_index_2);
+      break;
+    case pos_setup_instruments::VIEW_TABLE:
+      instr_class= find_table_class(m_pos.m_index_2);
+      break;
+    case pos_setup_instruments::VIEW_STAGE:
+      instr_class= find_stage_class(m_pos.m_index_2);
+      break;
+    case pos_setup_instruments::VIEW_STATEMENT:
+      instr_class= find_statement_class(m_pos.m_index_2);
+      break;
+    case pos_setup_instruments::VIEW_SOCKET:
+      instr_class= find_socket_class(m_pos.m_index_2);
+      break;
+    case pos_setup_instruments::VIEW_IDLE:
+      instr_class= find_idle_class(m_pos.m_index_2);
       break;
     }
+    if (instr_class)
+    {
+      make_row(instr_class);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
   }
 
   return HA_ERR_END_OF_FILE;
@@ -141,60 +143,59 @@ int table_setup_instruments::rnd_next(void)
 
 int table_setup_instruments::rnd_pos(const void *pos)
 {
-  PFS_mutex_class *mutex_class;
-  PFS_rwlock_class *rwlock_class;
-  PFS_cond_class *cond_class;
-  PFS_file_class *file_class;
+  PFS_instr_class *instr_class= NULL;
+
+  /* Do not advertise hard coded instruments when disabled. */
+  if (! pfs_initialized)
+    return HA_ERR_END_OF_FILE;
 
   set_position(pos);
 
-  switch (m_pos.m_index_1) {
+  switch (m_pos.m_index_1)
+  {
   case pos_setup_instruments::VIEW_MUTEX:
-    mutex_class= find_mutex_class(m_pos.m_index_2);
-    if (mutex_class)
-    {
-      make_row(mutex_class);
-      return 0;
-    }
+    instr_class= find_mutex_class(m_pos.m_index_2);
     break;
   case pos_setup_instruments::VIEW_RWLOCK:
-    rwlock_class= find_rwlock_class(m_pos.m_index_2);
-    if (rwlock_class)
-    {
-      make_row(rwlock_class);
-      return 0;
-    }
+    instr_class= find_rwlock_class(m_pos.m_index_2);
     break;
   case pos_setup_instruments::VIEW_COND:
-    cond_class= find_cond_class(m_pos.m_index_2);
-    if (cond_class)
-    {
-      make_row(cond_class);
-      return 0;
-    }
+    instr_class= find_cond_class(m_pos.m_index_2);
     break;
   case pos_setup_instruments::VIEW_THREAD:
-    /* Reserved for WL#4674, PERFORMANCE_SCHEMA Setup For Actors. */
+    /* Not used yet */
     break;
   case pos_setup_instruments::VIEW_FILE:
-    file_class= find_file_class(m_pos.m_index_2);
-    if (file_class)
-    {
-      make_row(file_class);
-      return 0;
-    }
+    instr_class= find_file_class(m_pos.m_index_2);
+    break;
+  case pos_setup_instruments::VIEW_TABLE:
+    instr_class= find_table_class(m_pos.m_index_2);
+    break;
+  case pos_setup_instruments::VIEW_STAGE:
+    instr_class= find_stage_class(m_pos.m_index_2);
+    break;
+  case pos_setup_instruments::VIEW_STATEMENT:
+    instr_class= find_statement_class(m_pos.m_index_2);
+    break;
+  case pos_setup_instruments::VIEW_SOCKET:
+    instr_class= find_socket_class(m_pos.m_index_2);
+    break;
+  case pos_setup_instruments::VIEW_IDLE:
+    instr_class= find_idle_class(m_pos.m_index_2);
     break;
   }
+  if (instr_class)
+  {
+    make_row(instr_class);
+    return 0;
+  }
 
   return HA_ERR_RECORD_DELETED;
 }
 
 void table_setup_instruments::make_row(PFS_instr_class *klass)
 {
-  m_row.m_name= &klass->m_name[0];
-  m_row.m_name_length= klass->m_name_length;
-  m_row.m_enabled_ptr= &klass->m_enabled;
-  m_row.m_timed_ptr= &klass->m_timed;
+  m_row.m_instr_class= klass;
 }
 
 int table_setup_instruments::read_row_values(TABLE *table,
@@ -218,16 +219,13 @@ int table_setup_instruments::read_row_values(TABLE *table,
       switch(f->field_index)
       {
       case 0: /* NAME */
-        set_field_varchar_utf8(f, m_row.m_name, m_row.m_name_length);
+        set_field_varchar_utf8(f, m_row.m_instr_class->m_name, m_row.m_instr_class->m_name_length);
         break;
       case 1: /* ENABLED */
-        set_field_enum(f, (*m_row.m_enabled_ptr) ? ENUM_YES : ENUM_NO);
+        set_field_enum(f, m_row.m_instr_class->m_enabled ? ENUM_YES : ENUM_NO);
         break;
       case 2: /* TIMED */
-        if (m_row.m_timed_ptr)
-          set_field_enum(f, (*m_row.m_timed_ptr) ? ENUM_YES : ENUM_NO);
-        else
-          set_field_enum(f, ENUM_NO);
+        set_field_enum(f, m_row.m_instr_class->m_timed ? ENUM_YES : ENUM_NO);
         break;
       default:
         DBUG_ASSERT(false);
@@ -256,14 +254,11 @@ int table_setup_instruments::update_row_values(TABLE *table,
         return HA_ERR_WRONG_COMMAND;
       case 1: /* ENABLED */
         value= (enum_yes_no) get_field_enum(f);
-        *m_row.m_enabled_ptr= (value == ENUM_YES) ? true : false;
+        m_row.m_instr_class->m_enabled= (value == ENUM_YES) ? true : false;
         break;
       case 2: /* TIMED */
-        if (m_row.m_timed_ptr)
-        {
-          value= (enum_yes_no) get_field_enum(f);
-          *m_row.m_timed_ptr= (value == ENUM_YES) ? true : false;
-        }
+        value= (enum_yes_no) get_field_enum(f);
+        m_row.m_instr_class->m_timed= (value == ENUM_YES) ? true : false;
         break;
       default:
         DBUG_ASSERT(false);
@@ -271,6 +266,41 @@ int table_setup_instruments::update_row_values(TABLE *table,
     }
   }
 
+  switch (m_pos.m_index_1)
+  {
+    case pos_setup_instruments::VIEW_MUTEX:
+      update_mutex_derived_flags();
+      break;
+    case pos_setup_instruments::VIEW_RWLOCK:
+      update_rwlock_derived_flags();
+      break;
+    case pos_setup_instruments::VIEW_COND:
+      update_cond_derived_flags();
+      break;
+    case pos_setup_instruments::VIEW_THREAD:
+      /* Not used yet  */
+      break;
+    case pos_setup_instruments::VIEW_FILE:
+      update_file_derived_flags();
+      break;
+    case pos_setup_instruments::VIEW_TABLE:
+      update_table_derived_flags();
+      break;
+    case pos_setup_instruments::VIEW_STAGE:
+    case pos_setup_instruments::VIEW_STATEMENT:
+      /* No flag to update. */
+      break;
+    case pos_setup_instruments::VIEW_SOCKET:
+      update_socket_derived_flags();
+      break;
+    case pos_setup_instruments::VIEW_IDLE:
+      /* No flag to update. */
+      break;
+    default:
+      DBUG_ASSERT(false);
+      break;
+  }
+
   return 0;
 }
 
diff --git a/storage/perfschema/table_setup_instruments.h b/storage/perfschema/table_setup_instruments.h
index 42b1a6122c8..cb4c6a06de1 100644
--- a/storage/perfschema/table_setup_instruments.h
+++ b/storage/perfschema/table_setup_instruments.h
@@ -1,5 +1,4 @@
-/* Copyright (c) 2008 MySQL AB, 2010 Sun Microsystems, Inc.
-   Use is subject to license terms.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -33,38 +32,38 @@
 /** A row of PERFORMANCE_SCHEMA.SETUP_INSTRUMENTS. */
 struct row_setup_instruments
 {
-  /** Column NAME. */
-  const char *m_name;
-  /** Length in bytes of @c m_name. */
-  uint m_name_length;
-  /** Column ENABLED. */
-  bool *m_enabled_ptr;
-  /** Column TIMED. */
-  bool *m_timed_ptr;
+  /** Columns NAME, ENABLED, TIMED. */
+  PFS_instr_class *m_instr_class;
 };
 
 /** Position of a cursor on PERFORMANCE_SCHEMA.SETUP_INSTRUMENTS. */
 struct pos_setup_instruments : public PFS_double_index
 {
+  static const uint FIRST_VIEW= 1;
   static const uint VIEW_MUTEX= 1;
   static const uint VIEW_RWLOCK= 2;
   static const uint VIEW_COND= 3;
-  /** Reverved for WL#4674, PERFORMANCE_SCHEMA Setup For Actors. */
   static const uint VIEW_THREAD= 4;
   static const uint VIEW_FILE= 5;
+  static const uint VIEW_TABLE= 6;
+  static const uint VIEW_STAGE= 7;
+  static const uint VIEW_STATEMENT= 8;
+  static const uint VIEW_SOCKET= 9;
+  static const uint VIEW_IDLE= 10;
+  static const uint LAST_VIEW= 10;
 
   pos_setup_instruments()
-    : PFS_double_index(VIEW_MUTEX, 1)
+    : PFS_double_index(FIRST_VIEW, 1)
   {}
 
   inline void reset(void)
   {
-    m_index_1= VIEW_MUTEX;
+    m_index_1= FIRST_VIEW;
     m_index_2= 1;
   }
 
   inline bool has_more_view(void)
-  { return (m_index_1 <= VIEW_FILE); }
+  { return (m_index_1 <= LAST_VIEW); }
 
   inline void next_view(void)
   {
diff --git a/storage/perfschema/table_setup_objects.cc b/storage/perfschema/table_setup_objects.cc
new file mode 100644
index 00000000000..33e360e989b
--- /dev/null
+++ b/storage/perfschema/table_setup_objects.cc
@@ -0,0 +1,382 @@
+/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_setup_objects.cc
+  Table SETUP_OBJECTS (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "pfs_setup_object.h"
+#include "table_setup_objects.h"
+#include "table_helper.h"
+#include "pfs_global.h"
+
+THR_LOCK table_setup_objects::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("OBJECT_TYPE") },
+    { C_STRING_WITH_LEN("enum(\'TABLE\')") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_SCHEMA") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_NAME") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("ENABLED") },
+    { C_STRING_WITH_LEN("enum(\'YES\',\'NO\')") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("TIMED") },
+    { C_STRING_WITH_LEN("enum(\'YES\',\'NO\')") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_setup_objects::m_field_def=
+{ 5, field_types };
+
+PFS_engine_table_share
+table_setup_objects::m_share=
+{
+  { C_STRING_WITH_LEN("setup_objects") },
+  &pfs_editable_acl,
+  table_setup_objects::create,
+  table_setup_objects::write_row,
+  table_setup_objects::delete_all_rows,
+  table_setup_objects::get_row_count,
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+int update_derived_flags()
+{
+  PFS_thread *thread= PFS_thread::get_current_thread();
+  if (unlikely(thread == NULL))
+    return HA_ERR_OUT_OF_MEM;
+
+  update_table_share_derived_flags(thread);
+  update_table_derived_flags();
+  return 0;
+}
+
+PFS_engine_table* table_setup_objects::create(void)
+{
+  return new table_setup_objects();
+}
+
+int table_setup_objects::write_row(TABLE *table, unsigned char *buf,
+                                   Field **fields)
+{
+  int result;
+  Field *f;
+  enum_object_type object_type= OBJECT_TYPE_TABLE;
+  String object_schema_data("%", 1, &my_charset_utf8_bin);
+  String object_name_data("%", 1, &my_charset_utf8_bin);
+  String *object_schema= &object_schema_data;
+  String *object_name= &object_name_data;
+  enum_yes_no enabled_value= ENUM_YES;
+  enum_yes_no timed_value= ENUM_YES;
+  bool enabled= true;
+  bool timed= true;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (bitmap_is_set(table->write_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* OBJECT_TYPE */
+        object_type= (enum_object_type) get_field_enum(f);
+        break;
+      case 1: /* OBJECT_SCHEMA */
+        object_schema= get_field_varchar_utf8(f, &object_schema_data);
+        break;
+      case 2: /* OBJECT_NAME */
+        object_name= get_field_varchar_utf8(f, &object_name_data);
+        break;
+      case 3: /* ENABLED */
+        enabled_value= (enum_yes_no) get_field_enum(f);
+        break;
+      case 4: /* TIMED */
+        timed_value= (enum_yes_no) get_field_enum(f);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  /* Reject illegal enum values in OBJECT_TYPE */
+  if (object_type != OBJECT_TYPE_TABLE)
+    return HA_ERR_NO_REFERENCED_ROW;
+
+  /* Reject illegal enum values in ENABLED */
+  if ((enabled_value != ENUM_YES) && (enabled_value != ENUM_NO))
+    return HA_ERR_NO_REFERENCED_ROW;
+
+  /* Reject illegal enum values in TIMED */
+  if ((timed_value != ENUM_YES) && (timed_value != ENUM_NO))
+    return HA_ERR_NO_REFERENCED_ROW;
+
+  enabled= (enabled_value == ENUM_YES) ? true : false;
+  timed= (timed_value == ENUM_YES) ? true : false;
+
+  result= insert_setup_object(object_type, object_schema, object_name,
+                              enabled, timed);
+  if (result == 0)
+    result= update_derived_flags();
+  return result;
+}
+
+int table_setup_objects::delete_all_rows(void)
+{
+  int result= reset_setup_object();
+  if (result == 0)
+    result= update_derived_flags();
+  return result;
+}
+
+ha_rows table_setup_objects::get_row_count(void)
+{
+  return setup_object_count();
+}
+
+table_setup_objects::table_setup_objects()
+  : PFS_engine_table(&m_share, &m_pos),
+  m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+
+void table_setup_objects::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_setup_objects::rnd_next(void)
+{
+  PFS_setup_object *pfs;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < setup_object_max;
+       m_pos.next())
+  {
+    pfs= &setup_object_array[m_pos.m_index];
+    if (pfs->m_lock.is_populated())
+    {
+      make_row(pfs);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_setup_objects::rnd_pos(const void *pos)
+{
+  PFS_setup_object *pfs;
+
+  set_position(pos);
+
+  DBUG_ASSERT(m_pos.m_index < setup_object_max);
+  pfs= &setup_object_array[m_pos.m_index];
+  if (pfs->m_lock.is_populated())
+  {
+    make_row(pfs);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_setup_objects::make_row(PFS_setup_object *pfs)
+{
+  pfs_lock lock;
+
+  m_row_exists= false;
+
+  pfs->m_lock.begin_optimistic_lock(&lock);
+
+  m_row.m_object_type= pfs->get_object_type();
+  memcpy(m_row.m_schema_name, pfs->m_schema_name, pfs->m_schema_name_length);
+  m_row.m_schema_name_length= pfs->m_schema_name_length;
+  memcpy(m_row.m_object_name, pfs->m_object_name, pfs->m_object_name_length);
+  m_row.m_object_name_length= pfs->m_object_name_length;
+  m_row.m_enabled_ptr= &pfs->m_enabled;
+  m_row.m_timed_ptr= &pfs->m_timed;
+
+  if (pfs->m_lock.end_optimistic_lock(&lock))
+    m_row_exists= true;
+}
+
+int table_setup_objects::read_row_values(TABLE *table,
+                                         unsigned char *buf,
+                                         Field **fields,
+                                         bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* OBJECT_TYPE */
+        set_field_enum(f, m_row.m_object_type);
+        break;
+      case 1: /* OBJECT_SCHEMA */
+        if (m_row.m_schema_name_length)
+          set_field_varchar_utf8(f, m_row.m_schema_name,
+                                 m_row.m_schema_name_length);
+        else
+          f->set_null();
+        break;
+      case 2: /* OBJECT_NAME */
+        if (m_row.m_object_name_length)
+          set_field_varchar_utf8(f, m_row.m_object_name,
+                                 m_row.m_object_name_length);
+        else
+          f->set_null();
+        break;
+      case 3: /* ENABLED */
+        set_field_enum(f, (*m_row.m_enabled_ptr) ? ENUM_YES : ENUM_NO);
+        break;
+      case 4: /* TIMED */
+        set_field_enum(f, (*m_row.m_timed_ptr) ? ENUM_YES : ENUM_NO);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
+int table_setup_objects::update_row_values(TABLE *table,
+                                           const unsigned char *,
+                                           unsigned char *,
+                                           Field **fields)
+{
+  int result;
+  Field *f;
+  enum_yes_no value;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (bitmap_is_set(table->write_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* OBJECT_TYPE */
+      case 1: /* OBJECT_SCHEMA */
+      case 2: /* OBJECT_NAME */
+        return HA_ERR_WRONG_COMMAND;
+      case 3: /* ENABLED */
+        value= (enum_yes_no) get_field_enum(f);
+        /* Reject illegal enum values in ENABLED */
+        if ((value != ENUM_YES) && (value != ENUM_NO))
+          return HA_ERR_NO_REFERENCED_ROW;
+        *m_row.m_enabled_ptr= (value == ENUM_YES) ? true : false;
+        break;
+      case 4: /* TIMED */
+        value= (enum_yes_no) get_field_enum(f);
+        /* Reject illegal enum values in TIMED */
+        if ((value != ENUM_YES) && (value != ENUM_NO))
+          return HA_ERR_NO_REFERENCED_ROW;
+        *m_row.m_timed_ptr= (value == ENUM_YES) ? true : false;
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  result= update_derived_flags();
+  return result;
+}
+
+int table_setup_objects::delete_row_values(TABLE *table,
+                                           const unsigned char *buf,
+                                           Field **fields)
+{
+  int result;
+  Field *f;
+  enum_object_type object_type= OBJECT_TYPE_TABLE;
+  String object_schema_data("", 0, &my_charset_utf8_bin);
+  String object_name_data("", 0, &my_charset_utf8_bin);
+  String *object_schema= NULL;
+  String *object_name= NULL;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* OBJECT_TYPE */
+        object_type= (enum_object_type) get_field_enum(f);
+        break;
+      case 1: /* OBJECT_SCHEMA */
+        object_schema= get_field_varchar_utf8(f, &object_schema_data);
+        break;
+      case 2: /* OBJECT_NAME */
+        object_name= get_field_varchar_utf8(f, &object_name_data);
+        break;
+      case 3: /* ENABLED */
+      case 4: /* TIMED */
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  DBUG_ASSERT(object_schema != NULL);
+  DBUG_ASSERT(object_name != NULL);
+
+  result= delete_setup_object(object_type, object_schema, object_name);
+  if (result == 0)
+    result= update_derived_flags();
+  return result;
+}
+
diff --git a/storage/perfschema/table_setup_objects.h b/storage/perfschema/table_setup_objects.h
new file mode 100644
index 00000000000..4b31fa6a8a6
--- /dev/null
+++ b/storage/perfschema/table_setup_objects.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_SETUP_OBJECTS_H
+#define TABLE_SETUP_OBJECTS_H
+
+/**
+  @file storage/perfschema/table_setup_objects.h
+  Table SETUP_OBJECTS (declarations).
+*/
+
+#include "pfs_engine_table.h"
+#include "table_helper.h"
+
+struct PFS_setup_object;
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of PERFORMANCE_SCHEMA.SETUP_OBJECTS. */
+struct row_setup_objects
+{
+  /** Column OBJECT_TYPE. */
+  enum_object_type m_object_type;
+  /** Column SCHEMA_NAME. */
+  char m_schema_name[NAME_LEN];
+  /** Length in bytes of @c m_schema_name. */
+  uint m_schema_name_length;
+  /** Column OBJECT_NAME. */
+  char m_object_name[NAME_LEN];
+  /** Length in bytes of @c m_object_name. */
+  uint m_object_name_length;
+  /** Column ENABLED. */
+  bool *m_enabled_ptr;
+  /** Column TIMED. */
+  bool *m_timed_ptr;
+};
+
+/** Table PERFORMANCE_SCHEMA.SETUP_OBJECTS. */
+class table_setup_objects : public PFS_engine_table
+{
+public:
+  /** Table share. */
+  static PFS_engine_table_share m_share;
+  /** Table builder. */
+  static PFS_engine_table* create();
+  static int write_row(TABLE *table, unsigned char *buf, Field **fields);
+  static int delete_all_rows();
+  static ha_rows get_row_count();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  virtual int update_row_values(TABLE *table,
+                                const unsigned char *old_buf,
+                                unsigned char *new_buf,
+                                Field **fields);
+
+  virtual int delete_row_values(TABLE *table,
+                                const unsigned char *buf,
+                                Field **fields);
+
+  table_setup_objects();
+
+public:
+  ~table_setup_objects()
+  {}
+
+private:
+  void make_row(PFS_setup_object *pfs);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_setup_objects m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_setup_timers.cc b/storage/perfschema/table_setup_timers.cc
index f8b1bfa4fe2..a6a8a52b7e5 100644
--- a/storage/perfschema/table_setup_timers.cc
+++ b/storage/perfschema/table_setup_timers.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -24,12 +24,25 @@
 #include "pfs_column_values.h"
 #include "pfs_timer.h"
 
-#define COUNT_SETUP_TIMERS 1
+#define COUNT_SETUP_TIMERS 4
+
 static row_setup_timers all_setup_timers_data[COUNT_SETUP_TIMERS]=
 {
   {
+    { C_STRING_WITH_LEN("idle") },
+    &idle_timer
+  },
+  {
     { C_STRING_WITH_LEN("wait") },
     &wait_timer
+  },
+  {
+    { C_STRING_WITH_LEN("stage") },
+    &stage_timer
+  },
+  {
+    { C_STRING_WITH_LEN("statement") },
+    &statement_timer
   }
 };
 
@@ -62,6 +75,7 @@ table_setup_timers::m_share=
   &table_setup_timers::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
+  NULL, /* get_row_count */
   COUNT_SETUP_TIMERS,
   sizeof(PFS_simple_index),
   &m_table_lock,
diff --git a/storage/perfschema/table_setup_timers.h b/storage/perfschema/table_setup_timers.h
index c147de63a21..a81e6fefaaf 100644
--- a/storage/perfschema/table_setup_timers.h
+++ b/storage/perfschema/table_setup_timers.h
@@ -1,5 +1,4 @@
-/* Copyright (c) 2008 MySQL AB, 2010 Sun Microsystems, Inc.
-   Use is subject to license terms.
+/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/storage/perfschema/table_socket_instances.cc b/storage/perfschema/table_socket_instances.cc
new file mode 100644
index 00000000000..f913c8fcc65
--- /dev/null
+++ b/storage/perfschema/table_socket_instances.cc
@@ -0,0 +1,232 @@
+/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_socket_instances.cc
+  Table SOCKET_INSTANCES (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_socket_instances.h"
+#include "pfs_global.h"
+
+THR_LOCK table_socket_instances::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_INSTANCE_BEGIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("THREAD_ID") },
+    { C_STRING_WITH_LEN("int(11)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SOCKET_ID") },
+    { C_STRING_WITH_LEN("int(11)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("IP") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("PORT") },
+    { C_STRING_WITH_LEN("int(11)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("STATE") },
+    { C_STRING_WITH_LEN("enum('IDLE','ACTIVE')") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_socket_instances::m_field_def=
+{ 7, field_types };
+
+PFS_engine_table_share
+table_socket_instances::m_share=
+{
+  { C_STRING_WITH_LEN("socket_instances") },
+  &pfs_readonly_acl,
+  &table_socket_instances::create,
+  NULL, /* write_row */
+  NULL, /* delete_all_rows */
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table* table_socket_instances::create(void)
+{
+  return new table_socket_instances();
+}
+
+table_socket_instances::table_socket_instances()
+  : PFS_engine_table(&m_share, &m_pos),
+  m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+
+void table_socket_instances::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_socket_instances::rnd_next(void)
+{
+  PFS_socket *pfs;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < socket_max;
+       m_pos.next())
+  {
+    pfs= &socket_array[m_pos.m_index];
+    if (pfs->m_lock.is_populated())
+    {
+      make_row(pfs);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_socket_instances::rnd_pos(const void *pos)
+{
+  PFS_socket *pfs;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index < socket_max);
+  pfs= &socket_array[m_pos.m_index];
+
+  if (! pfs->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(pfs);
+  return 0;
+}
+
+void table_socket_instances::make_row(PFS_socket *pfs)
+{
+  pfs_lock lock;
+  PFS_socket_class *safe_class;
+
+  m_row_exists= false;
+
+  /* Protect this reader against a socket delete */
+  pfs->m_lock.begin_optimistic_lock(&lock);
+
+  safe_class= sanitize_socket_class(pfs->m_class);
+  if (unlikely(safe_class == NULL))
+    return;
+
+  /** Extract ip address and port from raw address */
+  m_row.m_ip_length= pfs_get_socket_address(m_row.m_ip, sizeof(m_row.m_ip),
+                                            &m_row.m_port,
+                                            &pfs->m_sock_addr, pfs->m_addr_len);
+  m_row.m_event_name=        safe_class->m_name;
+  m_row.m_event_name_length= safe_class->m_name_length;
+  m_row.m_identity=          pfs->m_identity;
+  m_row.m_fd=                pfs->m_fd;
+  m_row.m_state=             (pfs->m_idle ? PSI_SOCKET_STATE_IDLE
+                                          : PSI_SOCKET_STATE_ACTIVE);
+  PFS_thread *safe_thread= sanitize_thread(pfs->m_thread_owner);
+
+  if (safe_thread != NULL)
+  {
+    m_row.m_thread_id= safe_thread->m_thread_internal_id;
+    m_row.m_thread_id_set= true;
+  }
+  else
+    m_row.m_thread_id_set= false;
+
+
+  if (pfs->m_lock.end_optimistic_lock(&lock))
+    m_row_exists= true;
+}
+
+int table_socket_instances::read_row_values(TABLE *table,
+                                          unsigned char *buf,
+                                          Field **fields,
+                                          bool read_all)
+{
+  Field *f;
+
+  if (unlikely(!m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* EVENT_NAME */
+        set_field_varchar_utf8(f, m_row.m_event_name, m_row.m_event_name_length);
+        break;
+      case 1: /* OBJECT_INSTANCE_BEGIN */
+        set_field_ulonglong(f, (intptr)m_row.m_identity);
+        break;
+      case 2: /* THREAD_ID */
+        if (m_row.m_thread_id_set)
+          set_field_ulong(f, m_row.m_thread_id);
+        else
+          f->set_null();
+        break;
+      case 3: /* SOCKET_ID */
+        set_field_ulong(f, m_row.m_fd);
+        break;
+      case 4: /* IP */
+        set_field_varchar_utf8(f, m_row.m_ip, m_row.m_ip_length);
+        break;
+      case 5: /* PORT */
+        set_field_ulong(f, m_row.m_port);
+        break;
+      case 6: /* STATE */
+        set_field_enum(f, m_row.m_state);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_socket_instances.h b/storage/perfschema/table_socket_instances.h
new file mode 100644
index 00000000000..2a80aeaa76a
--- /dev/null
+++ b/storage/perfschema/table_socket_instances.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_SOCKET_INSTANCES_H
+#define TABLE_SOCKET_INSTANCES_H
+
+/**
+  @file storage/perfschema/table_socket_instances.h
+  Table SOCKET_INSTANCES (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/** A row of PERFORMANCE_SCHEMA.SOCKET_INSTANCES. */
+struct row_socket_instances
+{
+  /** Column EVENT_NAME. */
+  const char *m_event_name;
+  /** Length in bytes of @c m_event_name. */
+  uint m_event_name_length;
+  /** Column OBJECT_INSTANCE_BEGIN */
+  const void *m_identity;
+  /** Column THREAD_ID */
+  uint m_thread_id;
+  /** True if thread_is is set */
+  bool m_thread_id_set;
+  /** Column SOCKET_ID */
+  uint m_fd;
+  /** Socket ip address, IPV4 or IPV6 */
+  char m_ip[INET6_ADDRSTRLEN+1];
+  /** Length in bytes of @c m_ip. */
+  uint m_ip_length;
+  /** Column PORT */
+  uint m_port;
+  /** Socket state: ACTIVE or IDLE */
+  PSI_socket_state m_state;
+
+  row_socket_instances() {m_thread_id_set= false;}
+};
+
+/** Table PERFORMANCE_SCHEMA.SOCKET_INSTANCES. */
+class table_socket_instances : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+private:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_socket_instances();
+
+public:
+  ~table_socket_instances()
+  {}
+
+private:
+  void make_row(PFS_socket *pfs);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_socket_instances m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_socket_summary_by_event_name.cc b/storage/perfschema/table_socket_summary_by_event_name.cc
new file mode 100644
index 00000000000..0262178e5b5
--- /dev/null
+++ b/storage/perfschema/table_socket_summary_by_event_name.cc
@@ -0,0 +1,348 @@
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_socket_summary_by_event_name.cc
+  Table SOCKET_EVENT_NAMES (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_socket_summary_by_event_name.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_socket_summary_by_event_name::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Read */
+  {
+    { C_STRING_WITH_LEN("COUNT_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NUMBER_OF_BYTES_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Write */
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NUMBER_OF_BYTES_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Misc */
+  {
+    { C_STRING_WITH_LEN("COUNT_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_socket_summary_by_event_name::m_field_def=
+{ 23, field_types };
+
+PFS_engine_table_share
+table_socket_summary_by_event_name::m_share=
+{
+  { C_STRING_WITH_LEN("socket_summary_by_event_name") },
+  &pfs_readonly_acl,
+  &table_socket_summary_by_event_name::create,
+  NULL, /* write_row */
+  table_socket_summary_by_event_name::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table* table_socket_summary_by_event_name::create(void)
+{
+  return new table_socket_summary_by_event_name();
+}
+
+table_socket_summary_by_event_name::table_socket_summary_by_event_name()
+  : PFS_engine_table(&m_share, &m_pos),
+  m_row_exists(false), m_pos(1), m_next_pos(1)
+{}
+
+int table_socket_summary_by_event_name::delete_all_rows(void)
+{
+  reset_socket_instance_io();
+  reset_socket_class_io();
+  return 0;
+}
+
+void table_socket_summary_by_event_name::reset_position(void)
+{
+  m_pos.m_index= 1;
+  m_next_pos.m_index= 1;
+}
+
+int table_socket_summary_by_event_name::rnd_next(void)
+{
+  PFS_socket_class *socket_class;
+
+  m_pos.set_at(&m_next_pos);
+
+  socket_class= find_socket_class(m_pos.m_index);
+  if (socket_class)
+  {
+    make_row(socket_class);
+    m_next_pos.set_after(&m_pos);
+    return 0;
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_socket_summary_by_event_name::rnd_pos(const void *pos)
+{
+  PFS_socket_class *socket_class;
+
+  set_position(pos);
+
+  socket_class= find_socket_class(m_pos.m_index);
+  if (socket_class)
+  {
+    make_row(socket_class);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_socket_summary_by_event_name::make_row(PFS_socket_class *socket_class)
+{
+  m_row.m_event_name.make_row(socket_class);
+
+  PFS_instance_socket_io_stat_visitor visitor;
+  PFS_instance_iterator::visit_socket_instances(socket_class, &visitor);
+
+  time_normalizer *normalizer= time_normalizer::get(wait_timer);
+  
+  /* Collect timer and byte count stats */
+  m_row.m_io_stat.set(normalizer, &visitor.m_socket_io_stat);
+  m_row_exists= true;
+}
+
+int table_socket_summary_by_event_name::read_row_values(TABLE *table,
+                                          unsigned char *,
+                                          Field **fields,
+                                          bool read_all)
+{
+  Field *f;
+
+  if (unlikely(!m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 0);
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case  0: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      case  1: /* COUNT_STAR */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_count);
+        break;
+      case  2: /* SUM_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_sum);
+        break;
+      case  3: /* MIN_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_min);
+        break;
+      case  4: /* AVG_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_avg);
+        break;
+      case  5: /* MAX_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_max);
+        break;
+
+      case  6: /* COUNT_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_count);
+        break;
+      case  7: /* SUM_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_sum);
+        break;
+      case  8: /* MIN_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_min);
+        break;
+      case  9: /* AVG_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_avg);
+        break;
+      case 10: /* MAX_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_max);
+        break;
+      case 11: /* SUM_NUMBER_OF_BYTES_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_bytes);
+        break;
+
+      case 12: /* COUNT_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_count);
+        break;
+      case 13: /* SUM_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_sum);
+        break;
+      case 14: /* MIN_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_min);
+        break;
+      case 15: /* AVG_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_avg);
+        break;
+      case 16: /* MAX_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_max);
+        break;
+      case 17: /* SUM_NUMBER_OF_BYTES_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_bytes);
+        break;
+
+      case 18: /* COUNT_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_count);
+        break;
+      case 19: /* SUM_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_sum);
+        break;
+      case 20: /* MIN_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_min);
+        break;
+      case 21: /* AVG_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_avg);
+        break;
+      case 22: /* MAX_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_max);
+        break;
+
+      default:
+        DBUG_ASSERT(false);
+        break;
+      }
+    } // if
+  } // for
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_socket_summary_by_event_name.h b/storage/perfschema/table_socket_summary_by_event_name.h
new file mode 100644
index 00000000000..b34bed41f83
--- /dev/null
+++ b/storage/perfschema/table_socket_summary_by_event_name.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_SOCKET_SUMMARY_BY_EVENT_NAME_H
+#define TABLE_SOCKET_SUMMARY_BY_EVENT_NAME_H
+
+/**
+  @file storage/perfschema/table_socket_summary_by_event_name.h
+  Table SOCKET_SUMMARY_BY_EVENT_NAME (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.SOCKET_SUMMARY_BY_EVENT_NAME.
+*/
+struct row_socket_summary_by_event_name
+{
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER and NUMBER_OF_BYTES for each operation. */
+  PFS_socket_io_stat_row m_io_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.SOCKET_SUMMARY_BY_EVENT_NAME. */
+class table_socket_summary_by_event_name : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+private:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_socket_summary_by_event_name();
+
+public:
+  ~table_socket_summary_by_event_name()
+  {}
+
+private:
+  void make_row(PFS_socket_class *socket_class);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_socket_summary_by_event_name m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_socket_summary_by_instance.cc b/storage/perfschema/table_socket_summary_by_instance.cc
new file mode 100644
index 00000000000..5b177cb3dce
--- /dev/null
+++ b/storage/perfschema/table_socket_summary_by_instance.cc
@@ -0,0 +1,369 @@
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/**
+  @file storage/perfschema/table_socket_summary_by_instance.cc
+  Table SOCKET_SUMMARY_BY_INSTANCE (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_socket_summary_by_instance.h"
+#include "pfs_global.h"
+
+THR_LOCK table_socket_summary_by_instance::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("EVENT_NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_INSTANCE_BEGIN") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Read */
+  {
+    { C_STRING_WITH_LEN("COUNT_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NUMBER_OF_BYTES_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Write */
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_NUMBER_OF_BYTES_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+
+  /** Misc */
+  {
+    { C_STRING_WITH_LEN("COUNT_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_MISC") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_socket_summary_by_instance::m_field_def=
+{ 24, field_types };
+
+PFS_engine_table_share
+table_socket_summary_by_instance::m_share=
+{
+  { C_STRING_WITH_LEN("socket_summary_by_instance") },
+  &pfs_readonly_acl,
+  &table_socket_summary_by_instance::create,
+  NULL, /* write_row */
+  table_socket_summary_by_instance::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table* table_socket_summary_by_instance::create(void)
+{
+  return new table_socket_summary_by_instance();
+}
+
+table_socket_summary_by_instance::table_socket_summary_by_instance()
+  : PFS_engine_table(&m_share, &m_pos),
+  m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+
+int table_socket_summary_by_instance::delete_all_rows(void)
+{
+  reset_socket_instance_io();
+  return 0;
+}
+
+void table_socket_summary_by_instance::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_socket_summary_by_instance::rnd_next(void)
+{
+  PFS_socket *pfs;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < socket_max;
+       m_pos.next())
+  {
+    pfs= &socket_array[m_pos.m_index];
+    if (pfs->m_lock.is_populated())
+    {
+      make_row(pfs);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int table_socket_summary_by_instance::rnd_pos(const void *pos)
+{
+  PFS_socket *pfs;
+
+  set_position(pos);
+  DBUG_ASSERT(m_pos.m_index < socket_max);
+  pfs= &socket_array[m_pos.m_index];
+
+  if (! pfs->m_lock.is_populated())
+    return HA_ERR_RECORD_DELETED;
+
+  make_row(pfs);
+  return 0;
+}
+
+void table_socket_summary_by_instance::make_row(PFS_socket *pfs)
+{
+  pfs_lock lock;
+  PFS_socket_class *safe_class;
+
+  m_row_exists= false;
+
+  /* Protect this reader against a socket delete */
+  pfs->m_lock.begin_optimistic_lock(&lock);
+
+  safe_class= sanitize_socket_class(pfs->m_class);
+  if (unlikely(safe_class == NULL))
+    return;
+
+  m_row.m_event_name.make_row(safe_class);
+  m_row.m_identity= pfs->m_identity;
+
+  time_normalizer *normalizer= time_normalizer::get(wait_timer);
+
+  /* Collect timer and byte count stats */
+  m_row.m_io_stat.set(normalizer, &pfs->m_socket_stat.m_io_stat);
+
+  if (!pfs->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+}
+
+int table_socket_summary_by_instance::read_row_values(TABLE *table,
+                                          unsigned char *,
+                                          Field **fields,
+                                          bool read_all)
+{
+  Field *f;
+
+  if (unlikely(!m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 0);
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case  0: /* EVENT_NAME */
+        m_row.m_event_name.set_field(f);
+        break;
+      case  1: /* OBJECT_INSTANCE */
+        set_field_ulonglong(f, (ulonglong)m_row.m_identity);
+        break;
+
+      case  2:/* COUNT_STAR */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_count);
+        break;
+      case  3:/* SUM_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_sum);
+        break;
+      case  4: /* MIN_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_min);
+        break;
+      case  5: /* AVG_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_avg);
+        break;
+      case  6: /* MAX_TIMER_WAIT */
+        set_field_ulonglong(f, m_row.m_io_stat.m_all.m_waits.m_max);
+        break;
+
+      case  7: /* COUNT_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_count);
+        break;
+      case  8: /* SUM_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_sum);
+        break;
+      case  9: /* MIN_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_min);
+        break;
+      case 10: /* AVG_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_avg);
+        break;
+      case 11: /* MAX_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_waits.m_max);
+        break;
+      case 12: /* SUM_NUMBER_OF_BYTES_READ */
+        set_field_ulonglong(f, m_row.m_io_stat.m_read.m_bytes);
+        break;
+
+      case 13: /* COUNT_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_count);
+        break;
+      case 14: /* SUM_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_sum);
+        break;
+      case 15: /* MIN_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_min);
+        break;
+      case 16: /* AVG_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_avg);
+        break;
+      case 17: /* MAX_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_waits.m_max);
+        break;
+      case 18: /* SUM_NUMBER_OF_BYTES_WRITE */
+        set_field_ulonglong(f, m_row.m_io_stat.m_write.m_bytes);
+        break;
+
+      case 19: /* COUNT_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_count);
+        break;
+      case 20: /* SUM_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_sum);
+        break;
+      case 21: /* MIN_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_min);
+        break;
+      case 22: /* AVG_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_avg);
+        break;
+      case 23: /* MAX_TIMER_MISC */
+        set_field_ulonglong(f, m_row.m_io_stat.m_misc.m_waits.m_max);
+        break;
+      default:
+        DBUG_ASSERT(false);
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
diff --git a/storage/perfschema/table_socket_summary_by_instance.h b/storage/perfschema/table_socket_summary_by_instance.h
new file mode 100644
index 00000000000..f4c8ea41d8c
--- /dev/null
+++ b/storage/perfschema/table_socket_summary_by_instance.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_SOCKET_SUMMARY_BY_INSTANCE_H
+#define TABLE_SOCKET_SUMMARY_BY_INSTANCE_H
+
+/**
+  @file storage/perfschema/table_socket_summary_by_instance.h
+  Table SOCKET_SUMMARY_BY_INSTANCE (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.SOCKET_SUMMARY_BY_INSTANCE.
+*/
+struct row_socket_summary_by_instance
+{
+  /** Column EVENT_NAME. */
+  PFS_event_name_row m_event_name;
+
+  /** Column OBJECT_INSTANCE_BEGIN */
+  const void *m_identity;
+
+  /** Columns COUNT_STAR, SUM/MIN/AVG/MAX TIMER and NUMBER_OF_BYTES for each operation. */
+  PFS_socket_io_stat_row m_io_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.SOCKET_SUMMARY_BY_INSTANCE. */
+class table_socket_summary_by_instance : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+private:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_socket_summary_by_instance();
+
+public:
+  ~table_socket_summary_by_instance()
+  {}
+
+private:
+  void make_row(PFS_socket *pfs);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_socket_summary_by_instance m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_sync_instances.cc b/storage/perfschema/table_sync_instances.cc
index f2bd9fa1a28..9631c5fb205 100644
--- a/storage/perfschema/table_sync_instances.cc
+++ b/storage/perfschema/table_sync_instances.cc
@@ -60,6 +60,7 @@ table_mutex_instances::m_share=
   &table_mutex_instances::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
+  NULL, /* get_row_count */
   1000, /* records */
   sizeof(PFS_simple_index),
   &m_table_lock,
@@ -228,6 +229,7 @@ table_rwlock_instances::m_share=
   &table_rwlock_instances::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
+  NULL, /* get_row_count */
   1000, /* records */
   sizeof(PFS_simple_index),
   &m_table_lock,
@@ -393,6 +395,7 @@ table_cond_instances::m_share=
   &table_cond_instances::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
+  NULL, /* get_row_count */
   1000, /* records */
   sizeof(PFS_simple_index),
   &m_table_lock,
diff --git a/storage/perfschema/table_sync_instances.h b/storage/perfschema/table_sync_instances.h
index 3c359852338..b6fc78e1cd5 100644
--- a/storage/perfschema/table_sync_instances.h
+++ b/storage/perfschema/table_sync_instances.h
@@ -82,7 +82,7 @@ private:
 
   /** Current row. */
   row_mutex_instances m_row;
-  /** True is the current row exists. */
+  /** True if the current row exists. */
   bool m_row_exists;
   /** Current position. */
   PFS_simple_index m_pos;
@@ -141,7 +141,7 @@ private:
 
   /** Current row. */
   row_rwlock_instances m_row;
-  /** True is the current row exists. */
+  /** True if the current row exists. */
   bool m_row_exists;
   /** Current position. */
   PFS_simple_index m_pos;
@@ -194,7 +194,7 @@ private:
 
   /** Current row. */
   row_cond_instances m_row;
-  /** True is the current row exists. */
+  /** True if the current row exists. */
   bool m_row_exists;
   /** Current position. */
   PFS_simple_index m_pos;
diff --git a/storage/perfschema/table_threads.cc b/storage/perfschema/table_threads.cc
index 541ba860386..91300d6b67e 100644
--- a/storage/perfschema/table_threads.cc
+++ b/storage/perfschema/table_threads.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -13,14 +13,10 @@
   along with this program; if not, write to the Free Software Foundation,
   51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
 
-/**
-  @file storage/perfschema/table_threads.cc
-  Table THREADS (implementation).
-*/
-
 #include "my_global.h"
 #include "my_pthread.h"
 #include "table_threads.h"
+#include "sql_parse.h"
 #include "pfs_instr_class.h"
 #include "pfs_instr.h"
 
@@ -34,29 +30,85 @@ static const TABLE_FIELD_TYPE field_types[]=
     { NULL, 0}
   },
   {
+    { C_STRING_WITH_LEN("NAME") },
+    { C_STRING_WITH_LEN("varchar(128)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("TYPE") },
+    { C_STRING_WITH_LEN("varchar(10)") },
+    { NULL, 0}
+  },
+  {
     { C_STRING_WITH_LEN("PROCESSLIST_ID") },
     { C_STRING_WITH_LEN("int(11)") },
     { NULL, 0}
   },
   {
-    { C_STRING_WITH_LEN("NAME") },
-    { C_STRING_WITH_LEN("varchar(128)") },
+    { C_STRING_WITH_LEN("PROCESSLIST_USER") },
+    { C_STRING_WITH_LEN("varchar(16)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("PROCESSLIST_HOST") },
+    { C_STRING_WITH_LEN("varchar(60)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("PROCESSLIST_DB") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("PROCESSLIST_COMMAND") },
+    { C_STRING_WITH_LEN("varchar(16)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("PROCESSLIST_TIME") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("PROCESSLIST_STATE") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("PROCESSLIST_INFO") },
+    { C_STRING_WITH_LEN("longtext") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("PARENT_THREAD_ID") },
+    { C_STRING_WITH_LEN("int(11)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("ROLE") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("INSTRUMENTED") },
+    { C_STRING_WITH_LEN("enum(\'YES\',\'NO\')") },
     { NULL, 0}
   }
 };
 
 TABLE_FIELD_DEF
 table_threads::m_field_def=
-{ 3, field_types };
+{ 14, field_types };
 
 PFS_engine_table_share
 table_threads::m_share=
 {
   { C_STRING_WITH_LEN("threads") },
-  &pfs_readonly_acl,
+  &pfs_updatable_acl,
   &table_threads::create,
   NULL, /* write_row */
   NULL, /* delete_all_rows */
+  NULL, /* get_row_count */
   1000, /* records */
   sizeof(PFS_simple_index), /* ref length */
   &m_table_lock,
@@ -64,58 +116,16 @@ table_threads::m_share=
   false /* checked */
 };
 
-PFS_engine_table* table_threads::create(void)
+PFS_engine_table* table_threads::create()
 {
   return new table_threads();
 }
 
 table_threads::table_threads()
-  : PFS_engine_table(&m_share, &m_pos),
-  m_row_exists(false), m_pos(0), m_next_pos(0)
+  : cursor_by_thread(& m_share),
+  m_row_exists(false)
 {}
 
-void table_threads::reset_position(void)
-{
-  m_pos.m_index= 0;
-  m_next_pos.m_index= 0;
-}
-
-int table_threads::rnd_next(void)
-{
-  PFS_thread *pfs;
-
-  for (m_pos.set_at(&m_next_pos);
-       m_pos.m_index < thread_max;
-       m_pos.next())
-  {
-    pfs= &thread_array[m_pos.m_index];
-    if (pfs->m_lock.is_populated())
-    {
-      make_row(pfs);
-      m_next_pos.set_after(&m_pos);
-      return 0;
-    }
-  }
-
-  return HA_ERR_END_OF_FILE;
-}
-
-int table_threads::rnd_pos(const void *pos)
-{
-  PFS_thread *pfs;
-
-  set_position(pos);
-  DBUG_ASSERT(m_pos.m_index < thread_max);
-  pfs= &thread_array[m_pos.m_index];
-  if (pfs->m_lock.is_populated())
-  {
-    make_row(pfs);
-    return 0;
-  }
-
-  return HA_ERR_RECORD_DELETED;
-}
-
 void table_threads::make_row(PFS_thread *pfs)
 {
   pfs_lock lock;
@@ -131,11 +141,40 @@ void table_threads::make_row(PFS_thread *pfs)
     return;
 
   m_row.m_thread_internal_id= pfs->m_thread_internal_id;
+  m_row.m_parent_thread_internal_id= pfs->m_parent_thread_internal_id;
   m_row.m_thread_id= pfs->m_thread_id;
   m_row.m_name= safe_class->m_name;
   m_row.m_name_length= safe_class->m_name_length;
 
-  if (pfs->m_lock.end_optimistic_lock(&lock))
+  m_row.m_username_length= pfs->m_username_length;
+  if (unlikely(m_row.m_username_length > sizeof(m_row.m_username)))
+    return;
+  if (m_row.m_username_length != 0)
+    memcpy(m_row.m_username, pfs->m_username, m_row.m_username_length);
+
+  m_row.m_hostname_length= pfs->m_hostname_length;
+  if (unlikely(m_row.m_hostname_length > sizeof(m_row.m_hostname)))
+    return;
+  if (m_row.m_hostname_length != 0)
+    memcpy(m_row.m_hostname, pfs->m_hostname, m_row.m_hostname_length);
+
+  m_row.m_dbname_length= pfs->m_dbname_length;
+  if (unlikely(m_row.m_dbname_length > sizeof(m_row.m_dbname)))
+    return;
+  if (m_row.m_dbname_length != 0)
+    memcpy(m_row.m_dbname, pfs->m_dbname, m_row.m_dbname_length);
+
+  m_row.m_command= pfs->m_command;
+  m_row.m_start_time= pfs->m_start_time;
+  /* FIXME: need to copy it ? */
+  m_row.m_processlist_state_ptr= pfs->m_processlist_state_ptr;
+  m_row.m_processlist_state_length= pfs->m_processlist_state_length;
+  /* FIXME: need to copy it ? */
+  m_row.m_processlist_info_ptr= pfs->m_processlist_info_ptr;
+  m_row.m_processlist_info_length= pfs->m_processlist_info_length;
+  m_row.m_enabled_ptr= &pfs->m_enabled;
+
+  if (pfs->m_lock.end_optimistic_lock(& lock))
     m_row_exists= true;
 }
 
@@ -150,8 +189,9 @@ int table_threads::read_row_values(TABLE *table,
     return HA_ERR_RECORD_DELETED;
 
   /* Set the null bits */
-  DBUG_ASSERT(table->s->null_bytes == 1);
+  DBUG_ASSERT(table->s->null_bytes == 2);
   buf[0]= 0;
+  buf[1]= 0;
 
   for (; (f= *fields) ; fields++)
   {
@@ -162,12 +202,125 @@ int table_threads::read_row_values(TABLE *table,
       case 0: /* THREAD_ID */
         set_field_ulong(f, m_row.m_thread_internal_id);
         break;
-      case 1: /* PROCESSLIST_ID */
-        set_field_ulong(f, m_row.m_thread_id);
-        break;
-      case 2: /* NAME */
+      case 1: /* NAME */
         set_field_varchar_utf8(f, m_row.m_name, m_row.m_name_length);
         break;
+      case 2: /* TYPE */
+        if (m_row.m_thread_id != 0)
+          set_field_varchar_utf8(f, "FOREGROUND", 10);
+        else
+          set_field_varchar_utf8(f, "BACKGROUND", 10);
+        break;
+      case 3: /* PROCESSLIST_ID */
+        if (m_row.m_thread_id != 0)
+          set_field_ulong(f, m_row.m_thread_id);
+        else
+          f->set_null();
+        break;
+      case 4: /* PROCESSLIST_USER */
+        if (m_row.m_username_length > 0)
+          set_field_varchar_utf8(f, m_row.m_username,
+                                 m_row.m_username_length);
+        else
+          f->set_null();
+        break;
+      case 5: /* PROCESSLIST_HOST */
+        if (m_row.m_hostname_length > 0)
+          set_field_varchar_utf8(f, m_row.m_hostname,
+                                 m_row.m_hostname_length);
+        else
+          f->set_null();
+        break;
+      case 6: /* PROCESSLIST_DB */
+        if (m_row.m_dbname_length > 0)
+          set_field_varchar_utf8(f, m_row.m_dbname,
+                                 m_row.m_dbname_length);
+        else
+          f->set_null();
+        break;
+      case 7: /* PROCESSLIST_COMMAND */
+        if (m_row.m_thread_id != 0)
+          set_field_varchar_utf8(f, command_name[m_row.m_command].str,
+                                 command_name[m_row.m_command].length);
+        else
+          f->set_null();
+        break;
+      case 8: /* PROCESSLIST_TIME */
+        if (m_row.m_start_time)
+        {
+          time_t now= my_time(0);
+          ulonglong elapsed= (now > m_row.m_start_time ? now - m_row.m_start_time : 0);
+          set_field_ulonglong(f, elapsed);
+        }
+        else
+          f->set_null();
+        break;
+      case 9: /* PROCESSLIST_STATE */
+        if (m_row.m_processlist_state_length > 0)
+          set_field_varchar_utf8(f, m_row.m_processlist_state_ptr,
+                                 m_row.m_processlist_state_length);
+        else
+          f->set_null();
+        break;
+      case 10: /* PROCESSLIST_INFO */
+        if (m_row.m_processlist_info_length > 0)
+          set_field_longtext_utf8(f, m_row.m_processlist_info_ptr,
+                                  m_row.m_processlist_info_length);
+        else
+          f->set_null();
+        break;
+      case 11: /* PARENT_THREAD_ID */
+        if (m_row.m_parent_thread_internal_id != 0)
+          set_field_ulong(f, m_row.m_parent_thread_internal_id);
+        else
+          f->set_null();
+        break;
+      case 12: /* ROLE */
+        f->set_null();
+        break;
+      case 13: /* INSTRUMENTED */
+        set_field_enum(f, (*m_row.m_enabled_ptr) ? ENUM_YES : ENUM_NO);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+  return 0;
+}
+
+int table_threads::update_row_values(TABLE *table,
+                                     const unsigned char *old_buf,
+                                     unsigned char *new_buf,
+                                     Field **fields)
+{
+  Field *f;
+  enum_yes_no value;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (bitmap_is_set(table->write_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* THREAD_ID */
+      case 1: /* NAME */
+      case 2: /* TYPE */
+      case 3: /* PROCESSLIST_ID */
+      case 4: /* PROCESSLIST_USER */
+      case 5: /* PROCESSLIST_HOST */
+      case 6: /* PROCESSLIST_DB */
+      case 7: /* PROCESSLIST_COMMAND */
+      case 8: /* PROCESSLIST_TIME */
+      case 9: /* PROCESSLIST_STATE */
+      case 10: /* PROCESSLIST_INFO */
+      case 11: /* PARENT_THREAD_ID */
+      case 12: /* ROLE */
+        return HA_ERR_WRONG_COMMAND;
+      case 13: /* INSTRUMENTED */
+        value= (enum_yes_no) get_field_enum(f);
+        *m_row.m_enabled_ptr= (value == ENUM_YES) ? true : false;
+        break;
       default:
         DBUG_ASSERT(false);
       }
diff --git a/storage/perfschema/table_threads.h b/storage/perfschema/table_threads.h
index fb239007069..9819822f8c8 100644
--- a/storage/perfschema/table_threads.h
+++ b/storage/perfschema/table_threads.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -16,22 +16,19 @@
 #ifndef TABLE_THREADS_H
 #define TABLE_THREADS_H
 
-/**
-  @file storage/perfschema/table_threads.h
-  Table THREADS (declarations).
-*/
-
 #include "pfs_column_types.h"
-#include "pfs_engine_table.h"
+#include "cursor_by_thread.h"
 
 struct PFS_thread;
 
 /**
-  @addtogroup Performance_schema_tables
+  \addtogroup Performance_schema_tables
   @{
 */
 
-/** A row of PERFORMANCE_SCHEMA.THREADS. */
+/**
+  A row of PERFORMANCE_SCHEMA.THREADS.
+*/
 struct row_threads
 {
   /** Column THREAD_ID. */
@@ -39,29 +36,60 @@ struct row_threads
   /** Column PROCESSLIST_ID. */
   ulong m_thread_id;
   /** Column NAME. */
-  const char *m_name;
+  const char* m_name;
   /** Length in bytes of @c m_name. */
   uint m_name_length;
+  /** Column PROCESSLIST_USER. */
+  char m_username[USERNAME_LENGTH];
+  /** Length in bytes of @c m_username. */
+  uint m_username_length;
+  /** Column PROCESSLIST_HOST. */
+  char m_hostname[HOSTNAME_LENGTH];
+  /** Length in bytes of @c m_hostname. */
+  uint m_hostname_length;
+  /** Column PROCESSLIST_DB. */
+  char m_dbname[NAME_LEN];
+  /** Length in bytes of @c m_dbname. */
+  uint m_dbname_length;
+  /** Column PROCESSLIST_COMMAND. */
+  int m_command;
+  /** Column PROCESSLIST_TIME. */
+  time_t m_start_time;
+  /** Column PROCESSLIST_STATE. */
+  const char* m_processlist_state_ptr;
+  /** Length in bytes of @c m_processlist_state_ptr. */
+  uint m_processlist_state_length;
+  /** Column PROCESSLIST_INFO. */
+  const char* m_processlist_info_ptr;
+  /** Length in bytes of @c m_processlist_info_ptr. */
+  uint m_processlist_info_length;
+  /** Column INSTRUMENTED. */
+  bool *m_enabled_ptr;
+  /** Column PARENT_THREAD_ID. */
+  ulong m_parent_thread_internal_id;
 };
 
 /** Table PERFORMANCE_SCHEMA.THREADS. */
-class table_threads : public PFS_engine_table
+class table_threads : public cursor_by_thread
 {
 public:
-  /** Table share. */
+  /** Table share */
   static PFS_engine_table_share m_share;
+  /** Table builder */
   static PFS_engine_table* create();
 
-  virtual int rnd_next();
-  virtual int rnd_pos(const void *pos);
-  virtual void reset_position(void);
-
 protected:
   virtual int read_row_values(TABLE *table,
                               unsigned char *buf,
                               Field **fields,
                               bool read_all);
 
+
+  virtual int update_row_values(TABLE *table,
+                                const unsigned char *old_buf,
+                                unsigned char *new_buf,
+                                Field **fields);
+
 protected:
   table_threads();
 
@@ -70,7 +98,7 @@ public:
   {}
 
 private:
-  void make_row(PFS_thread *pfs);
+  virtual void make_row(PFS_thread *pfs);
 
   /** Table share lock. */
   static THR_LOCK m_table_lock;
@@ -81,10 +109,6 @@ private:
   row_threads m_row;
   /** True if the current row exists. */
   bool m_row_exists;
-  /** Current position. */
-  PFS_simple_index m_pos;
-  /** Next position. */
-  PFS_simple_index m_next_pos;
 };
 
 /** @} */
diff --git a/storage/perfschema/table_tiws_by_index_usage.cc b/storage/perfschema/table_tiws_by_index_usage.cc
new file mode 100644
index 00000000000..d354c40d3ed
--- /dev/null
+++ b/storage/perfschema/table_tiws_by_index_usage.cc
@@ -0,0 +1,497 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file storage/perfschema/table_tiws_by_index_usage.cc
+  Table TABLE_IO_WAITS_SUMMARY_BY_INDEX_USAGE (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_tiws_by_index_usage.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_tiws_by_index_usage::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("OBJECT_TYPE") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_SCHEMA") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_NAME") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("INDEX_NAME") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_FETCH") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_FETCH") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_FETCH") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_FETCH") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_FETCH") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_UPDATE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_UPDATE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_UPDATE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_UPDATE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_UPDATE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_DELETE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_DELETE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_DELETE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_DELETE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_DELETE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_tiws_by_index_usage::m_field_def=
+{ 39, field_types };
+
+PFS_engine_table_share
+table_tiws_by_index_usage::m_share=
+{
+  { C_STRING_WITH_LEN("table_io_waits_summary_by_index_usage") },
+  &pfs_truncatable_acl,
+  table_tiws_by_index_usage::create,
+  NULL, /* write_row */
+  table_tiws_by_index_usage::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(pos_tiws_by_index_usage),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_tiws_by_index_usage::create(void)
+{
+  return new table_tiws_by_index_usage();
+}
+
+int
+table_tiws_by_index_usage::delete_all_rows(void)
+{
+  reset_table_io_waits_by_table_handle();
+  reset_table_io_waits_by_table();
+  return 0;
+}
+
+table_tiws_by_index_usage::table_tiws_by_index_usage()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(), m_next_pos()
+{}
+
+void table_tiws_by_index_usage::reset_position(void)
+{
+  m_pos.reset();
+  m_next_pos.reset();
+}
+
+int table_tiws_by_index_usage::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(wait_timer);
+  return 0;
+}
+
+int table_tiws_by_index_usage::rnd_next(void)
+{
+  PFS_table_share *table_share;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.has_more_table();
+       m_pos.next_table())
+  {
+    table_share= &table_share_array[m_pos.m_index_1];
+    if (table_share->m_lock.is_populated())
+    {
+      if (m_pos.m_index_2 < table_share->m_key_count)
+      {
+        make_row(table_share, m_pos.m_index_2);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+      if (m_pos.m_index_2 <= MAX_KEY)
+      {
+        m_pos.m_index_2= MAX_KEY;
+        make_row(table_share, m_pos.m_index_2);
+        m_next_pos.set_after(&m_pos);
+        return 0;
+      }
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_tiws_by_index_usage::rnd_pos(const void *pos)
+{
+  PFS_table_share *table_share;
+
+  set_position(pos);
+
+  table_share= &table_share_array[m_pos.m_index_1];
+  if (table_share->m_lock.is_populated())
+  {
+    if (m_pos.m_index_2 < table_share->m_key_count)
+    {
+      make_row(table_share, m_pos.m_index_2);
+      return 0;
+    }
+    if (m_pos.m_index_2 == MAX_KEY)
+    {
+      make_row(table_share, m_pos.m_index_2);
+      return 0;
+    }
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_tiws_by_index_usage::make_row(PFS_table_share *share, uint index)
+{
+  pfs_lock lock;
+
+  m_row_exists= false;
+
+  share->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_index.make_row(share, index))
+    return;
+
+  PFS_index_io_stat_visitor visitor;
+  PFS_object_iterator::visit_table_indexes(share, index, & visitor);
+
+  if (! share->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, & visitor.m_stat);
+}
+
+int table_tiws_by_index_usage::read_row_values(TABLE *table,
+                                         unsigned char *buf,
+                                         Field **fields,
+                                         bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* OBJECT_TYPE */
+      case 1: /* SCHEMA_NAME */
+      case 2: /* OBJECT_NAME */
+      case 3: /* INDEX_NAME */
+        m_row.m_index.set_field(f->field_index, f);
+        break;
+      case 4: /* COUNT_STAR */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_count);
+        break;
+      case 5: /* SUM */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_sum);
+        break;
+      case 6: /* MIN */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_min);
+        break;
+      case 7: /* AVG */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_avg);
+        break;
+      case 8: /* MAX */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_max);
+        break;
+      case 9: /* COUNT_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_count);
+        break;
+      case 10: /* SUM_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_sum);
+        break;
+      case 11: /* MIN_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_min);
+        break;
+      case 12: /* AVG_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_avg);
+        break;
+      case 13: /* MAX_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_max);
+        break;
+      case 14: /* COUNT_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_count);
+        break;
+      case 15: /* SUM_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_sum);
+        break;
+      case 16: /* MIN_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_min);
+        break;
+      case 17: /* AVG_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_avg);
+        break;
+      case 18: /* MAX_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_max);
+        break;
+      case 19: /* COUNT_FETCH */
+        set_field_ulonglong(f, m_row.m_stat.m_fetch.m_count);
+        break;
+      case 20: /* SUM_FETCH */
+        set_field_ulonglong(f, m_row.m_stat.m_fetch.m_sum);
+        break;
+      case 21: /* MIN_FETCH */
+        set_field_ulonglong(f, m_row.m_stat.m_fetch.m_min);
+        break;
+      case 22: /* AVG_FETCH */
+        set_field_ulonglong(f, m_row.m_stat.m_fetch.m_avg);
+        break;
+      case 23: /* MAX_FETCH */
+        set_field_ulonglong(f, m_row.m_stat.m_fetch.m_max);
+        break;
+      case 24: /* COUNT_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_insert.m_count);
+        break;
+      case 25: /* SUM_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_insert.m_sum);
+        break;
+      case 26: /* MIN_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_insert.m_min);
+        break;
+      case 27: /* AVG_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_insert.m_avg);
+        break;
+      case 28: /* MAX_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_insert.m_max);
+        break;
+      case 29: /* COUNT_UPDATE */
+        set_field_ulonglong(f, m_row.m_stat.m_update.m_count);
+        break;
+      case 30: /* SUM_UPDATE */
+        set_field_ulonglong(f, m_row.m_stat.m_update.m_sum);
+        break;
+      case 31: /* MIN_UPDATE */
+        set_field_ulonglong(f, m_row.m_stat.m_update.m_min);
+        break;
+      case 32: /* AVG_UPDATE */
+        set_field_ulonglong(f, m_row.m_stat.m_update.m_avg);
+        break;
+      case 33: /* MAX_UPDATE */
+        set_field_ulonglong(f, m_row.m_stat.m_update.m_max);
+        break;
+      case 34: /* COUNT_DELETE */
+        set_field_ulonglong(f, m_row.m_stat.m_delete.m_count);
+        break;
+      case 35: /* SUM_DELETE */
+        set_field_ulonglong(f, m_row.m_stat.m_delete.m_sum);
+        break;
+      case 36: /* MIN_DELETE */
+        set_field_ulonglong(f, m_row.m_stat.m_delete.m_min);
+        break;
+      case 37: /* AVG_DELETE */
+        set_field_ulonglong(f, m_row.m_stat.m_delete.m_avg);
+        break;
+      case 38: /* MAX_DELETE */
+        set_field_ulonglong(f, m_row.m_stat.m_delete.m_max);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_tiws_by_index_usage.h b/storage/perfschema/table_tiws_by_index_usage.h
new file mode 100644
index 00000000000..b5f589d0cea
--- /dev/null
+++ b/storage/perfschema/table_tiws_by_index_usage.h
@@ -0,0 +1,123 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+#ifndef TABLE_IO_WAIT_SUMMARY_BY_INDEX_USAGE_H
+#define TABLE_IO_WAIT_SUMMARY_BY_INDEX_USAGE_H
+
+/**
+  @file storage/perfschema/table_tiws_by_index_usage.h
+  Table TABLE_IO_WAIT_SUMMARY_BY_INDEX_USAGE (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.TABLE_IO_WAIT_SUMMARY_BY_INDEX.
+*/
+struct row_tiws_by_index_usage
+{
+  /** Column OBJECT_TYPE, SCHEMA_NAME, OBJECT_NAME, INDEX_NAME. */
+  PFS_index_row m_index;
+  /** Columns COUNT/SUM/MIN/AVG/MAX (+_READ, +WRITE). */
+  PFS_table_io_stat_row m_stat;
+};
+
+/**
+  Position of a cursor on
+  PERFORMANCE_SCHEMA.TABLE_IO_WAIT_SUMMARY_BY_INDEX.
+  Index 1 on table_share_array (0 based)
+  Index 2 on index (0 based)
+*/
+struct pos_tiws_by_index_usage : public PFS_double_index
+{
+  pos_tiws_by_index_usage()
+    : PFS_double_index(0, 0)
+  {}
+
+  inline void reset(void)
+  {
+    m_index_1= 0;
+    m_index_2= 0;
+  }
+
+  inline bool has_more_table(void)
+  {
+    return (m_index_1 < table_share_max);
+  }
+
+  inline void next_table(void)
+  {
+    m_index_1++;
+    m_index_2= 0;
+  }
+};
+
+/** Table PERFORMANCE_SCHEMA.TABLE_IO_WAIT_SUMMARY_BY_INDEX. */
+class table_tiws_by_index_usage : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_tiws_by_index_usage();
+
+public:
+  ~table_tiws_by_index_usage()
+  {}
+
+protected:
+  void make_row(PFS_table_share *table_share, uint index);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_tiws_by_index_usage m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  pos_tiws_by_index_usage m_pos;
+  /** Next position. */
+  pos_tiws_by_index_usage m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_tiws_by_table.cc b/storage/perfschema/table_tiws_by_table.cc
new file mode 100644
index 00000000000..f793b5654a2
--- /dev/null
+++ b/storage/perfschema/table_tiws_by_table.cc
@@ -0,0 +1,473 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_tiws_by_table.cc
+  Table TABLE_IO_WAITS_SUMMARY_BY_TABLE (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_tiws_by_table.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_tiws_by_table::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("OBJECT_TYPE") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_SCHEMA") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_NAME") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_FETCH") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_FETCH") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_FETCH") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_FETCH") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_FETCH") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_UPDATE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_UPDATE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_UPDATE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_UPDATE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_UPDATE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_DELETE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_DELETE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_DELETE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_DELETE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_DELETE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_tiws_by_table::m_field_def=
+{ 38, field_types };
+
+PFS_engine_table_share
+table_tiws_by_table::m_share=
+{
+  { C_STRING_WITH_LEN("table_io_waits_summary_by_table") },
+  &pfs_truncatable_acl,
+  table_tiws_by_table::create,
+  NULL, /* write_row */
+  table_tiws_by_table::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_tiws_by_table::create(void)
+{
+  return new table_tiws_by_table();
+}
+
+int
+table_tiws_by_table::delete_all_rows(void)
+{
+  reset_table_io_waits_by_table_handle();
+  reset_table_io_waits_by_table();
+  return 0;
+}
+
+table_tiws_by_table::table_tiws_by_table()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+
+void table_tiws_by_table::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_tiws_by_table::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(wait_timer);
+  return 0;
+}
+
+int table_tiws_by_table::rnd_next(void)
+{
+  PFS_table_share *table_share;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < table_share_max;
+       m_pos.m_index++)
+  {
+    table_share= &table_share_array[m_pos.m_index];
+    if (table_share->m_lock.is_populated())
+    {
+      make_row(table_share);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_tiws_by_table::rnd_pos(const void *pos)
+{
+  PFS_table_share *table_share;
+
+  set_position(pos);
+
+  table_share= &table_share_array[m_pos.m_index];
+  if (table_share->m_lock.is_populated())
+  {
+    make_row(table_share);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_tiws_by_table::make_row(PFS_table_share *share)
+{
+  pfs_lock lock;
+
+  m_row_exists= false;
+
+  share->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_object.make_row(share))
+    return;
+
+  PFS_table_io_stat_visitor visitor;
+  PFS_object_iterator::visit_tables(share, & visitor);
+
+  if (! share->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, &visitor.m_stat);
+}
+
+int table_tiws_by_table::read_row_values(TABLE *table,
+                                        unsigned char *buf,
+                                        Field **fields,
+                                        bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* OBJECT_TYPE */
+      case 1: /* SCHEMA_NAME */
+      case 2: /* OBJECT_NAME */
+        m_row.m_object.set_field(f->field_index, f);
+        break;
+      case 3: /* COUNT_STAR */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_count);
+        break;
+      case 4: /* SUM */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_sum);
+        break;
+      case 5: /* MIN */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_min);
+        break;
+      case 6: /* AVG */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_avg);
+        break;
+      case 7: /* MAX */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_max);
+        break;
+      case 8: /* COUNT_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_count);
+        break;
+      case 9: /* SUM_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_sum);
+        break;
+      case 10: /* MIN_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_min);
+        break;
+      case 11: /* AVG_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_avg);
+        break;
+      case 12: /* MAX_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_max);
+        break;
+      case 13: /* COUNT_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_count);
+        break;
+      case 14: /* SUM_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_sum);
+        break;
+      case 15: /* MIN_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_min);
+        break;
+      case 16: /* AVG_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_avg);
+        break;
+      case 17: /* MAX_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_max);
+        break;
+      case 18: /* COUNT_FETCH */
+        set_field_ulonglong(f, m_row.m_stat.m_fetch.m_count);
+        break;
+      case 19: /* SUM_FETCH */
+        set_field_ulonglong(f, m_row.m_stat.m_fetch.m_sum);
+        break;
+      case 20: /* MIN_FETCH */
+        set_field_ulonglong(f, m_row.m_stat.m_fetch.m_min);
+        break;
+      case 21: /* AVG_FETCH */
+        set_field_ulonglong(f, m_row.m_stat.m_fetch.m_avg);
+        break;
+      case 22: /* MAX_FETCH */
+        set_field_ulonglong(f, m_row.m_stat.m_fetch.m_max);
+        break;
+      case 23: /* COUNT_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_insert.m_count);
+        break;
+      case 24: /* SUM_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_insert.m_sum);
+        break;
+      case 25: /* MIN_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_insert.m_min);
+        break;
+      case 26: /* AVG_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_insert.m_avg);
+        break;
+      case 27: /* MAX_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_insert.m_max);
+        break;
+      case 28: /* COUNT_UPDATE */
+        set_field_ulonglong(f, m_row.m_stat.m_update.m_count);
+        break;
+      case 29: /* SUM_UPDATE */
+        set_field_ulonglong(f, m_row.m_stat.m_update.m_sum);
+        break;
+      case 30: /* MIN_UPDATE */
+        set_field_ulonglong(f, m_row.m_stat.m_update.m_min);
+        break;
+      case 31: /* AVG_UPDATE */
+        set_field_ulonglong(f, m_row.m_stat.m_update.m_avg);
+        break;
+      case 32: /* MAX_UPDATE */
+        set_field_ulonglong(f, m_row.m_stat.m_update.m_max);
+        break;
+      case 33: /* COUNT_DELETE */
+        set_field_ulonglong(f, m_row.m_stat.m_delete.m_count);
+        break;
+      case 34: /* SUM_DELETE */
+        set_field_ulonglong(f, m_row.m_stat.m_delete.m_sum);
+        break;
+      case 35: /* MIN_DELETE */
+        set_field_ulonglong(f, m_row.m_stat.m_delete.m_min);
+        break;
+      case 36: /* AVG_DELETE */
+        set_field_ulonglong(f, m_row.m_stat.m_delete.m_avg);
+        break;
+      case 37: /* MAX_DELETE */
+        set_field_ulonglong(f, m_row.m_stat.m_delete.m_max);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_tiws_by_table.h b/storage/perfschema/table_tiws_by_table.h
new file mode 100644
index 00000000000..747b1958c8e
--- /dev/null
+++ b/storage/perfschema/table_tiws_by_table.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_IO_WAITS_SUMMARY_BY_TABLE_H
+#define TABLE_IO_WAITS_SUMMARY_BY_TABLE_H
+
+/**
+  @file storage/perfschema/table_tiws_by_table.h
+  Table TABLE_IO_WAITS_SUMMARY_BY_TABLE (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMARY_BY_TABLE.
+*/
+struct row_tiws_by_table
+{
+  /** Column OBJECT_TYPE, SCHEMA_NAME, OBJECT_NAME. */
+  PFS_object_row m_object;
+  /** Columns COUNT/SUM/MIN/AVG/MAX (+_READ, +WRITE). */
+  PFS_table_io_stat_row m_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMARY_BY_TABLE. */
+class table_tiws_by_table : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_tiws_by_table();
+
+public:
+  ~table_tiws_by_table()
+  {}
+
+protected:
+  void make_row(PFS_table_share *table_share);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_tiws_by_table m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_tlws_by_table.cc b/storage/perfschema/table_tlws_by_table.cc
new file mode 100644
index 00000000000..c6f454ac178
--- /dev/null
+++ b/storage/perfschema/table_tlws_by_table.cc
@@ -0,0 +1,765 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file storage/perfschema/table_tlws_by_table.cc
+  Table TABLE_LOCK_WAITS_SUMMARY_BY_TABLE (implementation).
+*/
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "pfs_instr_class.h"
+#include "pfs_column_types.h"
+#include "pfs_column_values.h"
+#include "table_tlws_by_table.h"
+#include "pfs_global.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_tlws_by_table::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("OBJECT_TYPE") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_SCHEMA") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("OBJECT_NAME") },
+    { C_STRING_WITH_LEN("varchar(64)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_STAR") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WAIT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_READ_NORMAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ_NORMAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ_NORMAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ_NORMAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ_NORMAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_READ_WITH_SHARED_LOCKS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ_WITH_SHARED_LOCKS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ_WITH_SHARED_LOCKS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ_WITH_SHARED_LOCKS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ_WITH_SHARED_LOCKS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_READ_HIGH_PRIORITY") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ_HIGH_PRIORITY") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ_HIGH_PRIORITY") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ_HIGH_PRIORITY") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ_HIGH_PRIORITY") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_READ_NO_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ_NO_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ_NO_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ_NO_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ_NO_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_READ_EXTERNAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_READ_EXTERNAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_READ_EXTERNAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_READ_EXTERNAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_READ_EXTERNAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE_ALLOW_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE_ALLOW_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE_ALLOW_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE_ALLOW_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE_ALLOW_WRITE") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE_CONCURRENT_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE_CONCURRENT_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE_CONCURRENT_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE_CONCURRENT_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE_CONCURRENT_INSERT") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE_DELAYED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE_DELAYED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE_DELAYED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE_DELAYED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE_DELAYED") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE_LOW_PRIORITY") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE_LOW_PRIORITY") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE_LOW_PRIORITY") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE_LOW_PRIORITY") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE_LOW_PRIORITY") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE_NORMAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE_NORMAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE_NORMAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE_NORMAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE_NORMAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("COUNT_WRITE_EXTERNAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("SUM_TIMER_WRITE_EXTERNAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MIN_TIMER_WRITE_EXTERNAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("AVG_TIMER_WRITE_EXTERNAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("MAX_TIMER_WRITE_EXTERNAL") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_tlws_by_table::m_field_def=
+{ 73, field_types };
+
+PFS_engine_table_share
+table_tlws_by_table::m_share=
+{
+  { C_STRING_WITH_LEN("table_lock_waits_summary_by_table") },
+  &pfs_truncatable_acl,
+  table_tlws_by_table::create,
+  NULL, /* write_row */
+  table_tlws_by_table::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index),
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table*
+table_tlws_by_table::create(void)
+{
+  return new table_tlws_by_table();
+}
+
+int
+table_tlws_by_table::delete_all_rows(void)
+{
+  reset_table_lock_waits_by_table_handle();
+  reset_table_lock_waits_by_table();
+  return 0;
+}
+
+table_tlws_by_table::table_tlws_by_table()
+  : PFS_engine_table(&m_share, &m_pos),
+    m_row_exists(false), m_pos(0), m_next_pos(0)
+{}
+
+void table_tlws_by_table::reset_position(void)
+{
+  m_pos.m_index= 0;
+  m_next_pos.m_index= 0;
+}
+
+int table_tlws_by_table::rnd_init(bool scan)
+{
+  m_normalizer= time_normalizer::get(wait_timer);
+  return 0;
+}
+
+int table_tlws_by_table::rnd_next(void)
+{
+  PFS_table_share *table_share;
+
+  for (m_pos.set_at(&m_next_pos);
+       m_pos.m_index < table_share_max;
+       m_pos.m_index++)
+  {
+    table_share= &table_share_array[m_pos.m_index];
+    if (table_share->m_lock.is_populated())
+    {
+      make_row(table_share);
+      m_next_pos.set_after(&m_pos);
+      return 0;
+    }
+  }
+
+  return HA_ERR_END_OF_FILE;
+}
+
+int
+table_tlws_by_table::rnd_pos(const void *pos)
+{
+  PFS_table_share *table_share;
+
+  set_position(pos);
+
+  table_share= &table_share_array[m_pos.m_index];
+  if (table_share->m_lock.is_populated())
+  {
+    make_row(table_share);
+    return 0;
+  }
+
+  return HA_ERR_RECORD_DELETED;
+}
+
+void table_tlws_by_table::make_row(PFS_table_share *share)
+{
+  pfs_lock lock;
+
+  m_row_exists= false;
+
+  share->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_object.make_row(share))
+    return;
+
+  PFS_table_lock_stat_visitor visitor;
+  PFS_object_iterator::visit_tables(share, & visitor);
+
+  if (! share->m_lock.end_optimistic_lock(&lock))
+    return;
+
+  m_row_exists= true;
+  m_row.m_stat.set(m_normalizer, &visitor.m_stat);
+}
+
+int table_tlws_by_table::read_row_values(TABLE *table,
+                                         unsigned char *buf,
+                                         Field **fields,
+                                         bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* OBJECT_TYPE */
+      case 1: /* SCHEMA_NAME */
+      case 2: /* OBJECT_NAME */
+        m_row.m_object.set_field(f->field_index, f);
+        break;
+      case 3: /* COUNT_STAR */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_count);
+        break;
+      case 4: /* SUM_TIMER */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_sum);
+        break;
+      case 5: /* MIN_TIMER */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_min);
+        break;
+      case 6: /* AVG_TIMER */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_avg);
+        break;
+      case 7: /* MAX_TIMER */
+        set_field_ulonglong(f, m_row.m_stat.m_all.m_max);
+        break;
+      case 8: /* COUNT_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_count);
+        break;
+      case 9: /* SUM_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_sum);
+        break;
+      case 10: /* MIN_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_min);
+        break;
+      case 11: /* AVG_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_avg);
+        break;
+      case 12: /* MAX_TIMER_READ */
+        set_field_ulonglong(f, m_row.m_stat.m_all_read.m_max);
+        break;
+      case 13: /* COUNT_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_count);
+        break;
+      case 14: /* SUM_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_sum);
+        break;
+      case 15: /* MIN_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_min);
+        break;
+      case 16: /* AVG_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_avg);
+        break;
+      case 17: /* MAX_TIMER_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_all_write.m_max);
+        break;
+
+      case 18: /* COUNT_READ_NORMAL */
+        set_field_ulonglong(f, m_row.m_stat.m_read_normal.m_count);
+        break;
+      case 19: /* SUM_TIMER_READ_NORMAL */
+        set_field_ulonglong(f, m_row.m_stat.m_read_normal.m_sum);
+        break;
+      case 20: /* MIN_TIMER_READ_NORMAL */
+        set_field_ulonglong(f, m_row.m_stat.m_read_normal.m_min);
+        break;
+      case 21: /* AVG_TIMER_READ_NORMAL */
+        set_field_ulonglong(f, m_row.m_stat.m_read_normal.m_avg);
+        break;
+      case 22: /* MAX_TIMER_READ_NORMAL */
+        set_field_ulonglong(f, m_row.m_stat.m_read_normal.m_max);
+        break;
+
+      case 23: /* COUNT_READ_WITH_SHARED_LOCKS */
+        set_field_ulonglong(f, m_row.m_stat.m_read_with_shared_locks.m_count);
+        break;
+      case 24: /* SUM_TIMER_READ_WITH_SHARED_LOCKS */
+        set_field_ulonglong(f, m_row.m_stat.m_read_with_shared_locks.m_sum);
+        break;
+      case 25: /* MIN_TIMER_READ_WITH_SHARED_LOCKS */
+        set_field_ulonglong(f, m_row.m_stat.m_read_with_shared_locks.m_min);
+        break;
+      case 26: /* AVG_TIMER_READ_WITH_SHARED_LOCKS */
+        set_field_ulonglong(f, m_row.m_stat.m_read_with_shared_locks.m_avg);
+        break;
+      case 27: /* MAX_TIMER_READ_WITH_SHARED_LOCKS */
+        set_field_ulonglong(f, m_row.m_stat.m_read_with_shared_locks.m_max);
+        break;
+
+      case 28: /* COUNT_READ_HIGH_PRIORITY */
+        set_field_ulonglong(f, m_row.m_stat.m_read_high_priority.m_count);
+        break;
+      case 29: /* SUM_TIMER_READ_HIGH_PRIORITY */
+        set_field_ulonglong(f, m_row.m_stat.m_read_high_priority.m_sum);
+        break;
+      case 30: /* MIN_TIMER_READ_HIGH_PRIORITY */
+        set_field_ulonglong(f, m_row.m_stat.m_read_high_priority.m_min);
+        break;
+      case 31: /* AVG_TIMER_READ_HIGH_PRIORITY */
+        set_field_ulonglong(f, m_row.m_stat.m_read_high_priority.m_avg);
+        break;
+      case 32: /* MAX_TIMER_READ_HIGH_PRIORITY */
+        set_field_ulonglong(f, m_row.m_stat.m_read_high_priority.m_max);
+        break;
+
+      case 33: /* COUNT_READ_NO_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_read_no_insert.m_count);
+        break;
+      case 34: /* SUM_TIMER_READ_NO_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_read_no_insert.m_sum);
+        break;
+      case 35: /* MIN_TIMER_READ_NO_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_read_no_insert.m_min);
+        break;
+      case 36: /* AVG_TIMER_READ_NO_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_read_no_insert.m_avg);
+        break;
+      case 37: /* MAX_TIMER_READ_NO_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_read_no_insert.m_max);
+        break;
+
+      case 38: /* COUNT_READ_EXTERNAL */
+        set_field_ulonglong(f, m_row.m_stat.m_read_external.m_count);
+        break;
+      case 39: /* SUM_TIMER_READ_EXTERNAL */
+        set_field_ulonglong(f, m_row.m_stat.m_read_external.m_sum);
+        break;
+      case 40: /* MIN_TIMER_READ_EXTERNAL */
+        set_field_ulonglong(f, m_row.m_stat.m_read_external.m_min);
+        break;
+      case 41: /* AVG_TIMER_READ_EXTERNAL */
+        set_field_ulonglong(f, m_row.m_stat.m_read_external.m_avg);
+        break;
+      case 42: /* MAX_TIMER_READ_EXTERNAL */
+        set_field_ulonglong(f, m_row.m_stat.m_read_external.m_max);
+        break;
+
+      case 43: /* COUNT_WRITE_ALLOW_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_write_allow_write.m_count);
+        break;
+      case 44: /* SUM_TIMER_WRITE_ALLOW_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_write_allow_write.m_sum);
+        break;
+      case 45: /* MIN_TIMER_WRITE_ALLOW_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_write_allow_write.m_min);
+        break;
+      case 46: /* AVG_TIMER_WRITE_ALLOW_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_write_allow_write.m_avg);
+        break;
+      case 47: /* MAX_TIMER_WRITE_ALLOW_WRITE */
+        set_field_ulonglong(f, m_row.m_stat.m_write_allow_write.m_max);
+        break;
+
+      case 48: /* COUNT_WRITE_CONCURRENT_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_write_concurrent_insert.m_count);
+        break;
+      case 49: /* SUM_TIMER_WRITE_CONCURRENT_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_write_concurrent_insert.m_sum);
+        break;
+      case 50: /* MIN_TIMER_WRITE_CONCURRENT_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_write_concurrent_insert.m_min);
+        break;
+      case 51: /* AVG_TIMER_WRITE_CONCURRENT_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_write_concurrent_insert.m_avg);
+        break;
+      case 52: /* MAX_TIMER_WRITE_CONCURRENT_INSERT */
+        set_field_ulonglong(f, m_row.m_stat.m_write_concurrent_insert.m_max);
+        break;
+
+      case 53: /* COUNT_WRITE_DELAYED */
+        set_field_ulonglong(f, m_row.m_stat.m_write_delayed.m_count);
+        break;
+      case 54: /* SUM_TIMER_WRITE_DELAYED */
+        set_field_ulonglong(f, m_row.m_stat.m_write_delayed.m_sum);
+        break;
+      case 55: /* MIN_TIMER_WRITE_DELAYED */
+        set_field_ulonglong(f, m_row.m_stat.m_write_delayed.m_min);
+        break;
+      case 56: /* AVG_TIMER_WRITE_DELAYED */
+        set_field_ulonglong(f, m_row.m_stat.m_write_delayed.m_avg);
+        break;
+      case 57: /* MAX_TIMER_WRITE_DELAYED */
+        set_field_ulonglong(f, m_row.m_stat.m_write_delayed.m_max);
+        break;
+
+      case 58: /* COUNT_WRITE_LOW_PRIORITY */
+        set_field_ulonglong(f, m_row.m_stat.m_write_low_priority.m_count);
+        break;
+      case 59: /* SUM_TIMER_WRITE_LOW_PRIORITY */
+        set_field_ulonglong(f, m_row.m_stat.m_write_low_priority.m_sum);
+        break;
+      case 60: /* MIN_TIMER_WRITE_LOW_PRIORITY */
+        set_field_ulonglong(f, m_row.m_stat.m_write_low_priority.m_min);
+        break;
+      case 61: /* AVG_TIMER_WRITE_LOW_PRIORITY */
+        set_field_ulonglong(f, m_row.m_stat.m_write_low_priority.m_avg);
+        break;
+      case 62: /* MAX_TIMER_WRITE_LOW_PRIORITY */
+        set_field_ulonglong(f, m_row.m_stat.m_write_low_priority.m_max);
+        break;
+
+      case 63: /* COUNT_WRITE_NORMAL */
+        set_field_ulonglong(f, m_row.m_stat.m_write_normal.m_count);
+        break;
+      case 64: /* SUM_TIMER_WRITE_NORMAL */
+        set_field_ulonglong(f, m_row.m_stat.m_write_normal.m_sum);
+        break;
+      case 65: /* MIN_TIMER_WRITE_NORMAL */
+        set_field_ulonglong(f, m_row.m_stat.m_write_normal.m_min);
+        break;
+      case 66: /* AVG_TIMER_WRITE_NORMAL */
+        set_field_ulonglong(f, m_row.m_stat.m_write_normal.m_avg);
+        break;
+      case 67: /* MAX_TIMER_WRITE_NORMAL */
+        set_field_ulonglong(f, m_row.m_stat.m_write_normal.m_max);
+        break;
+
+      case 68: /* COUNT_WRITE_EXTERNAL */
+        set_field_ulonglong(f, m_row.m_stat.m_write_external.m_count);
+        break;
+      case 69: /* SUM_TIMER_WRITE_EXTERNAL */
+        set_field_ulonglong(f, m_row.m_stat.m_write_external.m_sum);
+        break;
+      case 70: /* MIN_TIMER_WRITE_EXTERNAL */
+        set_field_ulonglong(f, m_row.m_stat.m_write_external.m_min);
+        break;
+      case 71: /* AVG_TIMER_WRITE_EXTERNAL */
+        set_field_ulonglong(f, m_row.m_stat.m_write_external.m_avg);
+        break;
+      case 72: /* MAX_TIMER_WRITE_EXTERNAL */
+        set_field_ulonglong(f, m_row.m_stat.m_write_external.m_max);
+        break;
+
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+
+  return 0;
+}
+
diff --git a/storage/perfschema/table_tlws_by_table.h b/storage/perfschema/table_tlws_by_table.h
new file mode 100644
index 00000000000..c5521c72470
--- /dev/null
+++ b/storage/perfschema/table_tlws_by_table.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef TABLE_LOCK_WAITS_SUMMARY_BY_TABLE_H
+#define TABLE_LOCK_WAITS_SUMMARY_BY_TABLE_H
+
+/**
+  @file storage/perfschema/table_tlws_by_table.h
+  Table TABLE_LOCK_WAITS_SUMMARY_BY_TABLE (declarations).
+*/
+
+#include "pfs_column_types.h"
+#include "pfs_engine_table.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "table_helper.h"
+
+/**
+  @addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of table
+  PERFORMANCE_SCHEMA.TABLE_LOCK_WAITS_SUMMARY_BY_TABLE.
+*/
+struct row_tlws_by_table
+{
+  /** Column OBJECT_TYPE, SCHEMA_NAME, OBJECT_NAME. */
+  PFS_object_row m_object;
+  /** Columns COUNT/SUM/MIN/AVG/MAX READ/WRITE/READ_NORMAL/etc. */
+  PFS_table_lock_stat_row m_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.TABLE_LOCK_WAITS_SUMMARY_BY_TABLE. */
+class table_tlws_by_table : public PFS_engine_table
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+  virtual int rnd_init(bool scan);
+  virtual int rnd_next();
+  virtual int rnd_pos(const void *pos);
+  virtual void reset_position(void);
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+  table_tlws_by_table();
+
+public:
+  ~table_tlws_by_table()
+  {}
+
+protected:
+  void make_row(PFS_table_share *table_share);
+
+private:
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_tlws_by_table m_row;
+  /** True is the current row exists. */
+  bool m_row_exists;
+  /** Current position. */
+  PFS_simple_index m_pos;
+  /** Next position. */
+  PFS_simple_index m_next_pos;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/table_users.cc b/storage/perfschema/table_users.cc
new file mode 100644
index 00000000000..c0d85017ebc
--- /dev/null
+++ b/storage/perfschema/table_users.cc
@@ -0,0 +1,147 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include "my_global.h"
+#include "my_pthread.h"
+#include "table_users.h"
+#include "pfs_instr_class.h"
+#include "pfs_instr.h"
+#include "pfs_account.h"
+#include "pfs_user.h"
+#include "pfs_visitor.h"
+
+THR_LOCK table_users::m_table_lock;
+
+static const TABLE_FIELD_TYPE field_types[]=
+{
+  {
+    { C_STRING_WITH_LEN("USER") },
+    { C_STRING_WITH_LEN("char(16)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("CURRENT_CONNECTIONS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  },
+  {
+    { C_STRING_WITH_LEN("TOTAL_CONNECTIONS") },
+    { C_STRING_WITH_LEN("bigint(20)") },
+    { NULL, 0}
+  }
+};
+
+TABLE_FIELD_DEF
+table_users::m_field_def=
+{ 3, field_types };
+
+PFS_engine_table_share
+table_users::m_share=
+{
+  { C_STRING_WITH_LEN("users") },
+  &pfs_truncatable_acl,
+  &table_users::create,
+  NULL, /* write_row */
+  table_users::delete_all_rows,
+  NULL, /* get_row_count */
+  1000, /* records */
+  sizeof(PFS_simple_index), /* ref length */
+  &m_table_lock,
+  &m_field_def,
+  false /* checked */
+};
+
+PFS_engine_table* table_users::create()
+{
+  return new table_users();
+}
+
+int
+table_users::delete_all_rows(void)
+{
+  reset_events_waits_by_thread();
+  reset_events_waits_by_account();
+  reset_events_waits_by_user();
+  reset_events_stages_by_thread();
+  reset_events_stages_by_account();
+  reset_events_stages_by_user();
+  reset_events_statements_by_thread();
+  reset_events_statements_by_account();
+  reset_events_statements_by_user();
+  purge_all_account();
+  purge_all_user();
+  return 0;
+}
+
+table_users::table_users()
+  : cursor_by_user(& m_share),
+  m_row_exists(false)
+{}
+
+void table_users::make_row(PFS_user *pfs)
+{
+  pfs_lock lock;
+
+  m_row_exists= false;
+  pfs->m_lock.begin_optimistic_lock(&lock);
+
+  if (m_row.m_user.make_row(pfs))
+    return;
+
+  PFS_connection_stat_visitor visitor;
+  PFS_connection_iterator::visit_user(pfs, true, true, & visitor);
+
+  if (! pfs->m_lock.end_optimistic_lock(& lock))
+    return;
+
+  m_row.m_connection_stat.set(& visitor.m_stat);
+  m_row_exists= true;
+}
+
+int table_users::read_row_values(TABLE *table,
+                                 unsigned char *buf,
+                                 Field **fields,
+                                 bool read_all)
+{
+  Field *f;
+
+  if (unlikely(! m_row_exists))
+    return HA_ERR_RECORD_DELETED;
+
+  /* Set the null bits */
+  DBUG_ASSERT(table->s->null_bytes == 1);
+  buf[0]= 0;
+
+  for (; (f= *fields) ; fields++)
+  {
+    if (read_all || bitmap_is_set(table->read_set, f->field_index))
+    {
+      switch(f->field_index)
+      {
+      case 0: /* USER */
+        m_row.m_user.set_field(f);
+        break;
+      case 1: /* CURRENT_CONNECTIONS */
+      case 2: /* TOTAL_CONNECTIONS */
+        m_row.m_connection_stat.set_field(f->field_index - 1, f);
+        break;
+      default:
+        DBUG_ASSERT(false);
+      }
+    }
+  }
+  return 0;
+}
+
diff --git a/storage/perfschema/table_users.h b/storage/perfschema/table_users.h
new file mode 100644
index 00000000000..94ea44832d1
--- /dev/null
+++ b/storage/perfschema/table_users.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#ifndef TABLE_USERS_H
+#define TABLE_USERS_H
+
+#include "pfs_column_types.h"
+#include "cursor_by_user.h"
+#include "table_helper.h"
+
+struct PFS_user;
+
+/**
+  \addtogroup Performance_schema_tables
+  @{
+*/
+
+/**
+  A row of PERFORMANCE_SCHEMA.USERS.
+*/
+struct row_users
+{
+  /** Column USER. */
+  PFS_user_row m_user;
+  /** Columns CURRENT_CONNECTIONS, TOTAL_CONNECTIONS. */
+  PFS_connection_stat_row m_connection_stat;
+};
+
+/** Table PERFORMANCE_SCHEMA.USERS. */
+class table_users : public cursor_by_user
+{
+public:
+  /** Table share */
+  static PFS_engine_table_share m_share;
+  /** Table builder */
+  static PFS_engine_table* create();
+  static int delete_all_rows();
+
+protected:
+  virtual int read_row_values(TABLE *table,
+                              unsigned char *buf,
+                              Field **fields,
+                              bool read_all);
+
+
+protected:
+  table_users();
+
+public:
+  ~table_users()
+  {}
+
+private:
+  virtual void make_row(PFS_user *pfs);
+
+  /** Table share lock. */
+  static THR_LOCK m_table_lock;
+  /** Fields definition. */
+  static TABLE_FIELD_DEF m_field_def;
+
+  /** Current row. */
+  row_users m_row;
+  /** True if the current row exists. */
+  bool m_row_exists;
+};
+
+/** @} */
+#endif
diff --git a/storage/perfschema/unittest/CMakeLists.txt b/storage/perfschema/unittest/CMakeLists.txt
index a9bb1c3f4f2..757bc24c566 100644
--- a/storage/perfschema/unittest/CMakeLists.txt
+++ b/storage/perfschema/unittest/CMakeLists.txt
@@ -24,5 +24,5 @@ INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include
 
 ADD_DEFINITIONS(-DMYSQL_SERVER)
 
-MY_ADD_TESTS(pfs_instr_class pfs_instr_class-oom pfs_instr pfs_instr-oom pfs
+MY_ADD_TESTS(pfs_instr_class pfs_instr_class-oom pfs_instr pfs_instr-oom  pfs_account-oom pfs_host-oom pfs_user-oom pfs
   EXT "cc" LINK_LIBRARIES perfschema mysys)
diff --git a/storage/perfschema/unittest/conf.txt b/storage/perfschema/unittest/conf.txt
index 6d262854330..8afd0b4dca7 100644
--- a/storage/perfschema/unittest/conf.txt
+++ b/storage/perfschema/unittest/conf.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2009, 2010 Sun Microsystems, Inc.
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
diff --git a/storage/perfschema/unittest/pfs-t.cc b/storage/perfschema/unittest/pfs-t.cc
index ea3e1aab5ec..6b30c0cc498 100644
--- a/storage/perfschema/unittest/pfs-t.cc
+++ b/storage/perfschema/unittest/pfs-t.cc
@@ -25,6 +25,7 @@
 #include <memory.h>
 
 #include "stub_print_error.h"
+#include "stub_pfs_defaults.h"
 #include "stub_server_misc.h"
 
 /* test helpers, to simulate the setup */
@@ -79,6 +80,7 @@ void test_bootstrap()
 
   diag("test_bootstrap");
 
+  memset(& param, 0xFF, sizeof(param));
   param.m_enabled= true;
   param.m_mutex_class_sizing= 0;
   param.m_rwlock_class_sizing= 0;
@@ -86,6 +88,7 @@ void test_bootstrap()
   param.m_thread_class_sizing= 0;
   param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 0;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 0;
@@ -93,8 +96,21 @@ void test_bootstrap()
   param.m_table_sizing= 0;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 0;
   param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+  param.m_digest_sizing= 0;
 
   boot= initialize_performance_schema(& param);
   ok(boot != NULL, "boot");
@@ -121,6 +137,7 @@ PSI * load_perfschema()
   PSI_bootstrap *boot;
   PFS_global_param param;
 
+  memset(& param, 0xFF, sizeof(param));
   param.m_enabled= true;
   param.m_mutex_class_sizing= 10;
   param.m_rwlock_class_sizing= 10;
@@ -128,6 +145,7 @@ PSI * load_perfschema()
   param.m_thread_class_sizing= 10;
   param.m_table_share_sizing= 10;
   param.m_file_class_sizing= 10;
+  param.m_socket_class_sizing= 10;
   param.m_mutex_sizing= 10;
   param.m_rwlock_sizing= 10;
   param.m_cond_sizing= 10;
@@ -135,13 +153,30 @@ PSI * load_perfschema()
   param.m_table_sizing= 10;
   param.m_file_sizing= 10;
   param.m_file_handle_sizing= 50;
+  param.m_socket_sizing= 10;
   param.m_events_waits_history_sizing= 10;
   param.m_events_waits_history_long_sizing= 10;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+  param.m_digest_sizing= 0;
 
   /* test_bootstrap() covered this, assuming it just works */
   boot= initialize_performance_schema(& param);
   psi= boot->get_interface(PSI_VERSION_1);
 
+  /* Reset every consumer to a known state */
+  flag_global_instrumentation= true;
+  flag_thread_instrumentation= true;
+
   return (PSI*) psi;
 }
 
@@ -478,6 +513,72 @@ void test_bad_registration()
   psi->register_file("X", bad_file_3, 1);
   ok(dummy_file_key == 2, "assigned key");
 
+ /*
+    Test that length('wait/io/socket/' (15) + category + '/' (1)) < 32
+    --> category can be up to 15 chars for a socket.
+  */
+
+  PSI_socket_key dummy_socket_key= 9999;
+  PSI_socket_info bad_socket_1[]=
+  {
+    { & dummy_socket_key, "X", 0}
+  };
+
+  psi->register_socket("/", bad_socket_1, 1);
+  ok(dummy_socket_key == 0, "zero key");
+  dummy_socket_key= 9999;
+  psi->register_socket("a/", bad_socket_1, 1);
+  ok(dummy_socket_key == 0, "zero key");
+  dummy_socket_key= 9999;
+  psi->register_socket("/b", bad_socket_1, 1);
+  ok(dummy_socket_key == 0, "zero key");
+  dummy_socket_key= 9999;
+  psi->register_socket("a/b", bad_socket_1, 1);
+  ok(dummy_socket_key == 0, "zero key");
+  dummy_socket_key= 9999;
+  psi->register_socket("1234567890123456", bad_socket_1, 1);
+  ok(dummy_socket_key == 0, "zero key");
+  dummy_socket_key= 9999;
+  psi->register_socket("123456789012345", bad_socket_1, 1);
+  ok(dummy_socket_key == 1, "assigned key");
+
+  /*
+    Test that length('wait/io/socket/' (15) + category + '/' (1) + name) <= 128
+    --> category + name can be up to 112 chars for a socket.
+  */
+
+  dummy_socket_key= 9999;
+  PSI_socket_info bad_socket_2[]=
+  {
+    { & dummy_socket_key,
+      /* 112 chars name */
+      "12345678901234567890123456789012345678901234567890"
+      "12345678901234567890123456789012345678901234567890"
+      "123456789012",
+      0}
+  };
+
+  psi->register_socket("X", bad_socket_2, 1);
+  ok(dummy_socket_key == 0, "zero key");
+
+  dummy_socket_key= 9999;
+  PSI_socket_info bad_socket_3[]=
+  {
+    { & dummy_socket_key,
+      /* 111 chars name */
+      "12345678901234567890123456789012345678901234567890"
+      "12345678901234567890123456789012345678901234567890"
+      "12345678901",
+      0}
+  };
+
+  psi->register_socket("XX", bad_socket_3, 1);
+  ok(dummy_socket_key == 0, "zero key");
+
+  psi->register_socket("X", bad_socket_3, 1);
+  ok(dummy_socket_key == 2, "assigned key");
+
+
   shutdown_performance_schema();
 }
 
@@ -513,6 +614,12 @@ void test_init_disabled()
     { & file_key_A, "F-A", 0}
   };
 
+  PSI_socket_key socket_key_A;
+  PSI_socket_info all_socket[]=
+  {
+    { & socket_key_A, "S-A", 0}
+  };
+
   PSI_thread_key thread_key_1;
   PSI_thread_info all_thread[]=
   {
@@ -523,16 +630,19 @@ void test_init_disabled()
   psi->register_rwlock("test", all_rwlock, 1);
   psi->register_cond("test", all_cond, 1);
   psi->register_file("test", all_file, 1);
+  psi->register_socket("test", all_socket, 1);
   psi->register_thread("test", all_thread, 1);
 
   PFS_mutex_class *mutex_class_A;
   PFS_rwlock_class *rwlock_class_A;
   PFS_cond_class *cond_class_A;
   PFS_file_class *file_class_A;
+  PFS_socket_class *socket_class_A;
   PSI_mutex *mutex_A1;
   PSI_rwlock *rwlock_A1;
   PSI_cond *cond_A1;
   PFS_file *file_A1;
+  PSI_socket *socket_A1;
   PSI_thread *thread_1;
 
   /* Preparation */
@@ -553,8 +663,11 @@ void test_init_disabled()
   file_class_A= find_file_class(file_key_A);
   ok(file_class_A != NULL, "file class A");
 
-  /* Pretend thread T-1 is running, and disabled */
-  /* ------------------------------------------- */
+  socket_class_A= find_socket_class(socket_key_A);
+  ok(socket_class_A != NULL, "socket class A");
+
+  /* Pretend thread T-1 is running, and disabled, with thread_instrumentation */
+  /* ------------------------------------------------------------------------ */
 
   psi->set_thread(thread_1);
   setup_thread(thread_1, false);
@@ -563,61 +676,61 @@ void test_init_disabled()
 
   mutex_class_A->m_enabled= false;
   mutex_A1= psi->init_mutex(mutex_key_A, NULL);
-  ok(mutex_A1 == NULL, "not instrumented");
+  ok(mutex_A1 == NULL, "mutex_A1 not instrumented");
 
-  /* enabled M-A + disabled T-1: no instrumentation */
+  /* enabled M-A + disabled T-1: instrumentation (for later) */
 
   mutex_class_A->m_enabled= true;
   mutex_A1= psi->init_mutex(mutex_key_A, NULL);
-  ok(mutex_A1 == NULL, "not instrumented");
+  ok(mutex_A1 != NULL, "mutex_A1 instrumented");
 
   /* broken key + disabled T-1: no instrumentation */
 
   mutex_class_A->m_enabled= true;
   mutex_A1= psi->init_mutex(0, NULL);
-  ok(mutex_A1 == NULL, "not instrumented");
+  ok(mutex_A1 == NULL, "mutex key 0 not instrumented");
   mutex_A1= psi->init_mutex(99, NULL);
-  ok(mutex_A1 == NULL, "not instrumented");
+  ok(mutex_A1 == NULL, "broken mutex key not instrumented");
 
   /* disabled RW-A + disabled T-1: no instrumentation */
 
   rwlock_class_A->m_enabled= false;
   rwlock_A1= psi->init_rwlock(rwlock_key_A, NULL);
-  ok(rwlock_A1 == NULL, "not instrumented");
+  ok(rwlock_A1 == NULL, "rwlock_A1 not instrumented");
 
-  /* enabled RW-A + disabled T-1: no instrumentation */
+  /* enabled RW-A + disabled T-1: instrumentation (for later) */
 
   rwlock_class_A->m_enabled= true;
   rwlock_A1= psi->init_rwlock(rwlock_key_A, NULL);
-  ok(rwlock_A1 == NULL, "not instrumented");
+  ok(rwlock_A1 != NULL, "rwlock_A1 instrumented");
 
   /* broken key + disabled T-1: no instrumentation */
 
   rwlock_class_A->m_enabled= true;
   rwlock_A1= psi->init_rwlock(0, NULL);
-  ok(rwlock_A1 == NULL, "not instrumented");
+  ok(rwlock_A1 == NULL, "rwlock key 0 not instrumented");
   rwlock_A1= psi->init_rwlock(99, NULL);
-  ok(rwlock_A1 == NULL, "not instrumented");
+  ok(rwlock_A1 == NULL, "broken rwlock key not instrumented");
 
   /* disabled C-A + disabled T-1: no instrumentation */
 
   cond_class_A->m_enabled= false;
   cond_A1= psi->init_cond(cond_key_A, NULL);
-  ok(cond_A1 == NULL, "not instrumented");
+  ok(cond_A1 == NULL, "cond_A1 not instrumented");
 
-  /* enabled C-A + disabled T-1: no instrumentation */
+  /* enabled C-A + disabled T-1: instrumentation (for later) */
 
   cond_class_A->m_enabled= true;
   cond_A1= psi->init_cond(cond_key_A, NULL);
-  ok(cond_A1 == NULL, "not instrumented");
+  ok(cond_A1 != NULL, "cond_A1 instrumented");
 
   /* broken key + disabled T-1: no instrumentation */
 
   cond_class_A->m_enabled= true;
   cond_A1= psi->init_cond(0, NULL);
-  ok(cond_A1 == NULL, "not instrumented");
+  ok(cond_A1 == NULL, "cond key 0 not instrumented");
   cond_A1= psi->init_cond(99, NULL);
-  ok(cond_A1 == NULL, "not instrumented");
+  ok(cond_A1 == NULL, "broken cond key not instrumented");
 
   /* disabled F-A + disabled T-1: no instrumentation */
 
@@ -643,6 +756,26 @@ void test_init_disabled()
   file_A1= lookup_file_by_name("foo");
   ok(file_A1 == NULL, "not instrumented");
 
+  /* disabled S-A + disabled T-1: no instrumentation */
+
+  socket_class_A->m_enabled= false;
+  socket_A1= psi->init_socket(socket_key_A, NULL);
+  ok(socket_A1 == NULL, "socket_A1 not instrumented");
+
+  /* enabled S-A + disabled T-1: instrumentation (for later) */
+
+  socket_class_A->m_enabled= true;
+  socket_A1= psi->init_socket(socket_key_A, NULL);
+  ok(socket_A1 != NULL, "socket_A1 instrumented");
+
+  /* broken key + disabled T-1: no instrumentation */
+
+  socket_class_A->m_enabled= true;
+  socket_A1= psi->init_socket(0, NULL);
+  ok(socket_A1 == NULL, "socket key 0 not instrumented");
+  socket_A1= psi->init_socket(99, NULL);
+  ok(socket_A1 == NULL, "broken socket key not instrumented");
+  
   /* Pretend thread T-1 is enabled */
   /* ----------------------------- */
 
@@ -739,17 +872,37 @@ void test_init_disabled()
   file_class_A->m_enabled= true;
   psi->create_file(file_key_A, "foo-instrumented", (File) 12);
   file_A1= lookup_file_by_name("foo-instrumented");
-  ok(file_A1 != NULL, "instrumented");
+  ok(file_A1 != NULL, "file_A1 instrumented");
 
   /* broken key + enabled T-1: no instrumentation */
 
   file_class_A->m_enabled= true;
   psi->create_file(0, "foo", (File) 12);
   file_A1= lookup_file_by_name("foo");
-  ok(file_A1 == NULL, "not instrumented");
+  ok(file_A1 == NULL, "file key 0 not instrumented");
   psi->create_file(99, "foo", (File) 12);
   file_A1= lookup_file_by_name("foo");
-  ok(file_A1 == NULL, "not instrumented");
+  ok(file_A1 == NULL, "broken file key not instrumented");
+
+  /* disabled S-A + enabled T-1: no instrumentation */
+
+  socket_class_A->m_enabled= false;
+  ok(socket_A1 == NULL, "not instrumented");
+
+  /* enabled S-A + enabled T-1: instrumentation */
+
+  socket_class_A->m_enabled= true;
+  socket_A1= psi->init_socket(socket_key_A, NULL);
+  ok(socket_A1 != NULL, "instrumented");
+  psi->destroy_socket(socket_A1);
+
+  /* broken key + enabled T-1: no instrumentation */
+
+  socket_class_A->m_enabled= true;
+  socket_A1= psi->init_socket(0, NULL);
+  ok(socket_A1 == NULL, "not instrumented");
+  socket_A1= psi->init_socket(99, NULL);
+  ok(socket_A1 == NULL, "not instrumented");
 
   /* Pretend the running thread is not instrumented */
   /* ---------------------------------------------- */
@@ -760,61 +913,61 @@ void test_init_disabled()
 
   mutex_class_A->m_enabled= false;
   mutex_A1= psi->init_mutex(mutex_key_A, NULL);
-  ok(mutex_A1 == NULL, "not instrumented");
+  ok(mutex_A1 == NULL, "mutex_A1 not instrumented");
 
-  /* enabled M-A + unknown thread: no instrumentation */
+  /* enabled M-A + unknown thread: instrumentation (for later) */
 
   mutex_class_A->m_enabled= true;
   mutex_A1= psi->init_mutex(mutex_key_A, NULL);
-  ok(mutex_A1 == NULL, "not instrumented");
+  ok(mutex_A1 != NULL, "mutex_A1 instrumented");
 
   /* broken key + unknown thread: no instrumentation */
 
   mutex_class_A->m_enabled= true;
   mutex_A1= psi->init_mutex(0, NULL);
-  ok(mutex_A1 == NULL, "not instrumented");
+  ok(mutex_A1 == NULL, "mutex key 0 not instrumented");
   mutex_A1= psi->init_mutex(99, NULL);
-  ok(mutex_A1 == NULL, "not instrumented");
+  ok(mutex_A1 == NULL, "broken mutex key not instrumented");
 
   /* disabled RW-A + unknown thread: no instrumentation */
 
   rwlock_class_A->m_enabled= false;
   rwlock_A1= psi->init_rwlock(rwlock_key_A, NULL);
-  ok(rwlock_A1 == NULL, "not instrumented");
+  ok(rwlock_A1 == NULL, "rwlock_A1 not instrumented");
 
-  /* enabled RW-A + unknown thread: no instrumentation */
+  /* enabled RW-A + unknown thread: instrumentation (for later) */
 
   rwlock_class_A->m_enabled= true;
   rwlock_A1= psi->init_rwlock(rwlock_key_A, NULL);
-  ok(rwlock_A1 == NULL, "not instrumented");
+  ok(rwlock_A1 != NULL, "rwlock_A1 instrumented");
 
   /* broken key + unknown thread: no instrumentation */
 
   rwlock_class_A->m_enabled= true;
   rwlock_A1= psi->init_rwlock(0, NULL);
-  ok(rwlock_A1 == NULL, "not instrumented");
+  ok(rwlock_A1 == NULL, "rwlock key 0 not instrumented");
   rwlock_A1= psi->init_rwlock(99, NULL);
-  ok(rwlock_A1 == NULL, "not instrumented");
+  ok(rwlock_A1 == NULL, "broken rwlock key not instrumented");
 
   /* disabled C-A + unknown thread: no instrumentation */
 
   cond_class_A->m_enabled= false;
   cond_A1= psi->init_cond(cond_key_A, NULL);
-  ok(cond_A1 == NULL, "not instrumented");
+  ok(cond_A1 == NULL, "cond_A1 not instrumented");
 
-  /* enabled C-A + unknown thread: no instrumentation */
+  /* enabled C-A + unknown thread: instrumentation (for later) */
 
   cond_class_A->m_enabled= true;
   cond_A1= psi->init_cond(cond_key_A, NULL);
-  ok(cond_A1 == NULL, "not instrumented");
+  ok(cond_A1 != NULL, "cond_A1 instrumented");
 
   /* broken key + unknown thread: no instrumentation */
 
   cond_class_A->m_enabled= true;
   cond_A1= psi->init_cond(0, NULL);
-  ok(cond_A1 == NULL, "not instrumented");
+  ok(cond_A1 == NULL, "cond key 0 not instrumented");
   cond_A1= psi->init_cond(99, NULL);
-  ok(cond_A1 == NULL, "not instrumented");
+  ok(cond_A1 == NULL, "broken cond key not instrumented");
 
   /* disabled F-A + unknown thread: no instrumentation */
 
@@ -840,6 +993,26 @@ void test_init_disabled()
   file_A1= lookup_file_by_name("foo");
   ok(file_A1 == NULL, "not instrumented");
 
+  /* disabled S-A + unknown thread: no instrumentation */
+
+  socket_class_A->m_enabled= false;
+  socket_A1= psi->init_socket(socket_key_A, NULL);
+  ok(socket_A1 == NULL, "socket_A1 not instrumented");
+
+  /* enabled S-A + unknown thread: instrumentation (for later) */
+
+  socket_class_A->m_enabled= true;
+  socket_A1= psi->init_socket(socket_key_A, NULL);
+  ok(socket_A1 != NULL, "socket_A1 instrumented");
+
+  /* broken key + unknown thread: no instrumentation */
+
+  socket_class_A->m_enabled= true;
+  socket_A1= psi->init_socket(0, NULL);
+  ok(socket_A1 == NULL, "socket key 0 not instrumented");
+  socket_A1= psi->init_socket(99, NULL);
+  ok(socket_A1 == NULL, "broken socket key not instrumented");
+  
   shutdown_performance_schema();
 }
 
@@ -875,6 +1048,12 @@ void test_locker_disabled()
     { & file_key_A, "F-A", 0}
   };
 
+  PSI_socket_key socket_key_A;
+  PSI_socket_info all_socket[]=
+  {
+    { & socket_key_A, "S-A", 0}
+  };
+
   PSI_thread_key thread_key_1;
   PSI_thread_info all_thread[]=
   {
@@ -885,16 +1064,19 @@ void test_locker_disabled()
   psi->register_rwlock("test", all_rwlock, 1);
   psi->register_cond("test", all_cond, 1);
   psi->register_file("test", all_file, 1);
+  psi->register_socket("test", all_socket, 1);
   psi->register_thread("test", all_thread, 1);
 
   PFS_mutex_class *mutex_class_A;
   PFS_rwlock_class *rwlock_class_A;
   PFS_cond_class *cond_class_A;
   PFS_file_class *file_class_A;
+  PFS_socket_class *socket_class_A;
   PSI_mutex *mutex_A1;
   PSI_rwlock *rwlock_A1;
   PSI_cond *cond_A1;
   PSI_file *file_A1;
+  PSI_socket *socket_A1;
   PSI_thread *thread_1;
 
   /* Preparation */
@@ -915,6 +1097,9 @@ void test_locker_disabled()
   file_class_A= find_file_class(file_key_A);
   ok(file_class_A != NULL, "file info A");
 
+  socket_class_A= find_socket_class(socket_key_A);
+  ok(socket_class_A != NULL, "socket info A");
+
   /* Pretend thread T-1 is running, and enabled */
   /* ------------------------------------------ */
 
@@ -940,6 +1125,13 @@ void test_locker_disabled()
   file_A1= (PSI_file*) lookup_file_by_name("foo");
   ok(file_A1 != NULL, "instrumented");
 
+  socket_class_A->m_enabled= true;
+  socket_A1= psi->init_socket(socket_key_A, NULL);
+  ok(socket_A1 != NULL, "instrumented");
+
+  /* Socket lockers require a thread owner */
+  psi->set_socket_thread_owner(socket_A1);
+
   PSI_mutex_locker *mutex_locker;
   PSI_mutex_locker_state mutex_state;
   PSI_rwlock_locker *rwlock_locker;
@@ -948,6 +1140,8 @@ void test_locker_disabled()
   PSI_cond_locker_state cond_state;
   PSI_file_locker *file_locker;
   PSI_file_locker_state file_state;
+  PSI_socket_locker *socket_locker;
+  PSI_socket_locker_state socket_state;
 
   /* Pretend thread T-1 is disabled */
   /* ------------------------------ */
@@ -958,58 +1152,110 @@ void test_locker_disabled()
   rwlock_class_A->m_enabled= true;
   cond_class_A->m_enabled= true;
   file_class_A->m_enabled= true;
-
-  mutex_locker= psi->get_thread_mutex_locker(&mutex_state, mutex_A1, PSI_MUTEX_LOCK);
-  ok(mutex_locker == NULL, "no locker");
-  rwlock_locker= psi->get_thread_rwlock_locker(&rwlock_state, rwlock_A1, PSI_RWLOCK_READLOCK);
-  ok(rwlock_locker == NULL, "no locker");
-  cond_locker= psi->get_thread_cond_locker(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT);
-  ok(cond_locker == NULL, "no locker");
+  socket_class_A->m_enabled= true;
+
+  mutex_locker= psi->start_mutex_wait(&mutex_state, mutex_A1, PSI_MUTEX_LOCK, "foo.cc", 12);
+  ok(mutex_locker == NULL, "no locker (T-1 disabled)");
+  rwlock_locker= psi->start_rwlock_rdwait(&rwlock_state, rwlock_A1, PSI_RWLOCK_READLOCK, "foo.cc", 12);
+  ok(rwlock_locker == NULL, "no locker (T-1 disabled)");
+  cond_locker= psi->start_cond_wait(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT, "foo.cc", 12);
+  ok(cond_locker == NULL, "no locker (T-1 disabled)");
   file_locker= psi->get_thread_file_name_locker(&file_state, file_key_A, PSI_FILE_OPEN, "xxx", NULL);
-  ok(file_locker == NULL, "no locker");
+  ok(file_locker == NULL, "no locker (T-1 disabled)");
   file_locker= psi->get_thread_file_stream_locker(&file_state, file_A1, PSI_FILE_READ);
-  ok(file_locker == NULL, "no locker");
+  ok(file_locker == NULL, "no locker (T-1 disabled)");
   file_locker= psi->get_thread_file_descriptor_locker(&file_state, (File) 12, PSI_FILE_READ);
-  ok(file_locker == NULL, "no locker");
+  ok(file_locker == NULL, "no locker (T-1 disabled)");
+  socket_locker= psi->start_socket_wait(&socket_state, socket_A1, PSI_SOCKET_SEND, 12, "foo.cc", 12);
+  ok(socket_locker == NULL, "no locker (T-1 disabled)");
 
-  /* Pretend the consumer is disabled */
-  /* -------------------------------- */
+  /* Pretend the global consumer is disabled */
+  /* --------------------------------------- */
 
   setup_thread(thread_1, true);
-  flag_events_waits_current= false;
+  flag_global_instrumentation= false;
   mutex_class_A->m_enabled= true;
   rwlock_class_A->m_enabled= true;
   cond_class_A->m_enabled= true;
   file_class_A->m_enabled= true;
+  socket_class_A->m_enabled= true;
+  update_instruments_derived_flags();
+
+  mutex_locker= psi->start_mutex_wait(&mutex_state, mutex_A1, PSI_MUTEX_LOCK, "foo.cc", 12);
+  ok(mutex_locker == NULL, "no locker (global disabled)");
+  rwlock_locker= psi->start_rwlock_rdwait(&rwlock_state, rwlock_A1, PSI_RWLOCK_READLOCK, "foo.cc", 12);
+  ok(rwlock_locker == NULL, "no locker (global disabled)");
+  cond_locker= psi->start_cond_wait(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT, "foo.cc", 12);
+  ok(cond_locker == NULL, "no locker (global disabled)");
+  file_locker= psi->get_thread_file_name_locker(&file_state, file_key_A, PSI_FILE_OPEN, "xxx", NULL);
+  ok(file_locker == NULL, "no locker (global disabled)");
+  file_locker= psi->get_thread_file_stream_locker(&file_state, file_A1, PSI_FILE_READ);
+  ok(file_locker == NULL, "no locker (global disabled)");
+  file_locker= psi->get_thread_file_descriptor_locker(&file_state, (File) 12, PSI_FILE_READ);
+  ok(file_locker == NULL, "no locker (global disabled)");
+  socket_locker= psi->start_socket_wait(&socket_state, socket_A1, PSI_SOCKET_SEND, 12, "foo.cc", 12);
+  ok(socket_locker == NULL, "no locker (global disabled)");
 
-  mutex_locker= psi->get_thread_mutex_locker(&mutex_state, mutex_A1, PSI_MUTEX_LOCK);
-  ok(mutex_locker == NULL, "no locker");
-  rwlock_locker= psi->get_thread_rwlock_locker(&rwlock_state, rwlock_A1, PSI_RWLOCK_READLOCK);
-  ok(rwlock_locker == NULL, "no locker");
-  cond_locker= psi->get_thread_cond_locker(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT);
-  ok(cond_locker == NULL, "no locker");
+  /* Pretend the mode is global, counted only */
+  /* ---------------------------------------- */
+
+  setup_thread(thread_1, true);
+  flag_global_instrumentation= true;
+  flag_thread_instrumentation= false;
+  mutex_class_A->m_enabled= true;
+  mutex_class_A->m_timed= false;
+  rwlock_class_A->m_enabled= true;
+  rwlock_class_A->m_timed= false;
+  cond_class_A->m_enabled= true;
+  cond_class_A->m_timed= false;
+  file_class_A->m_enabled= true;
+  file_class_A->m_timed= false;
+  socket_class_A->m_enabled= true;
+  socket_class_A->m_timed= false;
+  update_instruments_derived_flags();
+
+  mutex_locker= psi->start_mutex_wait(&mutex_state, mutex_A1, PSI_MUTEX_LOCK, "foo.cc", 12);
+  ok(mutex_locker == NULL, "no locker (global counted)");
+  rwlock_locker= psi->start_rwlock_rdwait(&rwlock_state, rwlock_A1, PSI_RWLOCK_READLOCK, "foo.cc", 12);
+  ok(rwlock_locker == NULL, "no locker (global counted)");
+  cond_locker= psi->start_cond_wait(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT, "foo.cc", 12);
+  ok(cond_locker == NULL, "no locker (global counted)");
   file_locker= psi->get_thread_file_name_locker(&file_state, file_key_A, PSI_FILE_OPEN, "xxx", NULL);
-  ok(file_locker == NULL, "no locker");
+  ok(file_locker != NULL, "locker (global counted)");
+  psi->start_file_wait(file_locker, 10, __FILE__, __LINE__);
+  psi->end_file_wait(file_locker, 10);
   file_locker= psi->get_thread_file_stream_locker(&file_state, file_A1, PSI_FILE_READ);
-  ok(file_locker == NULL, "no locker");
+  ok(file_locker != NULL, "locker (global counted)");
+  psi->start_file_wait(file_locker, 10, __FILE__, __LINE__);
+  psi->end_file_wait(file_locker, 10);
   file_locker= psi->get_thread_file_descriptor_locker(&file_state, (File) 12, PSI_FILE_READ);
-  ok(file_locker == NULL, "no locker");
+  ok(file_locker != NULL, "locker (global counted)");
+  psi->start_file_wait(file_locker, 10, __FILE__, __LINE__);
+  psi->end_file_wait(file_locker, 10);
+  /* The null locker shortcut applies only to socket ops with no byte count */
+  socket_locker= psi->start_socket_wait(&socket_state, socket_A1, PSI_SOCKET_BIND, 0, "foo.cc", 12);
+  ok(socket_locker == NULL, "no locker (global counted)");
+
+  /* TODO */
 
   /* Pretend the instrument is disabled */
   /* ---------------------------------- */
 
   setup_thread(thread_1, true);
+  flag_global_instrumentation= true;
   flag_events_waits_current= true;
   mutex_class_A->m_enabled= false;
   rwlock_class_A->m_enabled= false;
   cond_class_A->m_enabled= false;
   file_class_A->m_enabled= false;
+  socket_class_A->m_enabled= false;
+  update_instruments_derived_flags();
 
-  mutex_locker= psi->get_thread_mutex_locker(&mutex_state, mutex_A1, PSI_MUTEX_LOCK);
+  mutex_locker= psi->start_mutex_wait(&mutex_state, mutex_A1, PSI_MUTEX_LOCK, "foo.cc", 12);
   ok(mutex_locker == NULL, "no locker");
-  rwlock_locker= psi->get_thread_rwlock_locker(&rwlock_state, rwlock_A1, PSI_RWLOCK_READLOCK);
+  rwlock_locker= psi->start_rwlock_rdwait(&rwlock_state, rwlock_A1, PSI_RWLOCK_READLOCK, "foo.cc", 12);
   ok(rwlock_locker == NULL, "no locker");
-  cond_locker= psi->get_thread_cond_locker(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT);
+  cond_locker= psi->start_cond_wait(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT, "foo.cc", 12);
   ok(cond_locker == NULL, "no locker");
   file_locker= psi->get_thread_file_name_locker(&file_state, file_key_A, PSI_FILE_OPEN, "xxx", NULL);
   ok(file_locker == NULL, "no locker");
@@ -1017,28 +1263,36 @@ void test_locker_disabled()
   ok(file_locker == NULL, "no locker");
   file_locker= psi->get_thread_file_descriptor_locker(&file_state, (File) 12, PSI_FILE_READ);
   ok(file_locker == NULL, "no locker");
+  socket_locker= psi->start_socket_wait(&socket_state, socket_A1, PSI_SOCKET_SEND, 12, "foo.cc", 12);
+  ok(socket_locker == NULL, "no locker");
 
-  /* Pretend everything is enabled */
-  /* ----------------------------- */
+  /* Pretend everything is enabled and timed */
+  /* --------------------------------------- */
 
   setup_thread(thread_1, true);
+  flag_global_instrumentation= true;
+  flag_thread_instrumentation= true;
   flag_events_waits_current= true;
   mutex_class_A->m_enabled= true;
+  mutex_class_A->m_timed= true;
   rwlock_class_A->m_enabled= true;
+  rwlock_class_A->m_timed= true;
   cond_class_A->m_enabled= true;
+  cond_class_A->m_timed= true;
   file_class_A->m_enabled= true;
+  file_class_A->m_timed= true;
+  socket_class_A->m_enabled= true;
+  socket_class_A->m_timed= true;
+  update_instruments_derived_flags();
 
-  mutex_locker= psi->get_thread_mutex_locker(&mutex_state, mutex_A1, PSI_MUTEX_LOCK);
+  mutex_locker= psi->start_mutex_wait(&mutex_state, mutex_A1, PSI_MUTEX_LOCK, __FILE__, __LINE__);
   ok(mutex_locker != NULL, "locker");
-  psi->start_mutex_wait(mutex_locker, __FILE__, __LINE__);
   psi->end_mutex_wait(mutex_locker, 0);
-  rwlock_locker= psi->get_thread_rwlock_locker(&rwlock_state, rwlock_A1, PSI_RWLOCK_READLOCK);
+  rwlock_locker= psi->start_rwlock_rdwait(&rwlock_state, rwlock_A1, PSI_RWLOCK_READLOCK, __FILE__, __LINE__);
   ok(rwlock_locker != NULL, "locker");
-  psi->start_rwlock_rdwait(rwlock_locker, __FILE__, __LINE__);
   psi->end_rwlock_rdwait(rwlock_locker, 0);
-  cond_locker= psi->get_thread_cond_locker(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT);
+  cond_locker= psi->start_cond_wait(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT, __FILE__, __LINE__);
   ok(cond_locker != NULL, "locker");
-  psi->start_cond_wait(cond_locker, __FILE__, __LINE__);
   psi->end_cond_wait(cond_locker, 0);
   file_locker= psi->get_thread_file_name_locker(&file_state, file_key_A, PSI_FILE_OPEN, "xxx", NULL);
   ok(file_locker != NULL, "locker");
@@ -1052,7 +1306,20 @@ void test_locker_disabled()
   ok(file_locker != NULL, "locker");
   psi->start_file_wait(file_locker, 10, __FILE__, __LINE__);
   psi->end_file_wait(file_locker, 10);
+  socket_locker= psi->start_socket_wait(&socket_state, socket_A1, PSI_SOCKET_SEND, 12, "foo.cc", 12);
+  ok(socket_locker != NULL, "locker");
+  psi->end_socket_wait(socket_locker, 10);
+
+  /* Pretend the socket does not have a thread owner */
+  /* ---------------------------------------------- */
 
+  socket_class_A->m_enabled= true;
+  socket_A1= psi->init_socket(socket_key_A, NULL);
+  ok(socket_A1 != NULL, "instrumented");
+  /* Socket thread owner has not been set */
+  socket_locker= psi->start_socket_wait(&socket_state, socket_A1, PSI_SOCKET_SEND, 12, "foo.cc", 12);
+  ok(socket_locker == NULL, "no locker (no thread owner)");
+  
   /* Pretend the running thread is not instrumented */
   /* ---------------------------------------------- */
 
@@ -1062,12 +1329,14 @@ void test_locker_disabled()
   rwlock_class_A->m_enabled= true;
   cond_class_A->m_enabled= true;
   file_class_A->m_enabled= true;
+  socket_class_A->m_enabled= true;
+  update_instruments_derived_flags();
 
-  mutex_locker= psi->get_thread_mutex_locker(&mutex_state, mutex_A1, PSI_MUTEX_LOCK);
+  mutex_locker= psi->start_mutex_wait(&mutex_state, mutex_A1, PSI_MUTEX_LOCK, "foo.cc", 12);
   ok(mutex_locker == NULL, "no locker");
-  rwlock_locker= psi->get_thread_rwlock_locker(&rwlock_state, rwlock_A1, PSI_RWLOCK_READLOCK);
+  rwlock_locker= psi->start_rwlock_rdwait(&rwlock_state, rwlock_A1, PSI_RWLOCK_READLOCK, "foo.cc", 12);
   ok(rwlock_locker == NULL, "no locker");
-  cond_locker= psi->get_thread_cond_locker(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT);
+  cond_locker= psi->start_cond_wait(&cond_state, cond_A1, mutex_A1, PSI_COND_WAIT, "foo.cc", 12);
   ok(cond_locker == NULL, "no locker");
   file_locker= psi->get_thread_file_name_locker(&file_state, file_key_A, PSI_FILE_OPEN, "xxx", NULL);
   ok(file_locker == NULL, "no locker");
@@ -1075,6 +1344,8 @@ void test_locker_disabled()
   ok(file_locker == NULL, "no locker");
   file_locker= psi->get_thread_file_descriptor_locker(&file_state, (File) 12, PSI_FILE_READ);
   ok(file_locker == NULL, "no locker");
+  socket_locker= psi->start_socket_wait(&socket_state, socket_A1, PSI_SOCKET_SEND, 12, "foo.cc", 12);
+  ok(socket_locker == NULL, "no locker");
 
   shutdown_performance_schema();
 }
@@ -1203,6 +1474,150 @@ void test_enabled()
 #endif
 }
 
+void test_event_name_index()
+{
+  PSI *psi;
+  PSI_bootstrap *boot;
+  PFS_global_param param;
+
+  diag("test_event_name_index");
+
+  memset(& param, 0xFF, sizeof(param));
+  param.m_enabled= true;
+
+  /* Per mutex info waits should be at [0..9] */
+  param.m_mutex_class_sizing= 10;
+  /* Per rwlock info waits should be at [10..29] */
+  param.m_rwlock_class_sizing= 20;
+  /* Per cond info waits should be at [30..69] */
+  param.m_cond_class_sizing= 40;
+  /* Per file info waits should be at [70..149] */
+  param.m_file_class_sizing= 80;
+  /* Per socket info waits should be at [150..309] */
+  param.m_socket_class_sizing= 160;
+  /* Per table info waits should be at [310] */
+  param.m_table_share_sizing= 320;
+
+  param.m_thread_class_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+  param.m_digest_sizing= 0;
+
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 0;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
+  param.m_events_waits_history_sizing= 0;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+
+  boot= initialize_performance_schema(& param);
+  ok(boot != NULL, "bootstrap");
+  psi= (PSI*) boot->get_interface(PSI_VERSION_1);
+  ok(psi != NULL, "psi");
+
+  PFS_mutex_class *mutex_class;
+  PSI_mutex_key dummy_mutex_key_1;
+  PSI_mutex_key dummy_mutex_key_2;
+  PSI_mutex_info dummy_mutexes[]=
+  {
+    { & dummy_mutex_key_1, "M-1", 0},
+    { & dummy_mutex_key_2, "M-2", 0}
+  };
+
+  psi->register_mutex("X", dummy_mutexes, 2);
+  mutex_class= find_mutex_class(dummy_mutex_key_1);
+  ok(mutex_class != NULL, "mutex class 1");
+  ok(mutex_class->m_event_name_index == 0, "index 0");
+  mutex_class= find_mutex_class(dummy_mutex_key_2);
+  ok(mutex_class != NULL, "mutex class 2");
+  ok(mutex_class->m_event_name_index == 1, "index 1");
+
+  PFS_rwlock_class *rwlock_class;
+  PSI_rwlock_key dummy_rwlock_key_1;
+  PSI_rwlock_key dummy_rwlock_key_2;
+  PSI_rwlock_info dummy_rwlocks[]=
+  {
+    { & dummy_rwlock_key_1, "RW-1", 0},
+    { & dummy_rwlock_key_2, "RW-2", 0}
+  };
+
+  psi->register_rwlock("X", dummy_rwlocks, 2);
+  rwlock_class= find_rwlock_class(dummy_rwlock_key_1);
+  ok(rwlock_class != NULL, "rwlock class 1");
+  ok(rwlock_class->m_event_name_index == 10, "index 10");
+  rwlock_class= find_rwlock_class(dummy_rwlock_key_2);
+  ok(rwlock_class != NULL, "rwlock class 2");
+  ok(rwlock_class->m_event_name_index == 11, "index 11");
+
+  PFS_cond_class *cond_class;
+  PSI_cond_key dummy_cond_key_1;
+  PSI_cond_key dummy_cond_key_2;
+  PSI_cond_info dummy_conds[]=
+  {
+    { & dummy_cond_key_1, "C-1", 0},
+    { & dummy_cond_key_2, "C-2", 0}
+  };
+
+  psi->register_cond("X", dummy_conds, 2);
+  cond_class= find_cond_class(dummy_cond_key_1);
+  ok(cond_class != NULL, "cond class 1");
+  ok(cond_class->m_event_name_index == 30, "index 30");
+  cond_class= find_cond_class(dummy_cond_key_2);
+  ok(cond_class != NULL, "cond class 2");
+  ok(cond_class->m_event_name_index == 31, "index 31");
+
+  PFS_file_class *file_class;
+  PSI_file_key dummy_file_key_1;
+  PSI_file_key dummy_file_key_2;
+  PSI_file_info dummy_files[]=
+  {
+    { & dummy_file_key_1, "F-1", 0},
+    { & dummy_file_key_2, "F-2", 0}
+  };
+
+  psi->register_file("X", dummy_files, 2);
+  file_class= find_file_class(dummy_file_key_1);
+  ok(file_class != NULL, "file class 1");
+  ok(file_class->m_event_name_index == 70, "index 70");
+  file_class= find_file_class(dummy_file_key_2);
+  ok(file_class != NULL, "file class 2");
+  ok(file_class->m_event_name_index == 71, "index 71");
+
+  PFS_socket_class *socket_class;
+  PSI_socket_key dummy_socket_key_1;
+  PSI_socket_key dummy_socket_key_2;
+  PSI_socket_info dummy_sockets[]=
+  {
+    { & dummy_socket_key_1, "S-1", 0},
+    { & dummy_socket_key_2, "S-2", 0}
+  };
+
+  psi->register_socket("X", dummy_sockets, 2);
+  socket_class= find_socket_class(dummy_socket_key_1);
+  ok(socket_class != NULL, "socket class 1");
+  ok(socket_class->m_event_name_index == 150, "index 150");
+  socket_class= find_socket_class(dummy_socket_key_2);
+  ok(socket_class != NULL, "socket class 2");
+  ok(socket_class->m_event_name_index == 151, "index 151");
+
+  ok(global_table_io_class.m_event_name_index == 310, "index 310");
+  ok(global_table_lock_class.m_event_name_index == 311, "index 311");
+  ok(wait_class_max= 313, "313 event names"); // 3 global classes
+}
+
 void do_all_tests()
 {
   /* Using initialize_performance_schema(), no partial init needed. */
@@ -1212,14 +1627,14 @@ void do_all_tests()
   test_init_disabled();
   test_locker_disabled();
   test_file_instrumentation_leak();
+  test_event_name_index();
 }
 
 int main(int argc, char **argv)
 {
-  plan(153);
+  plan(216);
   MY_INIT(argv[0]);
   do_all_tests();
   my_end(0);
   return 0;
 }
-
diff --git a/storage/perfschema/unittest/pfs_account-oom-t.cc b/storage/perfschema/unittest/pfs_account-oom-t.cc
new file mode 100644
index 00000000000..e6fddead511
--- /dev/null
+++ b/storage/perfschema/unittest/pfs_account-oom-t.cc
@@ -0,0 +1,116 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include <my_global.h>
+#include <my_pthread.h>
+#include <pfs_instr.h>
+#include <pfs_stat.h>
+#include <pfs_global.h>
+#include <pfs_account.h>
+#include <tap.h>
+
+#include "stub_pfs_global.h"
+#include "stub_server_misc.h"
+
+#include <string.h> /* memset */
+
+void test_oom()
+{
+  int rc;
+  PFS_global_param param;
+
+  memset(& param, 0xFF, sizeof(param));
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 0;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 10;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 1000;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_events_waits_history_sizing= 10;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 1000;
+  param.m_stage_class_sizing= 50;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 50;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  /* Setup */
+
+  stub_alloc_always_fails= false;
+  stub_alloc_fails_after_count= 1000;
+
+  init_event_name_sizing(& param);
+  rc= init_stage_class(param.m_stage_class_sizing);
+  ok(rc == 0, "init stage class");
+  rc= init_statement_class(param.m_statement_class_sizing);
+  ok(rc == 0, "init statement class");
+
+  /* Tests */
+
+  stub_alloc_fails_after_count= 1;
+  rc= init_account(& param);
+  ok(rc == 1, "oom (account)");
+  cleanup_account();
+
+  stub_alloc_fails_after_count= 2;
+  rc= init_account(& param);
+  ok(rc == 1, "oom (account waits)");
+  cleanup_account();
+
+  stub_alloc_fails_after_count= 3;
+  rc= init_account(& param);
+  ok(rc == 1, "oom (account stages)");
+  cleanup_account();
+
+  stub_alloc_fails_after_count= 4;
+  rc= init_account(& param);
+  ok(rc == 1, "oom (account statements)");
+  cleanup_account();
+
+  cleanup_statement_class();
+  cleanup_stage_class();
+}
+
+void do_all_tests()
+{
+  PFS_atomic::init();
+
+  test_oom();
+
+  PFS_atomic::cleanup();
+}
+
+int main(int, char **)
+{
+  plan(6);
+  MY_INIT("pfs_account-oom-t");
+  do_all_tests();
+  return 0;
+}
+
diff --git a/storage/perfschema/unittest/pfs_host-oom-t.cc b/storage/perfschema/unittest/pfs_host-oom-t.cc
new file mode 100644
index 00000000000..075a2e6a07a
--- /dev/null
+++ b/storage/perfschema/unittest/pfs_host-oom-t.cc
@@ -0,0 +1,116 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include <my_global.h>
+#include <my_pthread.h>
+#include <pfs_instr.h>
+#include <pfs_stat.h>
+#include <pfs_global.h>
+#include <pfs_host.h>
+#include <tap.h>
+
+#include "stub_pfs_global.h"
+#include "stub_server_misc.h"
+
+#include <string.h> /* memset */
+
+void test_oom()
+{
+  int rc;
+  PFS_global_param param;
+
+  memset(& param, 0xFF, sizeof(param));
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 0;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 10;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 1000;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_events_waits_history_sizing= 10;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 1000;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 50;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 50;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  /* Setup */
+
+  stub_alloc_always_fails= false;
+  stub_alloc_fails_after_count= 1000;
+
+  init_event_name_sizing(& param);
+  rc= init_stage_class(param.m_stage_class_sizing);
+  ok(rc == 0, "init stage class");
+  rc= init_statement_class(param.m_statement_class_sizing);
+  ok(rc == 0, "init statement class");
+
+  /* Tests */
+
+  stub_alloc_fails_after_count= 1;
+  rc= init_host(& param);
+  ok(rc == 1, "oom (host)");
+  cleanup_host();
+
+  stub_alloc_fails_after_count= 2;
+  rc= init_host(& param);
+  ok(rc == 1, "oom (host waits)");
+  cleanup_host();
+
+  stub_alloc_fails_after_count= 3;
+  rc= init_host(& param);
+  ok(rc == 1, "oom (host stages)");
+  cleanup_host();
+
+  stub_alloc_fails_after_count= 4;
+  rc= init_host(& param);
+  ok(rc == 1, "oom (host statements)");
+  cleanup_host();
+
+  cleanup_statement_class();
+  cleanup_stage_class();
+}
+
+void do_all_tests()
+{
+  PFS_atomic::init();
+
+  test_oom();
+
+  PFS_atomic::cleanup();
+}
+
+int main(int, char **)
+{
+  plan(6);
+  MY_INIT("pfs_host-oom-t");
+  do_all_tests();
+  return 0;
+}
+
diff --git a/storage/perfschema/unittest/pfs_instr-oom-t.cc b/storage/perfschema/unittest/pfs_instr-oom-t.cc
index 175c09bcd39..41bb4ed6c5a 100644
--- a/storage/perfschema/unittest/pfs_instr-oom-t.cc
+++ b/storage/perfschema/unittest/pfs_instr-oom-t.cc
@@ -18,11 +18,14 @@
 #include <pfs_instr.h>
 #include <pfs_stat.h>
 #include <pfs_global.h>
+#include <pfs_instr_class.h>
 #include <tap.h>
 
 #include "stub_pfs_global.h"
 #include "stub_server_misc.h"
 
+#include <string.h> /* memset */
+
 void test_oom()
 {
   int rc;
@@ -30,6 +33,7 @@ void test_oom()
 
   stub_alloc_always_fails= true;
 
+  memset(& param, 0xFF, sizeof(param));
   param.m_enabled= true;
   param.m_mutex_class_sizing= 10;
   param.m_rwlock_class_sizing= 0;
@@ -37,6 +41,7 @@ void test_oom()
   param.m_thread_class_sizing= 0;
   param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 0;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 1000;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 0;
@@ -44,11 +49,25 @@ void test_oom()
   param.m_table_sizing= 0;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 0;
   param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
 
+  init_event_name_sizing(& param);
   rc= init_instruments(& param);
   ok(rc == 1, "oom (mutex)");
+  cleanup_instruments();
 
   param.m_enabled= true;
   param.m_mutex_class_sizing= 0;
@@ -57,6 +76,7 @@ void test_oom()
   param.m_thread_class_sizing= 0;
   param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 0;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 1000;
   param.m_cond_sizing= 0;
@@ -64,11 +84,25 @@ void test_oom()
   param.m_table_sizing= 0;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 0;
   param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
 
+  init_event_name_sizing(& param);
   rc= init_instruments(& param);
   ok(rc == 1, "oom (rwlock)");
+  cleanup_instruments();
 
   param.m_enabled= true;
   param.m_mutex_class_sizing= 0;
@@ -77,6 +111,7 @@ void test_oom()
   param.m_thread_class_sizing= 0;
   param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 0;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 1000;
@@ -84,11 +119,25 @@ void test_oom()
   param.m_table_sizing= 0;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 0;
   param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
 
+  init_event_name_sizing(& param);
   rc= init_instruments(& param);
   ok(rc == 1, "oom (cond)");
+  cleanup_instruments();
 
   param.m_enabled= true;
   param.m_mutex_class_sizing= 0;
@@ -97,6 +146,7 @@ void test_oom()
   param.m_thread_class_sizing= 0;
   param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 10;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 0;
@@ -104,11 +154,58 @@ void test_oom()
   param.m_table_sizing= 0;
   param.m_file_sizing= 1000;
   param.m_file_handle_sizing= 1000;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 0;
   param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
 
+  init_event_name_sizing(& param);
   rc= init_instruments(& param);
   ok(rc == 1, "oom (file)");
+  cleanup_instruments();
+
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 0;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 0;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 0;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 1000;
+  param.m_events_waits_history_sizing= 0;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  init_event_name_sizing(& param);
+  rc= init_instruments(& param);
+  ok(rc == 1, "oom (file handle)");
+  cleanup_instruments();
 
   param.m_enabled= true;
   param.m_mutex_class_sizing= 0;
@@ -117,6 +214,7 @@ void test_oom()
   param.m_thread_class_sizing= 0;
   param.m_table_share_sizing= 10;
   param.m_file_class_sizing= 0;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 0;
@@ -124,11 +222,25 @@ void test_oom()
   param.m_table_sizing= 1000;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 0;
   param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
 
+  init_event_name_sizing(& param);
   rc= init_instruments(& param);
   ok(rc == 1, "oom (table)");
+  cleanup_instruments();
 
   param.m_enabled= true;
   param.m_mutex_class_sizing= 0;
@@ -137,6 +249,7 @@ void test_oom()
   param.m_thread_class_sizing= 10;
   param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 0;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 0;
@@ -144,11 +257,25 @@ void test_oom()
   param.m_table_sizing= 0;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 0;
   param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
 
+  init_event_name_sizing(& param);
   rc= init_instruments(& param);
   ok(rc == 1, "oom (thread)");
+  cleanup_instruments();
 
   stub_alloc_always_fails= false;
 
@@ -159,6 +286,7 @@ void test_oom()
   param.m_thread_class_sizing= 10;
   param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 0;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 0;
@@ -166,12 +294,26 @@ void test_oom()
   param.m_table_sizing= 0;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 10;
   param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
 
   stub_alloc_fails_after_count= 2;
+  init_event_name_sizing(& param);
   rc= init_instruments(& param);
-  ok(rc == 1, "oom (thread history sizing)");
+  ok(rc == 1, "oom (thread waits history sizing)");
+  cleanup_instruments();
 
   param.m_enabled= true;
   param.m_mutex_class_sizing= 50;
@@ -180,6 +322,7 @@ void test_oom()
   param.m_thread_class_sizing= 10;
   param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 50;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 0;
@@ -187,14 +330,309 @@ void test_oom()
   param.m_table_sizing= 0;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 0;
   param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
 
   stub_alloc_fails_after_count= 2;
+  init_event_name_sizing(& param);
   rc= init_instruments(& param);
   ok(rc == 1, "oom (per thread wait)");
 
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 0;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 0;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_socket_class_sizing= 10;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 0;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 1000;
+  param.m_events_waits_history_sizing= 0;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+
+  init_event_name_sizing(& param);
+  rc= init_instruments(& param);
+  ok(rc == 1, "oom (socket)");
+
+  cleanup_instruments();
+
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  stub_alloc_fails_after_count= 2;
+  init_event_name_sizing(& param);
+  rc= init_instruments(& param);
+  ok(rc == 1, "oom (per thread waits)");
+  cleanup_instruments();
+
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 0;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 10;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 1000;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_events_waits_history_sizing= 0;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 10;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  stub_alloc_fails_after_count= 3;
+  init_event_name_sizing(& param);
+  rc= init_instruments(& param);
+  ok(rc == 1, "oom (thread stages history sizing)");
+  cleanup_instruments();
+
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 0;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 10;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 1000;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_events_waits_history_sizing= 0;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 50;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  stub_alloc_fails_after_count= 2;
+  init_event_name_sizing(& param);
+  rc= init_instruments(& param);
+  ok(rc == 1, "oom (per thread stages)");
+  cleanup_instruments();
+
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 0;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 10;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 1000;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_events_waits_history_sizing= 0;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 10;
+  param.m_events_statements_history_long_sizing= 0;
+
+  stub_alloc_fails_after_count= 2;
+  init_event_name_sizing(& param);
+  rc= init_instruments(& param);
+  ok(rc == 1, "oom (thread statements history sizing)");
+  cleanup_instruments();
+
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 0;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 10;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 1000;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_events_waits_history_sizing= 0;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 50;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  stub_alloc_fails_after_count= 2;
+  init_event_name_sizing(& param);
+  rc= init_instruments(& param);
+  ok(rc == 1, "oom (per thread statements)");
+  cleanup_instruments();
+
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 10;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 0;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 0;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_events_waits_history_sizing= 0;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  stub_alloc_fails_after_count= 1;
+  init_event_name_sizing(& param);
+  rc= init_instruments(& param);
+  ok(rc == 1, "oom (global waits)");
+  cleanup_instruments();
+
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 10;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 0;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 0;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_events_waits_history_sizing= 0;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 20;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  stub_alloc_fails_after_count= 3;
+  init_event_name_sizing(& param);
+  rc= init_stage_class(param.m_stage_class_sizing);
+  ok(rc == 0, "init stage class");
+  rc= init_instruments(& param);
+  ok(rc == 1, "oom (global stages)");
+  cleanup_instruments();
+  cleanup_stage_class();
+
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 10;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 0;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 0;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_events_waits_history_sizing= 0;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 20;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  stub_alloc_fails_after_count= 3;
+  init_event_name_sizing(& param);
+  rc= init_statement_class(param.m_statement_class_sizing);
+  ok(rc == 0, "init statement class");
+  rc= init_instruments(& param);
+  ok(rc == 1, "oom (global statements)");
   cleanup_instruments();
+  cleanup_statement_class();
 }
 
 void do_all_tests()
@@ -208,7 +646,7 @@ void do_all_tests()
 
 int main(int argc, char **argv)
 {
-  plan(8);
+  plan(20);
   MY_INIT(argv[0]);
   do_all_tests();
   my_end(0);
diff --git a/storage/perfschema/unittest/pfs_instr-t.cc b/storage/perfschema/unittest/pfs_instr-t.cc
index 0daf38e5fc8..b0839de70b2 100644
--- a/storage/perfschema/unittest/pfs_instr-t.cc
+++ b/storage/perfschema/unittest/pfs_instr-t.cc
@@ -18,6 +18,7 @@
 #include <pfs_instr.h>
 #include <pfs_stat.h>
 #include <pfs_global.h>
+#include <pfs_instr_class.h>
 #include <tap.h>
 
 #include <memory.h>
@@ -29,6 +30,7 @@ void test_no_instruments()
   int rc;
   PFS_global_param param;
 
+  memset(& param, 0xFF, sizeof(param));
   param.m_enabled= true;
   param.m_mutex_class_sizing= 0;
   param.m_rwlock_class_sizing= 0;
@@ -36,6 +38,7 @@ void test_no_instruments()
   param.m_thread_class_sizing= 0;
   param.m_table_share_sizing= 0;
   param.m_file_class_sizing= 0;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 0;
@@ -43,9 +46,22 @@ void test_no_instruments()
   param.m_table_sizing= 0;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 0;
   param.m_events_waits_history_long_sizing= 0;
-
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  init_event_name_sizing(& param);
   rc= init_instruments(& param);
   ok(rc == 0, "zero init");
 
@@ -61,14 +77,17 @@ void test_no_instances()
   PFS_thread_class dummy_thread_class;
   PFS_file_class dummy_file_class;
   PFS_table_share dummy_table_share;
+  PFS_socket_class dummy_socket_class;
   PFS_mutex *mutex;
   PFS_rwlock *rwlock;
   PFS_cond *cond;
   PFS_thread *thread;
   PFS_file *file;
+  PFS_socket *socket;
   PFS_table *table;
   PFS_global_param param;
 
+  memset(& param, 0xFF, sizeof(param));
   param.m_enabled= true;
   param.m_mutex_class_sizing= 1;
   param.m_rwlock_class_sizing= 1;
@@ -76,6 +95,7 @@ void test_no_instances()
   param.m_thread_class_sizing= 1;
   param.m_table_share_sizing= 1;
   param.m_file_class_sizing= 1;
+  param.m_socket_class_sizing= 0;
   param.m_mutex_sizing= 0;
   param.m_rwlock_sizing= 0;
   param.m_cond_sizing= 0;
@@ -83,9 +103,22 @@ void test_no_instances()
   param.m_table_sizing= 0;
   param.m_file_sizing= 0;
   param.m_file_handle_sizing= 0;
+  param.m_socket_sizing= 0;
   param.m_events_waits_history_sizing= 0;
   param.m_events_waits_history_long_sizing= 0;
-
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  init_event_name_sizing(& param);
   rc= init_instruments(& param);
   ok(rc == 0, "no instances init");
 
@@ -144,16 +177,23 @@ void test_no_instances()
   ok(file == NULL, "no file");
   ok(file_lost == 5, "lost 5");
 
-  table= create_table(& dummy_table_share, NULL);
+  table= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table == NULL, "no table");
   ok(table_lost == 1, "lost 1");
-  table= create_table(& dummy_table_share, NULL);
+  table= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table == NULL, "no table");
   ok(table_lost == 2, "lost 2");
 
+  socket= create_socket(& dummy_socket_class, NULL);
+  ok(socket == NULL, "no socket");
+  ok(socket_lost == 1, "lost 1");
+  socket= create_socket(& dummy_socket_class, NULL);
+  ok(socket == NULL, "no socket");
+  ok(socket_lost == 2, "lost 2");
+
   /* No result to test, just make sure it does not crash */
   reset_events_waits_by_instance();
-  reset_per_thread_wait_stat();
+  reset_events_waits_by_thread();
 
   cleanup_file_hash();
   cleanup_instruments();
@@ -167,6 +207,7 @@ void test_with_instances()
   PFS_cond_class dummy_cond_class;
   PFS_thread_class dummy_thread_class;
   PFS_file_class dummy_file_class;
+  PFS_socket_class dummy_socket_class;
   PFS_table_share dummy_table_share;
   PFS_mutex *mutex_1;
   PFS_mutex *mutex_2;
@@ -178,10 +219,13 @@ void test_with_instances()
   PFS_thread *thread_2;
   PFS_file *file_1;
   PFS_file *file_2;
+  PFS_socket *socket_1;
+  PFS_socket *socket_2;
   PFS_table *table_1;
   PFS_table *table_2;
   PFS_global_param param;
 
+  memset(& param, 0xFF, sizeof(param));
   param.m_enabled= true;
   param.m_mutex_class_sizing= 1;
   param.m_rwlock_class_sizing= 1;
@@ -189,6 +233,7 @@ void test_with_instances()
   param.m_thread_class_sizing= 1;
   param.m_table_share_sizing= 1;
   param.m_file_class_sizing= 1;
+  param.m_socket_class_sizing= 1;
   param.m_mutex_sizing= 2;
   param.m_rwlock_sizing= 2;
   param.m_cond_sizing= 2;
@@ -196,12 +241,31 @@ void test_with_instances()
   param.m_table_sizing= 2;
   param.m_file_sizing= 2;
   param.m_file_handle_sizing= 100;
+  param.m_socket_sizing= 2;
   param.m_events_waits_history_sizing= 10;
   param.m_events_waits_history_long_sizing= 10000;
-
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 0;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 0;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 0;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  init_event_name_sizing(& param);
   rc= init_instruments(& param);
   ok(rc == 0, "instances init");
 
+  dummy_mutex_class.m_event_name_index= 0;
+  dummy_rwlock_class.m_event_name_index= 1;
+  dummy_cond_class.m_event_name_index= 2;
+  dummy_file_class.m_event_name_index= 3;
+  dummy_socket_class.m_event_name_index= 4;
+
   mutex_1= create_mutex(& dummy_mutex_class, NULL);
   ok(mutex_1 != NULL, "mutex");
   ok(mutex_lost == 0, "not lost");
@@ -294,110 +358,42 @@ void test_with_instances()
   ok(file_2 == NULL, "no file");
   ok(file_lost == 2, "lost");
 
-  table_1= create_table(& dummy_table_share, NULL);
+  socket_1= create_socket(& dummy_socket_class, NULL);
+  ok(socket_1 != NULL, "socket");
+  ok(socket_lost == 0, "not lost");
+  socket_2= create_socket(& dummy_socket_class, NULL);
+  ok(socket_2 != NULL, "socket");
+  ok(socket_lost == 0, "not lost");
+  socket_2= create_socket(& dummy_socket_class, NULL);
+  ok(socket_2 == NULL, "no socket");
+  ok(socket_lost == 1, "lost 1");
+  destroy_socket(socket_1);
+  socket_2= create_socket(& dummy_socket_class, NULL);
+  ok(socket_2 != NULL, "socket");
+  ok(socket_lost == 1, "no new loss");
+
+  table_1= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table_1 != NULL, "table");
   ok(table_lost == 0, "not lost");
-  table_2= create_table(& dummy_table_share, NULL);
+  table_2= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table_2 != NULL, "table");
   ok(table_lost == 0, "not lost");
-  table_2= create_table(& dummy_table_share, NULL);
+  table_2= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table_2 == NULL, "no table");
   ok(table_lost == 1, "lost 1");
   destroy_table(table_1);
-  table_2= create_table(& dummy_table_share, NULL);
+  table_2= create_table(& dummy_table_share, & fake_thread, NULL);
   ok(table_2 != NULL, "table");
   ok(table_lost == 1, "no new loss");
 
   //TODO: test that cleanup works
   reset_events_waits_by_instance();
-  reset_per_thread_wait_stat();
+  reset_events_waits_by_thread();
 
   cleanup_file_hash();
   cleanup_instruments();
 }
 
-void test_per_thread_wait()
-{
-  int rc;
-  PFS_mutex_class dummy_mutex_class;
-  PFS_rwlock_class dummy_rwlock_class;
-  PFS_cond_class dummy_cond_class;
-  PFS_thread_class dummy_thread_class;
-  PFS_file_class dummy_file_class;
-  PFS_thread *thread;
-  PFS_single_stat_chain *base;
-  PFS_single_stat_chain *stat;
-  PFS_global_param param;
-
-
-  /* Per mutex info waits should be at [0..9] */
-  mutex_class_max= 10;
-  /* Per rwlock info waits should be at [10..29] */
-  rwlock_class_max= 20;
-  /* Per cond info waits should be at [30..69] */
-  cond_class_max= 40;
-  /* Per file info waits should be at [70..149] */
-  file_class_max= 80;
-  /* Per table info waits should be at [150..309] */
-  table_share_max= 160;
-
-  param.m_enabled= true;
-  param.m_mutex_class_sizing= mutex_class_max;
-  param.m_rwlock_class_sizing= rwlock_class_max;
-  param.m_cond_class_sizing= cond_class_max;
-  param.m_thread_class_sizing= 2;
-  param.m_table_share_sizing= table_share_max;
-  param.m_file_class_sizing= file_class_max;
-  param.m_mutex_sizing= 0;
-  param.m_rwlock_sizing= 0;
-  param.m_cond_sizing= 0;
-  param.m_thread_sizing= 2;
-  param.m_table_sizing= 0;
-  param.m_file_sizing= 0;
-  param.m_file_handle_sizing= 0;
-  param.m_events_waits_history_sizing= 10;
-  param.m_events_waits_history_long_sizing= 10000;
-
-  rc= init_instruments(& param);
-  ok(rc == 0, "instances init");
-
-  thread= create_thread(& dummy_thread_class, NULL, 0);
-  ok(thread != NULL, "thread");
-  ok(thread_lost == 0, "not lost");
-
-  base= & thread->m_instr_class_wait_stats[0];
-
-  dummy_mutex_class.m_index= 0;
-  stat= find_per_thread_mutex_class_wait_stat(thread, & dummy_mutex_class);
-  ok(base + 0 == stat, "fist mutex info slot at 0");
-  dummy_mutex_class.m_index= mutex_class_max - 1;
-  stat= find_per_thread_mutex_class_wait_stat(thread, & dummy_mutex_class);
-  ok(base + 9 == stat, "last mutex info slot at 9");
-
-  dummy_rwlock_class.m_index= 0;
-  stat= find_per_thread_rwlock_class_wait_stat(thread, & dummy_rwlock_class);
-  ok(base + 10 == stat, "fist rwlock info slot at 10");
-  dummy_rwlock_class.m_index= rwlock_class_max - 1;
-  stat= find_per_thread_rwlock_class_wait_stat(thread, & dummy_rwlock_class);
-  ok(base + 29 == stat, "last rwlock info slot at 29");
-
-  dummy_cond_class.m_index= 0;
-  stat= find_per_thread_cond_class_wait_stat(thread, & dummy_cond_class);
-  ok(base + 30 == stat, "fist cond info slot at 30");
-  dummy_cond_class.m_index= cond_class_max - 1;
-  stat= find_per_thread_cond_class_wait_stat(thread, & dummy_cond_class);
-  ok(base + 69 == stat, "last cond info slot at 69");
-
-  dummy_file_class.m_index= 0;
-  stat= find_per_thread_file_class_wait_stat(thread, & dummy_file_class);
-  ok(base + 70 == stat, "fist file info slot at 70");
-  dummy_file_class.m_index= file_class_max - 1;
-  stat= find_per_thread_file_class_wait_stat(thread, & dummy_file_class);
-  ok(base + 149 == stat, "last file info slot at 149");
-
-  cleanup_instruments();
-}
-
 void do_all_tests()
 {
   PFS_atomic::init();
@@ -405,14 +401,13 @@ void do_all_tests()
   test_no_instruments();
   test_no_instances();
   test_with_instances();
-  test_per_thread_wait();
 
   PFS_atomic::cleanup();
 }
 
 int main(int argc, char **argv)
 {
-  plan(102);
+  plan(103);
   MY_INIT(argv[0]);
   do_all_tests();
   my_end(0);
diff --git a/storage/perfschema/unittest/pfs_instr_class-oom-t.cc b/storage/perfschema/unittest/pfs_instr_class-oom-t.cc
index 49f229015b8..2de83e654c5 100644
--- a/storage/perfschema/unittest/pfs_instr_class-oom-t.cc
+++ b/storage/perfschema/unittest/pfs_instr_class-oom-t.cc
@@ -38,11 +38,20 @@ void test_oom()
   ok(rc == 1, "oom (file)");
   rc= init_table_share(1000);
   ok(rc == 1, "oom (cond)");
+  rc= init_socket_class(1000);
+  ok(rc == 1, "oom (socket)");
+  rc= init_stage_class(1000);
+  ok(rc == 1, "oom (stage)");
+  rc= init_statement_class(1000);
+  ok(rc == 1, "oom (statement)");
 
   cleanup_sync_class();
   cleanup_thread_class();
   cleanup_file_class();
   cleanup_table_share();
+  cleanup_socket_class();
+  cleanup_stage_class();
+  cleanup_statement_class();
 }
 
 void do_all_tests()
@@ -56,11 +65,10 @@ void do_all_tests()
 
 int main(int argc, char **argv)
 {
-  plan(6);
+  plan(9);
   MY_INIT(argv[0]);
   do_all_tests();
   my_end(0);
   return 0;
 }
 
-
diff --git a/storage/perfschema/unittest/pfs_instr_class-t.cc b/storage/perfschema/unittest/pfs_instr_class-t.cc
index ea50f0647d9..9e3efde656e 100644
--- a/storage/perfschema/unittest/pfs_instr_class-t.cc
+++ b/storage/perfschema/unittest/pfs_instr_class-t.cc
@@ -29,12 +29,14 @@ void test_no_registration()
   PFS_sync_key key;
   PFS_thread_key thread_key;
   PFS_file_key file_key;
+  PFS_socket_key socket_key;
   PFS_mutex_class *mutex;
   PFS_rwlock_class *rwlock;
   PFS_cond_class *cond;
   PFS_thread_class *thread;
   PFS_file_class *file;
-  PFS_table_share *table;
+  PFS_socket_class *socket;
+  /* PFS_table_share *table; */
 
   rc= init_sync_class(0, 0, 0);
   ok(rc == 0, "zero init (sync)");
@@ -42,6 +44,8 @@ void test_no_registration()
   ok(rc == 0, "zero init (thread)");
   rc= init_file_class(0);
   ok(rc == 0, "zero init (file)");
+  rc= init_socket_class(0);
+  ok(rc == 0, "zero init (socket)");
   rc= init_table_share(0);
   ok(rc == 0, "zero init (table)");
 
@@ -80,15 +84,24 @@ void test_no_registration()
   file_key= register_file_class("FOO", 3, 0);
   ok(file_key == 0, "no file registered");
 
+  socket_key= register_socket_class("FOO", 3, 0);
+  ok(socket_key == 0, "no socket registered");
+  socket_key= register_socket_class("BAR", 3, 0);
+  ok(socket_key == 0, "no socket registered");
+  socket_key= register_socket_class("FOO", 3, 0);
+  ok(socket_key == 0, "no socket registered");
+
+#ifdef LATER
   PFS_thread fake_thread;
   fake_thread.m_table_share_hash_pins= NULL;
 
-  table= find_or_create_table_share(& fake_thread, "foo_db", 6, "foo_table", 9);
+  table= find_or_create_table_share(& fake_thread, false, "foo_db", 6, "foo_table", 9);
   ok(table == NULL, "not created");
-  table= find_or_create_table_share(& fake_thread, "bar_db", 6, "bar_table", 9);
+  table= find_or_create_table_share(& fake_thread, false, "bar_db", 6, "bar_table", 9);
   ok(table == NULL, "not created");
-  table= find_or_create_table_share(& fake_thread, "foo_db", 6, "foo_table", 9);
+  table= find_or_create_table_share(& fake_thread, false, "foo_db", 6, "foo_table", 9);
   ok(table == NULL, "not created");
+#endif
 
   mutex= find_mutex_class(0);
   ok(mutex == NULL, "no mutex key 0");
@@ -125,9 +138,17 @@ void test_no_registration()
   file= find_file_class(9999);
   ok(file == NULL, "no file key 9999");
 
+  socket= find_socket_class(0);
+  ok(socket == NULL, "no socket key 0");
+  socket= find_socket_class(1);
+  ok(socket == NULL, "no socket key 1");
+  socket= find_socket_class(9999);
+  ok(socket == NULL, "no socket key 9999");
+
   cleanup_sync_class();
   cleanup_thread_class();
   cleanup_file_class();
+  cleanup_socket_class();
   cleanup_table_share();
 }
 
@@ -346,8 +367,56 @@ void test_file_registration()
   cleanup_file_class();
 }
 
+void test_socket_registration()
+{
+  int rc;
+  PFS_socket_key key;
+  PFS_socket_class *socket;
+
+  rc= init_socket_class(5);
+  ok(rc == 0, "room for 5 socket");
+
+  key= register_socket_class("FOO", 3, 0);
+  ok(key == 1, "foo registered");
+  key= register_socket_class("BAR", 3, 0);
+  ok(key == 2, "bar registered");
+  key= register_socket_class("FOO", 3, 0);
+  ok(key == 1, "foo re registered");
+  key= register_socket_class("Socket-3", 8, 0);
+  ok(key == 3, "Socket-3 registered");
+  key= register_socket_class("Socket-4", 8, 0);
+  ok(key == 4, "Socket-4 registered");
+  key= register_socket_class("Socket-5", 8, 0);
+  ok(key == 5, "Socket-5 registered");
+  ok(socket_class_lost == 0, "lost nothing");
+  key= register_socket_class("Socket-6", 8, 0);
+  ok(key == 0, "Socket-6 not registered");
+  ok(socket_class_lost == 1, "lost 1 socket");
+  key= register_socket_class("Socket-7", 8, 0);
+  ok(key == 0, "Socket-7 not registered");
+  ok(socket_class_lost == 2, "lost 2 socket");
+  key= register_socket_class("Socket-3", 8, 0);
+  ok(key == 3, "Socket-3 re registered");
+  ok(socket_class_lost == 2, "lost 2 socket");
+  key= register_socket_class("Socket-5", 8, 0);
+  ok(key == 5, "Socket-5 re registered");
+  ok(socket_class_lost == 2, "lost 2 socket");
+
+  socket= find_socket_class(0);
+  ok(socket == NULL, "no key 0");
+  socket= find_socket_class(3);
+  ok(socket != NULL, "found key 3");
+  ok(strncmp(socket->m_name, "Socket-3", 8) == 0, "key 3 is Socket-3");
+  ok(socket->m_name_length == 8, "name length 3");
+  socket= find_socket_class(9999);
+  ok(socket == NULL, "no key 9999");
+
+  cleanup_socket_class();
+}
+
 void test_table_registration()
 {
+#ifdef LATER
   PFS_table_share *table_share;
   PFS_table_share *table_share_2;
 
@@ -355,7 +424,7 @@ void test_table_registration()
   fake_thread.m_table_share_hash_pins= NULL;
 
   table_share_lost= 0;
-  table_share= find_or_create_table_share(& fake_thread, "db1", 3, "t1", 2);
+  table_share= find_or_create_table_share(& fake_thread, false, "db1", 3, "t1", 2);
   ok(table_share == NULL, "not created");
   ok(table_share_lost == 1, "lost the table");
 
@@ -363,37 +432,37 @@ void test_table_registration()
   init_table_share(5);
   init_table_share_hash();
 
-  table_share= find_or_create_table_share(& fake_thread, "db1", 3, "t1", 2);
+  table_share= find_or_create_table_share(& fake_thread, false, "db1", 3, "t1", 2);
   ok(table_share != NULL, "created db1.t1");
   ok(table_share_lost == 0, "not lost");
 
-  table_share_2= find_or_create_table_share(& fake_thread, "db1", 3, "t1", 2);
+  table_share_2= find_or_create_table_share(& fake_thread, false, "db1", 3, "t1", 2);
   ok(table_share_2 != NULL, "found db1.t1");
   ok(table_share_lost == 0, "not lost");
   ok(table_share == table_share_2, "same table");
 
-  table_share_2= find_or_create_table_share(& fake_thread, "db1", 3, "t2", 2);
+  table_share_2= find_or_create_table_share(& fake_thread, false, "db1", 3, "t2", 2);
   ok(table_share_2 != NULL, "created db1.t2");
   ok(table_share_lost == 0, "not lost");
 
-  table_share_2= find_or_create_table_share(& fake_thread, "db2", 3, "t1", 2);
+  table_share_2= find_or_create_table_share(& fake_thread, false, "db2", 3, "t1", 2);
   ok(table_share_2 != NULL, "created db2.t1");
   ok(table_share_lost == 0, "not lost");
 
-  table_share_2= find_or_create_table_share(& fake_thread, "db2", 3, "t2", 2);
+  table_share_2= find_or_create_table_share(& fake_thread, false, "db2", 3, "t2", 2);
   ok(table_share_2 != NULL, "created db2.t2");
   ok(table_share_lost == 0, "not lost");
 
-  table_share_2= find_or_create_table_share(& fake_thread, "db3", 3, "t3", 2);
+  table_share_2= find_or_create_table_share(& fake_thread, false, "db3", 3, "t3", 2);
   ok(table_share_2 != NULL, "created db3.t3");
   ok(table_share_lost == 0, "not lost");
 
-  table_share_2= find_or_create_table_share(& fake_thread, "db4", 3, "t4", 2);
+  table_share_2= find_or_create_table_share(& fake_thread, false, "db4", 3, "t4", 2);
   ok(table_share_2 == NULL, "lost db4.t4");
   ok(table_share_lost == 1, "lost");
 
   table_share_lost= 0;
-  table_share_2= find_or_create_table_share(& fake_thread, "db1", 3, "t2", 2);
+  table_share_2= find_or_create_table_share(& fake_thread, false, "db1", 3, "t2", 2);
   ok(table_share_2 != NULL, "found db1.t2");
   ok(table_share_lost == 0, "not lost");
   ok(strncmp(table_share_2->m_schema_name, "db1", 3) == 0 , "schema db1");
@@ -403,18 +472,25 @@ void test_table_registration()
 
   cleanup_table_share_hash();
   cleanup_table_share();
+#endif
 }
 
-void set_wait_stat(PFS_single_stat_chain *stat)
+void set_wait_stat(PFS_instr_class *klass)
 {
+  PFS_single_stat *stat;
+  stat= & global_instr_class_waits_array[klass->m_event_name_index];
+
   stat->m_count= 12;
   stat->m_min= 5;
   stat->m_max= 120;
   stat->m_sum= 999;
 }
 
-bool is_empty_stat(PFS_single_stat_chain *stat)
+bool is_empty_stat(PFS_instr_class *klass)
 {
+  PFS_single_stat *stat;
+  stat= & global_instr_class_waits_array[klass->m_event_name_index];
+
   if (stat->m_count != 0)
     return false;
   if (stat->m_min != (ulonglong) -1)
@@ -431,6 +507,7 @@ void test_instruments_reset()
   int rc;
   PFS_sync_key key;
   PFS_file_key file_key;
+  PFS_socket_key socket_key;
   PFS_mutex_class *mutex_1;
   PFS_mutex_class *mutex_2;
   PFS_mutex_class *mutex_3;
@@ -443,6 +520,9 @@ void test_instruments_reset()
   PFS_file_class *file_1;
   PFS_file_class *file_2;
   PFS_file_class *file_3;
+  PFS_socket_class *socket_1;
+  PFS_socket_class *socket_2;
+  PFS_socket_class *socket_3;
 
   rc= init_sync_class(3, 3, 3);
   ok(rc == 0, "init (sync)");
@@ -450,6 +530,8 @@ void test_instruments_reset()
   ok(rc == 0, "init (thread)");
   rc= init_file_class(3);
   ok(rc == 0, "init (file)");
+  rc= init_socket_class(3);
+  ok(rc == 0, "init (socket)");
 
   key= register_mutex_class("M-1", 3, 0);
   ok(key == 1, "mutex registered");
@@ -479,6 +561,13 @@ void test_instruments_reset()
   file_key= register_file_class("F-3", 3, 0);
   ok(file_key == 3, "file registered");
 
+  socket_key= register_socket_class("S-1", 3, 0);
+  ok(socket_key == 1, "socket registered");
+  socket_key= register_socket_class("S-2", 3, 0);
+  ok(socket_key == 2, "socket registered");
+  socket_key= register_socket_class("S-3", 3, 0);
+  ok(socket_key == 3, "socket registered");
+
   mutex_1= find_mutex_class(1);
   ok(mutex_1 != NULL, "mutex key 1");
   mutex_2= find_mutex_class(2);
@@ -507,49 +596,59 @@ void test_instruments_reset()
   file_3= find_file_class(3);
   ok(file_3 != NULL, "file key 3");
 
-  set_wait_stat(& mutex_1->m_wait_stat);
-  set_wait_stat(& mutex_2->m_wait_stat);
-  set_wait_stat(& mutex_3->m_wait_stat);
-  set_wait_stat(& rwlock_1->m_wait_stat);
-  set_wait_stat(& rwlock_2->m_wait_stat);
-  set_wait_stat(& rwlock_3->m_wait_stat);
-  set_wait_stat(& cond_1->m_wait_stat);
-  set_wait_stat(& cond_2->m_wait_stat);
-  set_wait_stat(& cond_3->m_wait_stat);
-  set_wait_stat(& file_1->m_wait_stat);
-  set_wait_stat(& file_2->m_wait_stat);
-  set_wait_stat(& file_3->m_wait_stat);
-
-  ok(! is_empty_stat(& mutex_1->m_wait_stat), "mutex_1 stat is populated");
-  ok(! is_empty_stat(& mutex_2->m_wait_stat), "mutex_2 stat is populated");
-  ok(! is_empty_stat(& mutex_3->m_wait_stat), "mutex_3 stat is populated");
-  ok(! is_empty_stat(& rwlock_1->m_wait_stat), "rwlock_1 stat is populated");
-  ok(! is_empty_stat(& rwlock_2->m_wait_stat), "rwlock_2 stat is populated");
-  ok(! is_empty_stat(& rwlock_3->m_wait_stat), "rwlock_3 stat is populated");
-  ok(! is_empty_stat(& cond_1->m_wait_stat), "cond_1 stat is populated");
-  ok(! is_empty_stat(& cond_2->m_wait_stat), "cond_2 stat is populated");
-  ok(! is_empty_stat(& cond_3->m_wait_stat), "cond_3 stat is populated");
-  ok(! is_empty_stat(& file_1->m_wait_stat), "file_1 stat is populated");
-  ok(! is_empty_stat(& file_2->m_wait_stat), "file_2 stat is populated");
-  ok(! is_empty_stat(& file_3->m_wait_stat), "file_3 stat is populated");
-
-  reset_instrument_class_waits();
-
-  ok(is_empty_stat(& mutex_1->m_wait_stat), "mutex_1 stat is cleared");
-  ok(is_empty_stat(& mutex_2->m_wait_stat), "mutex_2 stat is cleared");
-  ok(is_empty_stat(& mutex_3->m_wait_stat), "mutex_3 stat is cleared");
-  ok(is_empty_stat(& rwlock_1->m_wait_stat), "rwlock_1 stat is cleared");
-  ok(is_empty_stat(& rwlock_2->m_wait_stat), "rwlock_2 stat is cleared");
-  ok(is_empty_stat(& rwlock_3->m_wait_stat), "rwlock_3 stat is cleared");
-  ok(is_empty_stat(& cond_1->m_wait_stat), "cond_1 stat is cleared");
-  ok(is_empty_stat(& cond_2->m_wait_stat), "cond_2 stat is cleared");
-  ok(is_empty_stat(& cond_3->m_wait_stat), "cond_3 stat is cleared");
-  ok(is_empty_stat(& file_1->m_wait_stat), "file_1 stat is cleared");
-  ok(is_empty_stat(& file_2->m_wait_stat), "file_2 stat is cleared");
-  ok(is_empty_stat(& file_3->m_wait_stat), "file_3 stat is cleared");
+  socket_1= find_socket_class(1);
+  ok(socket_1 != NULL, "socket key 1");
+  socket_2= find_socket_class(2);
+  ok(socket_2 != NULL, "socket key 2");
+  socket_3= find_socket_class(3);
+  ok(socket_3 != NULL, "socket key 3");
+
+#ifdef LATER
+  set_wait_stat(mutex_1);
+  set_wait_stat(mutex_2);
+  set_wait_stat(mutex_3);
+  set_wait_stat(rwlock_1);
+  set_wait_stat(rwlock_2);
+  set_wait_stat(rwlock_3);
+  set_wait_stat(cond_1);
+  set_wait_stat(cond_2);
+  set_wait_stat(cond_3);
+  set_wait_stat(file_1);
+  set_wait_stat(file_2);
+  set_wait_stat(file_3);
+
+  ok(! is_empty_stat(mutex_1), "mutex_1 stat is populated");
+  ok(! is_empty_stat(mutex_2), "mutex_2 stat is populated");
+  ok(! is_empty_stat(mutex_3), "mutex_3 stat is populated");
+  ok(! is_empty_stat(rwlock_1), "rwlock_1 stat is populated");
+  ok(! is_empty_stat(rwlock_2), "rwlock_2 stat is populated");
+  ok(! is_empty_stat(rwlock_3), "rwlock_3 stat is populated");
+  ok(! is_empty_stat(cond_1), "cond_1 stat is populated");
+  ok(! is_empty_stat(cond_2), "cond_2 stat is populated");
+  ok(! is_empty_stat(cond_3), "cond_3 stat is populated");
+  ok(! is_empty_stat(file_1), "file_1 stat is populated");
+  ok(! is_empty_stat(file_2), "file_2 stat is populated");
+  ok(! is_empty_stat(file_3), "file_3 stat is populated");
+
+  reset_global_wait_stat();
+
+  ok(is_empty_stat(mutex_1), "mutex_1 stat is cleared");
+  ok(is_empty_stat(mutex_2), "mutex_2 stat is cleared");
+  ok(is_empty_stat(mutex_3), "mutex_3 stat is cleared");
+  ok(is_empty_stat(rwlock_1), "rwlock_1 stat is cleared");
+  ok(is_empty_stat(rwlock_2), "rwlock_2 stat is cleared");
+  ok(is_empty_stat(rwlock_3), "rwlock_3 stat is cleared");
+  ok(is_empty_stat(cond_1), "cond_1 stat is cleared");
+  ok(is_empty_stat(cond_2), "cond_2 stat is cleared");
+  ok(is_empty_stat(cond_3), "cond_3 stat is cleared");
+  ok(is_empty_stat(file_1), "file_1 stat is cleared");
+  ok(is_empty_stat(file_2), "file_2 stat is cleared");
+  ok(is_empty_stat(file_3), "file_3 stat is cleared");
+#endif
 
   cleanup_sync_class();
   cleanup_file_class();
+  cleanup_socket_class();
 }
 
 void do_all_tests()
@@ -562,6 +661,7 @@ void do_all_tests()
   test_cond_registration();
   test_thread_registration();
   test_file_registration();
+  test_socket_registration();
   test_table_registration();
   test_instruments_reset();
 
@@ -570,11 +670,9 @@ void do_all_tests()
 
 int main(int argc, char **argv)
 {
-  plan(196);
+  plan(181);
   MY_INIT(argv[0]);
   do_all_tests();
   my_end(0);
   return 0;
 }
-
-
diff --git a/storage/perfschema/unittest/pfs_timer-t.cc b/storage/perfschema/unittest/pfs_timer-t.cc
index 9a1c743f642..9c9ae0f75f1 100644
--- a/storage/perfschema/unittest/pfs_timer-t.cc
+++ b/storage/perfschema/unittest/pfs_timer-t.cc
@@ -34,26 +34,26 @@ void test_timers()
 
   init_timers();
 
-  t1_a= get_timer_value(TIMER_NAME_CYCLE);
+  t1_a= get_timer_pico_value(TIMER_NAME_CYCLE);
   /* Wait 5 seconds */
   my_sleep(5000000);
-  t1_b= get_timer_value(TIMER_NAME_CYCLE);
+  t1_b= get_timer_pico_value(TIMER_NAME_CYCLE);
 
-  t2_a= get_timer_value(TIMER_NAME_NANOSEC);
+  t2_a= get_timer_pico_value(TIMER_NAME_NANOSEC);
   my_sleep(5000000);
-  t2_b= get_timer_value(TIMER_NAME_NANOSEC);
+  t2_b= get_timer_pico_value(TIMER_NAME_NANOSEC);
 
-  t3_a= get_timer_value(TIMER_NAME_MICROSEC);
+  t3_a= get_timer_pico_value(TIMER_NAME_MICROSEC);
   my_sleep(5000000);
-  t3_b= get_timer_value(TIMER_NAME_MICROSEC);
+  t3_b= get_timer_pico_value(TIMER_NAME_MICROSEC);
 
-  t4_a= get_timer_value(TIMER_NAME_MILLISEC);
+  t4_a= get_timer_pico_value(TIMER_NAME_MILLISEC);
   my_sleep(5000000);
-  t4_b= get_timer_value(TIMER_NAME_MILLISEC);
+  t4_b= get_timer_pico_value(TIMER_NAME_MILLISEC);
 
-  t5_a= get_timer_value(TIMER_NAME_TICK);
+  t5_a= get_timer_pico_value(TIMER_NAME_TICK);
   my_sleep(5000000);
-  t5_b= get_timer_value(TIMER_NAME_TICK);
+  t5_b= get_timer_pico_value(TIMER_NAME_TICK);
 
   /*
     Print the timer values, for manual inspection by a human.
diff --git a/storage/perfschema/unittest/pfs_user-oom-t.cc b/storage/perfschema/unittest/pfs_user-oom-t.cc
new file mode 100644
index 00000000000..1faf29b753a
--- /dev/null
+++ b/storage/perfschema/unittest/pfs_user-oom-t.cc
@@ -0,0 +1,116 @@
+/* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software Foundation,
+  51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+#include <my_global.h>
+#include <my_pthread.h>
+#include <pfs_instr.h>
+#include <pfs_stat.h>
+#include <pfs_global.h>
+#include <pfs_user.h>
+#include <tap.h>
+
+#include "stub_pfs_global.h"
+#include "stub_server_misc.h"
+
+#include <string.h> /* memset */
+
+void test_oom()
+{
+  int rc;
+  PFS_global_param param;
+
+  memset(& param, 0xFF, sizeof(param));
+  param.m_enabled= true;
+  param.m_mutex_class_sizing= 0;
+  param.m_rwlock_class_sizing= 0;
+  param.m_cond_class_sizing= 0;
+  param.m_thread_class_sizing= 10;
+  param.m_table_share_sizing= 0;
+  param.m_file_class_sizing= 0;
+  param.m_mutex_sizing= 0;
+  param.m_rwlock_sizing= 0;
+  param.m_cond_sizing= 0;
+  param.m_thread_sizing= 1000;
+  param.m_table_sizing= 0;
+  param.m_file_sizing= 0;
+  param.m_file_handle_sizing= 0;
+  param.m_events_waits_history_sizing= 10;
+  param.m_events_waits_history_long_sizing= 0;
+  param.m_setup_actor_sizing= 0;
+  param.m_setup_object_sizing= 0;
+  param.m_host_sizing= 0;
+  param.m_user_sizing= 1000;
+  param.m_account_sizing= 0;
+  param.m_stage_class_sizing= 50;
+  param.m_events_stages_history_sizing= 0;
+  param.m_events_stages_history_long_sizing= 0;
+  param.m_statement_class_sizing= 50;
+  param.m_events_statements_history_sizing= 0;
+  param.m_events_statements_history_long_sizing= 0;
+
+  /* Setup */
+
+  stub_alloc_always_fails= false;
+  stub_alloc_fails_after_count= 1000;
+
+  init_event_name_sizing(& param);
+  rc= init_stage_class(param.m_stage_class_sizing);
+  ok(rc == 0, "init stage class");
+  rc= init_statement_class(param.m_statement_class_sizing);
+  ok(rc == 0, "init statement class");
+
+  /* Tests */
+
+  stub_alloc_fails_after_count= 1;
+  rc= init_user(& param);
+  ok(rc == 1, "oom (user)");
+  cleanup_user();
+
+  stub_alloc_fails_after_count= 2;
+  rc= init_user(& param);
+  ok(rc == 1, "oom (user waits)");
+  cleanup_user();
+
+  stub_alloc_fails_after_count= 3;
+  rc= init_user(& param);
+  ok(rc == 1, "oom (user stages)");
+  cleanup_user();
+
+  stub_alloc_fails_after_count= 4;
+  rc= init_user(& param);
+  ok(rc == 1, "oom (user statements)");
+  cleanup_user();
+
+  cleanup_statement_class();
+  cleanup_stage_class();
+}
+
+void do_all_tests()
+{
+  PFS_atomic::init();
+
+  test_oom();
+
+  PFS_atomic::cleanup();
+}
+
+int main(int, char **)
+{
+  plan(6);
+  MY_INIT("pfs_user-oom-t");
+  do_all_tests();
+  return 0;
+}
+
diff --git a/storage/perfschema/unittest/stub_pfs_defaults.h b/storage/perfschema/unittest/stub_pfs_defaults.h
new file mode 100644
index 00000000000..042d069b367
--- /dev/null
+++ b/storage/perfschema/unittest/stub_pfs_defaults.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include <my_global.h>
+#include <pfs.h>
+#include <pfs_defaults.h>
+
+void install_default_setup(PSI_bootstrap *)
+{
+}
+
diff --git a/storage/perfschema/unittest/stub_pfs_global.h b/storage/perfschema/unittest/stub_pfs_global.h
index 2b581f22645..34c52e18b5a 100644
--- a/storage/perfschema/unittest/stub_pfs_global.h
+++ b/storage/perfschema/unittest/stub_pfs_global.h
@@ -1,5 +1,4 @@
-/* Copyright (c) 2008 MySQL AB, 2010 Sun Microsystems, Inc.
-   Use is subject to license terms.
+/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -17,26 +16,34 @@
 #include <my_global.h>
 #include <my_sys.h>
 #include <pfs_global.h>
+#include <string.h>
 
 bool pfs_initialized= false;
 
 bool stub_alloc_always_fails= true;
 int stub_alloc_fails_after_count= 0;
 
-void *pfs_malloc(size_t, myf)
+void *pfs_malloc(size_t size, myf)
 {
-  static char garbage[100];
-
   if (stub_alloc_always_fails)
     return NULL;
 
   if (--stub_alloc_fails_after_count <= 0)
     return NULL;
 
-  return garbage;
+  void *ptr= malloc(size);
+  if (ptr != NULL)
+    memset(ptr, 0, size);
+  return ptr;
+}
+
+void pfs_free(void *ptr)
+{
+  if (ptr != NULL)
+    free(ptr);
 }
 
-void pfs_free(void *)
+void pfs_print_error(const char *format, ...)
 {
 }
 
diff --git a/storage/perfschema/unittest/stub_print_error.h b/storage/perfschema/unittest/stub_print_error.h
index adfa3a62d8c..caad24e5257 100644
--- a/storage/perfschema/unittest/stub_print_error.h
+++ b/storage/perfschema/unittest/stub_print_error.h
@@ -1,5 +1,4 @@
-/* Copyright (c) 2008 MySQL AB, 2010 Sun Microsystems, Inc.
-   Use is subject to license terms.
+/* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/storage/perfschema/unittest/stub_server_misc.h b/storage/perfschema/unittest/stub_server_misc.h
index 17beadbb104..8b008273bd8 100644
--- a/storage/perfschema/unittest/stub_server_misc.h
+++ b/storage/perfschema/unittest/stub_server_misc.h
@@ -19,3 +19,10 @@
 
 volatile bool ready_to_exit= false;
 
+uint lower_case_table_names= 0;
+CHARSET_INFO *files_charset_info= NULL;
+
+extern "C" void compute_md5_hash(char *, const char *, int)
+{
+}
+
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt-disabled
index 1d2b0b29dea..1c75d5f829c 100644
--- a/storage/xtradb/CMakeLists.txt
+++ b/storage/xtradb/CMakeLists.txt-disabled
@@ -15,6 +15,10 @@
 
 # This is the CMakeLists for XtraDB
 
+IF(NOT WITH_XTRADB_STORAGE_ENGINE)
+  SET(WITHOUT_XTRADB 1)
+ENDIF(NOT WITH_XTRADB_STORAGE_ENGINE)
+
 INCLUDE(CheckFunctionExists)
 INCLUDE(CheckCSourceCompiles)
 INCLUDE(CheckCSourceRuns)